Mercurial > repos > iarc > mutspec
view mutspecStat.pl @ 7:eda59b985b1c draft default tip
Uploaded
author | iarc |
---|---|
date | Mon, 13 Mar 2017 08:21:19 -0400 |
parents | 46a10309dfe2 |
children |
line wrap: on
line source
#!/usr/bin/env perl #-----------------------------------# # Author: Maude # # Script: mutspecStat.pl # # Last update: 09/02/17 # #-----------------------------------# use strict; use warnings; use Getopt::Long; use Pod::Usage; use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); use File::Path; use Spreadsheet::WriteExcel; our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. our ($refGenome, $output, $folder_temp, $path_R_Scripts, $path_SeqrefGenome) = ("empty", "empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path for saving the temporary files; The path to R scripts; The path to the fasta reference sequences our ($poolData, $oneReportPerSample) = (2, 2); # If a folder is pass as input file pool all the data and generate the report on the pool and for each samples; # Generate one report for each samples GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'outfile|o=s' => \$output, 'temp=s' => \$folder_temp, 'pathRscript=s' => \$path_R_Scripts, 'pathSeqRefGenome=s' => \$path_SeqrefGenome, 'poolData' => \$poolData, 'reportSample' => \$oneReportPerSample) or pod2usage(2); our ($input) = @ARGV; pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) ####### The input must be a folder with one or several annotated files if(!-d $input) { print STDERR "Error: The input must be a Dataset List\n"; print STDERR "Even for 1 file, please create a Dataset List\n"; exit; } ###################################################################################################################################################### # GLOBAL VARIABLES # ###################################################################################################################################################### # Recover the current path our $pwd = `pwd`; chomp($pwd); # Path to R scripts our $pathRscriptChi2test = "$path_R_Scripts/R/chi2test_MutSpecStat_Galaxy.r"; our $pathRScriptFigs = "$path_R_Scripts/R/figs_MutSpecStat_Galaxy.r"; our $pathRScriptTxnSB = "$path_R_Scripts/R/transciptionalStrandBias.r"; our $pathRScriptMutSpectrum = "$path_R_Scripts/R/mutationSpectra_Galaxy.r"; # The path for saving the files with enough mutations for calculating the statistics; our $folderCheckedForStat = "$pwd/folder_checked"; if(!-e $folderCheckedForStat) { mkdir($folderCheckedForStat) or die "$!: $folderCheckedForStat\n"; } # Output dir with all the results our $folderMutAnalysis = ""; # Hash table with the length of each chromosomes our %chromosomes; # Define the name of the column containing the chromosome, start, ref and alt alleles (based on Annovar output) our ($chr_name, $start_name, $ref_name, $alt_name) = qw(Chr Start Ref Alt); # Annovar annotation used our $func_name = "Func.refGene"; our $exonicFunc_name = "ExonicFunc.refGene"; our $strand_name = "Strand"; our $context_name = "context"; # Font formats our ($format_A10, $format_A10Boldleft, $format_A10ItalicRed) = ("", "", ""); our ($formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader); our ($formatT_graphTitle); our ($table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2); # Hash table with the result of chi2 test for the strand bias our %h_chi2 = (); # For NMF input our %h_inputNMF = (); ###################################################################################################################################################### # MAIN # ###################################################################################################################################################### # Check the presence of the flags and create the output and temp directories CheckFlags(); # First check if the files are annotated or not. # If the files are annotated check there is enough mutations for generating the statistics, otherwise remove the samples from the analysis checkVariants(); # Retrieve chromosomes length checkChrDir(); # Calculate the statistics and generate the report ReportMutDist(); # Remove the temporary directory rmtree($folder_temp); rmtree($folderCheckedForStat); ###################################################################################################################################################### # FUNCTIONS # ###################################################################################################################################################### # Check the presence of the flags and create the output and temp directories sub CheckFlags { # Check the reference genome if($refGenome eq "empty") { print STDERR "Missing flag !\n"; print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n"; exit; } # If no output is specified write the result as the same place as the input file if($output eq "empty") { # The input is a folder with one or more annotated files my $directory = dirname( $input ); $folderMutAnalysis = "$directory/Mutational_Analysis"; if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } } else { if(!-e $output) { mkdir($output) or die "$!: $output\n"; } $folderMutAnalysis = "$output/Mutational_Analysis"; if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; } } # If no temp folder is specified write the result in the current path my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; } if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; } # Check the path to the R scripts if($path_R_Scripts eq "empty") { print STDERR "Missing flag !\n"; print STDERR "You forget to specify the path for the R scripts!!!\nPlease specify it with the flag --pathRscript\n"; exit; } foreach my $file (`ls $input/*`) { chomp($file); ## Verify the name of file, must be <= 31 chars for the sheet name my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); if(length($filename) > 31) { print STDERR "Error: The filename of: $file\nMust be <= 31 chars\nPlease modify it before running the script\n"; exit; } } } # Check input file(s) sub checkVariants { # Count the number of file(s) with enough mutations (at least 1 with a strand orientation) my $timerFile = 0; my @listRemovedFile = (); foreach my $file (`ls $input/*`) { chomp($file); ### Check if the file is annotated my $testAnnotation = "toto"; $testAnnotation = `grep 'Func.refGene' $file`; if($testAnnotation eq "toto") { print STDERR "Error: The input file you specify is not annotated!\nThe file concerned is: $file !!!!\nPlease first annotate your file before trying to generate the report on mutation spectra\n"; exit; } else { ### check if there is at least 1 mutation with a strand info my $strand_value = recoverNumCol($file, "Strand"); my $nbSBScoding = 0; open(F1, $file) or die "$!: $file\n"; my $header = <F1>; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); if($tab[$strand_value] ne "NA") { $nbSBScoding++; } } close F1; if($nbSBScoding != 0) { $timerFile++; `cp $file $folderCheckedForStat/`; } else { print STDOUT "\n\nWarning: There is no variant to compute statistics for $file\n\n"; push(@listRemovedFile, $file); } } } if($timerFile == 0) { print STDERR "\n\nError: No variants to compute statistics for:\n"; foreach (@listRemovedFile) { print STDERR $_."\n"; } exit; } } # Retrieve chromosomes length sub checkChrDir { my @files = `ls $path_SeqrefGenome/$refGenome"_seq"/*.fa`; foreach my $file (@files) { if ($file !~ /chr(\d+|x|y)\.fa/i){next;} open(FILE,$file); <FILE>; my $seq=""; while (<FILE>){ chomp; $seq.=$_;} $file =~ /chr(.*)\.fa/; $chromosomes{"chr".$1}=length($seq); } } # Calculate the statistics and generate the report sub ReportMutDist { print STDOUT "-----------------------------------------------------------------\n"; print STDOUT "-----------------Report Mutational Analysis----------------------\n"; print STDOUT "-----------------------------------------------------------------\n"; my $folderFigure = "$folderMutAnalysis/Figures"; if(-e $folderFigure) { rmtree($folderFigure); mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; } else { mkdir($folderFigure) or die "Can't create the directory $folderFigure\n"; } my $folderChi2 = "$folderFigure/Chi2"; if(!-e $folderChi2) { mkdir($folderChi2) or die "Can't create the directory $folderChi2\n"; } my $folderWebLogo = "$folderFigure/WebLogo"; if(!-e $folderWebLogo) { mkdir($folderWebLogo) or die "Can't create the directory $folderWebLogo\n"; } my $folderNMF = "$folderFigure/Input_NMF"; if(!-e $folderNMF) { mkdir($folderNMF) or die "Can't create the directory $folderNMF\n"; } ################################################################################################ ### Calculates all the statistics ### ################################################################################################ ########### Recover the functional region for all the samples. Allows to thave the same annotations for the pie chart "Impact on protein sequence" my @tab_func = recoverAnnovarAnnotation($func_name); if(@tab_func == 0) { print STDERR "Error: the table for the functional region is empty!!!!! check $folderCheckedForStat\n$func_name\n"; exit; } ############ Calculate the different statistics present in the report my %h_file = (); CalculateStatistics(\%h_file, \@tab_func); ############ Calculate the chi2 for the strand bias CalculateChi2(\%h_file, $folderChi2); ############ Write the different statistics present in the report WriteStatistics(\%h_file, $#tab_func, $folderFigure, $folderChi2, $folderNMF); ############ Create logo for studying the 10 flanking bases of the mutation CreateLogo(\%h_file, $folderWebLogo); } # Calculate the different statistics present in the report sub CalculateStatistics { my ($refH_file, $refT_func) = @_; our ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $exonicFunc_value, $strand_value, $contextSeq_value) = ("", "", "", "", "", "", "", "", "", ""); # Generate the pool of all the data if($poolData == 1) { my @listFile = `ls $folderCheckedForStat`; # For keeping the header only one time chomp($listFile[0]); system("cp $folderCheckedForStat/$listFile[0] $folderCheckedForStat/Pool_Data.txt"); open(OUT, ">>", "$folderCheckedForStat/Pool_Data.txt") or die "$!: $folderCheckedForStat/Pool_Data.txt\n"; for(my $i=1; $i<=$#listFile; $i++) { chomp($listFile[$i]); open(F1, "$folderCheckedForStat/$listFile[$i]") or die "$!: $folderCheckedForStat/$listFile[$i]\n"; my $header = <F1>; while(<F1>) { print OUT $_; } close F1; } close OUT; } foreach my $file (`ls $folderCheckedForStat/*`) { chomp($file); ############ Recover the number of the columns of interest $chr_value = recoverNumCol($file, $chr_name); $start_value = recoverNumCol($file, $start_name); $ref_value = recoverNumCol($file, $ref_name); $alt_value = recoverNumCol($file, $alt_name); $func_value = recoverNumCol($file, $func_name); $exonicFunc_value = recoverNumCol($file, $exonicFunc_name); $strand_value = recoverNumCol($file, $strand_name); $contextSeq_value = recoverNumCol($file, $context_name); ############ Recover the number of the columns of interest ############ Calculate the statistics for each file File2Hash($file, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func); } } # Calculate the chi2 for the strand bias sub CalculateChi2 { my ($refH_file, $folderChi2) = @_; # No value for the chi2 if(scalar (keys %{$refH_file}) == 0) { print STDERR "Error: No value for calculating the chi2 for the strand bias\n"; exit; } # Strand bias for one mutation type for all the samples my %h_tempchi2 = (); my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0,0,0,0, 0,0,0,0,0,0); my $nb_file = 0; foreach my $k_file (sort keys %{$refH_file}) { $nb_file++; foreach my $k_func (sort keys %{$refH_file->{$k_file}{'6mutType'}}) { foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'6mutType'}{$k_func}}) { if($k_mutation eq "C:G>A:T") { $h_tempchi2{'C>A'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $h_tempchi2{'C>A'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } if($k_mutation eq "C:G>G:C") { $h_tempchi2{'C>G'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $h_tempchi2{'C>G'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } if($k_mutation eq "C:G>T:A") { $h_tempchi2{'C>T'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $h_tempchi2{'C>T'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } if($k_mutation eq "T:A>A:T") { $h_tempchi2{'T>A'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $h_tempchi2{'T>A'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } if($k_mutation eq "T:A>C:G") { $h_tempchi2{'T>C'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $h_tempchi2{'T>C'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } if($k_mutation eq "T:A>G:C") { $h_tempchi2{'T>G'}{$k_file}{'NonTr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}; $h_tempchi2{'T>G'}{$k_file}{'Tr'} += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } } } } # Create the input file for NMF open(CHI2, ">", "$folderChi2/Input_chi2_strandBias.txt") or die "$!: $folderChi2/Input_chi2_strandBias.txt\n"; print CHI2 "SampleName\tNonTr\tTr\tAlteration\n"; foreach my $k_mutation (sort keys %h_tempchi2) { foreach my $k_file (sort keys %{$h_tempchi2{$k_mutation}}) { print CHI2 "$k_file\t$h_tempchi2{$k_mutation}{$k_file}{'NonTr'}\t$h_tempchi2{$k_mutation}{$k_file}{'Tr'}\t$k_mutation\n"; } } close CHI2; `Rscript $pathRscriptChi2test --folderChi2 $folderChi2 2>&1`; # `Rscript $pathRscriptChi2test $folderChi2 2>&1`; if(!-e "$folderChi2/Output_chi2_strandBias.txt") { print STDERR "Error: Chi2 test didn't work !!!\n"; exit; } } # Write the different statistics in the report sub WriteStatistics { my ($refH_file, $nb_func, $folderFigure, $folderChi2, $folderNMF) = @_; # Save the different graphs in specific folde if(!-e "$folderFigure/Overall_mutation_distribution") { mkdir("$folderFigure/Overall_mutation_distribution") or die "Can't create the directory $folderFigure/Overall_mutation_distribution\n"; } if(!-e "$folderFigure/Impact_protein_sequence") { mkdir("$folderFigure/Impact_protein_sequence") or die "Can't create the directory $folderFigure/Impact_protein_sequence\n"; } if(!-e "$folderFigure/SBS_distribution") { mkdir("$folderFigure/SBS_distribution") or die "Can't create the directory $folderFigure/SBS_distribution\n"; } if(!-e "$folderFigure/Stranded_Analysis") { mkdir("$folderFigure/Stranded_Analysis") or die "Can't create the directory $folderFigure/Stranded_Analysis\n"; } if(!-e "$folderFigure/Trinucleotide_Sequence_Context") { mkdir("$folderFigure/Trinucleotide_Sequence_Context") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context\n"; } if(!-e "$folderFigure/Distribution_SBS_Per_Chromosomes") { mkdir("$folderFigure/Distribution_SBS_Per_Chromosomes") or die "Can't create the directory $folderFigure/Distribution_SBS_Per_Chromosomes\n"; } # Create a workbook with all the samples my $wb = ""; my $ws_sum = ""; my ($ws_inputNMF_count, $ws_inputNMF_percent) = ("", ""); # Create one Excel file with all the samples if($oneReportPerSample == 2) { $wb = Spreadsheet::WriteExcel->new("$folderMutAnalysis/Report_Mutation_Spectra.xls"); ############## Set the variables for font formats in the Excel report Format_A10($wb, \$format_A10); # Text center in Arial 10 Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader); Format_GraphTitle($wb, \$formatT_graphTitle); Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2); Format_A10ItalicRed($wb, \$format_A10ItalicRed); ############### Worksheet with a summary of the samples $ws_sum = $wb->add_worksheet("Sample_List"); $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10); $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22); ############### Write the input matrix for NMF for the count and the un-normalized frequency $ws_inputNMF_count = $wb->add_worksheet("Input_NMF_Count"); $ws_inputNMF_percent = $wb->add_worksheet("Input_NMF_Percent"); } ################################################ Set the Rows and columns of the different part of the report my $row_SumSheet = 1; # First row for the summary sheet of the report my $rowStart_SBSdistrBySeg = 48; my $colStart_SBSdistrBySeg = 0; # For the table SBS distribution by segment my $colStart_matrixSeqContext = 19; # Sequence context my $col_inputNMF = 0; # Write the names of the samples with at least 33 SBS ## For each file foreach my $k_file (sort keys %{$refH_file}) { print "File in process: $k_file\n"; # Count the total of mutations for 6 mutation types on genomic strand my ($c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g) = (0,0,0, 0,0,0); if($k_file ne "Pool_Data") { $col_inputNMF++; } ############### Save the chi2 values into a hash table if(-e "$folderChi2/Output_chi2_strandBias.txt") { chi2hash("$folderChi2/Output_chi2_strandBias.txt", $k_file); } # Create one workbook for each sample if($oneReportPerSample == 1) { $wb = Spreadsheet::WriteExcel->new("$folderMutAnalysis/Report_Mutation_Spectra-$k_file.xls"); ############## Set the variables for font formats in the Excel report Format_A10($wb, \$format_A10); # Text center in Arial 10 Format_A10BoldLeft($wb, \$format_A10Boldleft); # Text on the left in Arial 10 bold Format_TextSection($wb, \$formatT_left, \$formatT_right, \$formatT_bottomRight, \$formatT_bottomLeft, \$formatT_bottom, \$formatT_bottomHeader, \$formatT_bottomRightHeader, \$formatT_bottomHeader2, \$formatT_rightHeader); Format_GraphTitle($wb, \$formatT_graphTitle); Format_Table($wb, \$table_topleft, \$table_topRight, \$table_bottomleft, \$table_bottomRight, \$table_top, \$table_right, \$table_bottom, \$table_bottomItalicRed, \$table_left, \$table_bottomrightHeader, \$table_left2, \$table_middleHeader, \$table_middleHeader2); Format_A10ItalicRed($wb, \$format_A10ItalicRed); ############### Worksheet with a summary of the samples $ws_sum = $wb->add_worksheet("Sample_List"); $ws_sum->write(0, 0, "Samples", $format_A10); $ws_sum->write(0, 1, "Total number SBS", $format_A10); $ws_sum->write(0, 2, "Total number of Indel", $format_A10); $ws_sum->write(0, 3, "Total number of mutations", $format_A10); $ws_sum->set_column(0,0, 50); $ws_sum->set_column(1,1, 20); $ws_sum->set_column(2,2, 20); $ws_sum->set_column(3,3, 22); # Write in the Samples sheet the name and the total number of SBS $ws_sum->write(1, 0, "$k_file", $format_A10); $ws_sum->write(1, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write(1, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10); } # One workbook with all the samples else { # Write in the Samples sheet the name and the total number of SBS $ws_sum->write($row_SumSheet, 0, $k_file, $format_A10); $ws_sum->write($row_SumSheet, 1, $refH_file->{$k_file}{'TotalSBSGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 2, $refH_file->{$k_file}{'TotalIndelGenomic'}, $format_A10); $ws_sum->write($row_SumSheet, 3, $refH_file->{$k_file}{'TotalMutGenomic'}, $format_A10); # For NMF don't consider the pool of the samples if($k_file ne "Pool_Data") { # Write in the input NMF sheet the name of the samples $ws_inputNMF_count->write(0, $col_inputNMF, $k_file); $ws_inputNMF_percent->write(0, $col_inputNMF, $k_file); } } # Calculate the correlation between the number of SBS and the size of the chromosome PearsonCoefficient($refH_file, $k_file); # Add a worksheet to the workbook my $ws = $wb->add_worksheet($k_file); # Write the titles of the different sections of the report WriteBorderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext); # Write the mutation types (6 types) WriteHeaderSection($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext); # Save the figures of each samples in a different folder if(!-e "$folderFigure/Overall_mutation_distribution/$k_file") { mkdir("$folderFigure/Overall_mutation_distribution/$k_file") or die "Can't create the directory $folderFigure/Overall_mutation_distribution/$k_file\n"; } if(!-e "$folderFigure/Impact_protein_sequence/$k_file") { mkdir("$folderFigure/Impact_protein_sequence/$k_file") or die "Can't create the directory $folderFigure/Impact_protein_sequence/$k_file\n"; } if(!-e "$folderFigure/SBS_distribution/$k_file") { mkdir("$folderFigure/SBS_distribution/$k_file") or die "Can't create the directory $folderFigure/SBS_distribution\n"; } if(!-e "$folderFigure/Stranded_Analysis/$k_file") { mkdir("$folderFigure/Stranded_Analysis/$k_file") or die "Can't create the directory $folderFigure/Stranded_Analysis/$k_file\n"; } if(!-e "$folderFigure/Trinucleotide_Sequence_Context/$k_file") { mkdir("$folderFigure/Trinucleotide_Sequence_Context/$k_file") or die "Can't create the directory $folderFigure/Trinucleotide_Sequence_Context/$k_file\n"; } ################################################################################################################################################## ################################################################# Write the statistics ########################################################## ################################################################################################################################################## my $row_SBSDistrBySegAndFunc_CG = $rowStart_SBSdistrBySeg+($nb_func*2)+16; ######## Count of SBS by functional impact on the protein (Table 2) + Create the input for ggplot2 (pie chart with functional impact) + Create the input for ggplot2 (pie chart of SBS vs. Indels) writeDistrFuncImpact($ws, $refH_file, $k_file, "$folderFigure/Impact_protein_sequence/$k_file/$k_file-DistributionExoFunc.txt", "$folderFigure/Overall_mutation_distribution/$k_file/$k_file-OverallMutationDistribution.txt"); ######## Result of the chi2 for the strand bias (Table 3) + Create the input for ggplot2 (Strand bias bar graph) writeChi2result($wb, $ws, "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandBias.txt", $refH_file, $k_file); ######## SBS distribution by functional region (Table 4) + Strand bias by functional region (Table 5) + Create the input for ggplot2 (SBS distribution) + Overall count and percent of SBS (Table 1) writeStatbyFuncRegion($refH_file, $k_file, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, \$row_SBSDistrBySegAndFunc_CG, "$folderFigure/SBS_distribution/$k_file/$k_file-SBS_distribution.txt"); ######## Distribution of SBS per chromosomes and the result of Pearson test (Table 6) writeDistrByChr($ws, $refH_file, $k_file, $row_SBSDistrBySegAndFunc_CG, $colStart_SBSdistrBySeg, "$folderFigure/Distribution_SBS_Per_Chromosomes/$k_file-DistributionSNVS_per_chromosome.txt"); ######## Trinucleotide sequence context on genomic strand (Panel 1) # Represent the trinucleotide with a heatmap with count of SBS my $heatmapCountggplot2 = "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapCount-Genomic.txt"; my $heatmapPercentggplot2 = "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-HeatmapPercent-Genomic.txt"; my $triNtBarChartggplot2 = "$folderFigure/Trinucleotide_Sequence_Context/$k_file/$k_file-MutationSpectraPercent-Genomic.txt"; writeTriNtGenomic($ws, $refH_file, $k_file, $colStart_matrixSeqContext, $heatmapCountggplot2, $heatmapPercentggplot2, $triNtBarChartggplot2, \$c_ca6_g, \$c_cg6_g, \$c_ct6_g, \$c_ta6_g, \$c_tc6_g, \$c_tg6_g); # For the input matrix for NMF if($k_file ne "Pool_Data") { push(@{$h_inputNMF{'Sample'}}, $k_file); } ######## Trinucleotide sequence context on genomic strand (Panel 2) my $triNtBarChartCodingCountggplot2 = "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignatureCount.txt"; my $triNtBarChartCodingPercentggplot2 = "$folderFigure/Stranded_Analysis/$k_file/$k_file-StrandedSignaturePercent.txt"; writeTriNtCoding($ws, $rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, $refH_file, $k_file, $triNtBarChartCodingCountggplot2, $triNtBarChartCodingPercentggplot2); ######## Generate the figures and include them in the report createWriteFigs($ws, $rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, $folderFigure, $k_file, $c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g); # Next sample $row_SumSheet++; } # End $k_file ######## Write the input matrix for NMF # One workbook with all the samples writeInputNMF($ws_inputNMF_count, $ws_inputNMF_percent, "$folderNMF/Input_NMF_Count.txt", "$folderNMF/Input_NMF_Frequency.txt"); # Close the workbook $wb->close(); } # Create logo for representing the sequence context with n bases sub CreateLogo { my ($refH_file, $folderWebLogo) = @_; my $folderSample = ""; foreach my $k_file (sort keys %{$refH_file}) { $folderSample = "$folderWebLogo/$k_file"; if(!-e $folderSample) { mkdir($folderSample) or die "Can't create the directory $folderSample\n"; } my $test_lengthSeqContext = 0; foreach my $k_mutation (sort keys %{$refH_file->{$k_file}{'WebLogo3'}}) { $k_mutation =~ /(\w)>(\w)/; my ($ref, $alt) = ($1, $2); open(WEBLOGO, ">", "$folderSample/$k_file-$ref$alt.fa") or die "$!: $folderSample/$k_file-$ref$alt.fa\n"; foreach (@{$refH_file->{$k_file}{'WebLogo3'}{$k_mutation}}) { print WEBLOGO ">$k_file\n$_\n"; if(length($_) < 10) { $test_lengthSeqContext = 0; } else { $test_lengthSeqContext = 1; } } close WEBLOGO; } ## Generate the logo foreach my $fastaFile (`ls $folderSample/*.fa`) { chomp($fastaFile); my ($filename, $directories, $suffix) = fileparse("$folderSample/$fastaFile", qr/\.[^.]*/); $filename =~ /(.+)\-/; my $title = $1; ## Test if there is fasta sequences for the mutation type my $nbLigne_temp = `wc -l $fastaFile`; my @nbLigne = split(" ", $nbLigne_temp); if($nbLigne[0] == 0) { print "WARNING: No sequence for $filename\n"; next; } # When length sequence context is lower than 10 the image is to small for adding a title if($test_lengthSeqContext == 1) { system("weblogo -c classic -F png_print -U probability --title $title < $fastaFile > $folderSample/$filename-Probability.png"); } else { system("weblogo -c classic -F png_print -U probability < $fastaFile > $folderSample/$filename-Probability.png"); } } } } ### Save the count of SBS for each file into a hash table sub File2Hash { my ($inputFile, $func_value, $exonicFunc_value, $chr_value, $ref_value, $alt_value, $strand_value, $contextSeq_value, $refH_file, $refT_func) = @_; my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/); # Initialisation of the hash my @tab_mutation = qw(C:G>A:T C:G>G:C C:G>T:A T:A>A:T T:A>C:G T:A>G:C); my @tab_aaChange = ("NonTr", "Tr", "TotalMutG"); my @tabExoFunc = ("frameshift insertion", "frameshift deletion", "frameshift block substitution", "frameshift substitution", "stopgain", "stoploss", "nonframeshift insertion", "nonframeshift deletion", "nonframeshift substitution", "nonframeshift block substitution", "nonsynonymous SNV", "synonymous SNV", "unknown", "NA"); # Total number of SBS on the genomic strand $refH_file->{$filename}{'TotalSBSGenomic'} = 0; # Total number of Indel on the genomic strand $refH_file->{$filename}{'TotalIndelGenomic'} = 0; # Total number of SBS on the coding strand $refH_file->{$filename}{'TotalSBSCoding'} = 0; # Total number of SBS and Indel on the genomic strand $refH_file->{$filename}{'TotalMutGenomic'} = 0; ##################################################### # Initialisation of the tables and hash tables # ##################################################### ## SBS by segment (6 mutation types) foreach my $elt_tabFunc (@$refT_func) { foreach my $elt_tabMutation (@tab_mutation) { foreach my $elt_aaChange (@tab_aaChange) { $refH_file->{$filename}{'6mutType'}{$elt_tabFunc}{$elt_tabMutation}{$elt_aaChange} = 0; } } } ## Pearson correlation $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = 0; # Count of SBS per chromosome foreach mutation types foreach my $elt_tabMutation (@tab_mutation) { foreach my $chromosome (sort keys %chromosomes){ $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'CHR'}{$chromosome}{'chr'} = 0;} $refH_file->{$filename}{'SBSPerChr'}{$elt_tabMutation}{'Pearson'} = 0; } foreach my $chromosome (sort keys %chromosomes) { $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}=0; } ## Impact of SBS on protein foreach my $elt_exoFunc (@tabExoFunc) { $refH_file->{$filename}{'ImpactSBS'}{$elt_exoFunc} = 0; } ## Sequence context (genomic strand) my @tab_mutation2 = qw(C>A C>G C>T T>A T>C T>G); my @tab_context = qw(A_A A_C A_G A_T C_A C_C C_G C_T G_A G_C G_G G_T T_A T_C T_G T_T); foreach my $elt_context (@tab_context) { foreach my $elt_mutation3 (@tab_mutation2) { $refH_file->{$filename}{'SeqContextG'}{$elt_context}{$elt_mutation3} = 0; } } ## Sequence context (coding strand) my @tab_TrNonTr = qw(NonTr Tr); foreach my $elt_context (@tab_context) { foreach my $elt_mutation2 (@tab_mutation2) { foreach my $trNonTr (@tab_TrNonTr) { $refH_file->{$filename}{'SeqContextC'}{$elt_context}{$elt_mutation2}{$trNonTr} = 0; } } } ##################################################### # Parse the intput file # ##################################################### open(F1,$inputFile) or die "$!: $inputFile\n"; my $header = <F1>; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); ### Don't consider random chromosomes and chromosome M if( ($tab[$chr_value] =~ /random/i) || ($tab[$chr_value] =~ /M/i) ) { next; } ### Recover the trinucleotide sequence context: Extract the base just before and after the mutation my $context = ""; my $contextSequence = $tab[$contextSeq_value]; $contextSequence =~ tr/a-z/A-Z/; my @tempContextSequence = split("", $contextSequence); my $total_nbBaseContext = $#tempContextSequence; my $midlle_totalNbBaseContext = $total_nbBaseContext/2; # For having the middle of the sequence my $before = $midlle_totalNbBaseContext - 1; my $after = $midlle_totalNbBaseContext + 1; $context = $tempContextSequence[$before]."_".$tempContextSequence[$after]; ### Recover the annotations on the impact on the protein for creating the pie chart my $exoFunc = ""; # Sometimes the annotation is repeated frameshift deletion;frameshift deletion if($tab[$exonicFunc_value] =~ /\;/) { my @temp = split(";", $tab[$exonicFunc_value]); if($temp[0] eq $temp[1]) { $exoFunc = $temp[0]; } } # The annotations have changed after MAJ Annovar 2014Jul22 (stopgain SNV => stopgain) elsif($tab[$exonicFunc_value] eq "stopgain SNV") { $exoFunc = "stopgain"; } elsif($tab[$exonicFunc_value] eq "stoploss SNV") { $exoFunc = "stoploss"; } elsif($tab[$exonicFunc_value] eq "nonsynonymous_SNV") { $exoFunc = "nonsynonymous SNV"; } elsif($tab[$exonicFunc_value] eq "stopgain_SNV") { $exoFunc = "stopgain SNV"; } elsif($tab[$exonicFunc_value] eq "synonymous_SNV") { $exoFunc = "synonymous SNV"; } else { $exoFunc = $tab[$exonicFunc_value]; } if(exists $refH_file->{$filename}{'ImpactSBS'}{$exoFunc}) { # If the sequence context if not recovered correctly don't considered the variants if( ($context =~ /N/) || (length($context) != 3) ) { next; } $refH_file->{$filename}{'ImpactSBS'}{$exoFunc}++; $refH_file->{$filename}{'TotalMutGenomic'}++; } else { print "WARNING: Exonic function not considered: $exoFunc\n"; } #### Only SBS are considered for the statistics if( ($tab[$ref_value] =~ /^[ACGT]$/i) && ($tab[$alt_value] =~ /^[ACGT]$/i) ) { # If the sequence context if not recovered correctly don't considered the variants if( ($context =~ /N/) || (length($context) != 3) ) { next; } # Total number of SBS on the genomic strand $refH_file->{$filename}{'TotalSBSGenomic'}++; # Total number of SBS on the coding strand with a sequence context if( ($tab[$strand_value] eq "+") || ($tab[$strand_value] eq "-") ) { if( ($context ne "NA") && (($context =~ /N/) || (length($context) != 3)) ) { next; } $refH_file->{$filename}{'TotalSBSCoding'}++; } } else { $refH_file->{$filename}{'TotalIndelGenomic'}++; } ### Number of SBS per chromosome: remove the "chr" my $chrNameForH=$tab[$chr_value]; if(exists $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chrNameForH}{'chr'}++; } #### Some func value are repeated and separated by ";" my $funcSegment = ""; if($tab[$func_value] =~ /;/) { my @temp = split(";", $tab[$func_value]); $funcSegment = $temp[0]; } else { $funcSegment = $tab[$func_value]; } ##################################################### # Calculate the statistics for each mutation type # ##################################################### if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "T") ) ) { my $mutation = "C:G>A:T"; statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext); } if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "C") ) ) { my $mutation = "C:G>G:C"; statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext); } if( (($tab[$ref_value] eq "C") && ($tab[$alt_value] eq "T")) || ( ($tab[$ref_value] eq "G") && ($tab[$alt_value] eq "A") ) ) { my $mutation = "C:G>T:A"; statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext); } if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "A")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "T") ) ) { my $mutation = "T:A>A:T"; statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext); } if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "C")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "G")) ) { my $mutation = "T:A>C:G"; statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext); } if( (($tab[$ref_value] eq "T") && ($tab[$alt_value] eq "G")) || ( ($tab[$ref_value] eq "A") && ($tab[$alt_value] eq "C")) ) { my $mutation = "T:A>G:C"; statPerMutType($filename, \@tab, $ref_value, $alt_value, \@tempContextSequence, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext); } } close F1; } ### Count the number of SBS for 12 and 6 categories sub statPerMutType { my ($filename, $refTab, $ref_value, $alt_value, $refTab_tempSeqContext, $before, $after, $context, $funcSegment, $mutation, $refH_file, $chrNameForH, $strand_value, $midlle_totalNbBaseContext) = @_; my @tab = @$refTab; my @tempContextSequence = @$refTab_tempSeqContext; # Split the mutations $mutation =~ /(\w)\:(\w)\>(\w)\:(\w)/; my ($ref1, $ref2, $alt1, $alt2) = ($1, $2, $3, $4); # Count the total number of mutations $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'TotalMutG'}++; # Pearson correlation if(exists $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}) { $refH_file->{$filename}{'SBSPerChr'}{$mutation}{'CHR'}{$chrNameForH}{'chr'}++; } #### Sequence context - 6 mutation types - genomic strand my $mutationSeqContext6mutType = "$ref1>$alt1"; # We want to express the mutation in C> or T> if( ($tab[$ref_value] eq $ref2) && ($tab[$alt_value] eq $alt2) ) { my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); my $context_reverse = $base5."_".$base3; if(exists $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context_reverse}{$mutationSeqContext6mutType}++; } } elsif(exists $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}) { $refH_file->{$filename}{'SeqContextG'}{$context}{$mutationSeqContext6mutType}++; } #### Strand analysis C>N and T>N on NonTr strand if( (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1))) || (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq $ref2)&&($tab[$alt_value] eq $alt2))) ) { if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'NonTr'}++; } # C>A With the sequence context (C>N and T>N on strand +) if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1)) ) { if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'NonTr'}++; } } # C>A With the sequence context (G>N and A>N on strand -) else { my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); my $context_reverse = $base5."_".$base3; if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'NonTr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'NonTr'}++; } } } #### Strand analysis C>N and T>N on Tr strand if( (($tab[$strand_value] eq "-") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1))) || (($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref2)&&($tab[$alt_value] eq $alt2))) ) { if(exists $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}) { $refH_file->{$filename}{'6mutType'}{$funcSegment}{$mutation}{'Tr'}++; } # C>N and T>N With the sequence context (strand -) if( ($tab[$strand_value] eq "-") && (($tab[$ref_value] eq $ref1)&&($tab[$alt_value] eq $alt1)) ) { if(exists $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context}{$mutationSeqContext6mutType}{'Tr'}++; } } # C>N and T>N with the sequence context (strand +) if( ($tab[$strand_value] eq "+") && (($tab[$ref_value] eq $ref2)&&($tab[$alt_value] eq $alt2)) ) { my $base3 = complement($tempContextSequence[$before]); my $base5 = complement($tempContextSequence[$after]); my $context_reverse = $base5."_".$base3; if(exists $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'Tr'}) { $refH_file->{$filename}{'SeqContextC'}{$context_reverse}{$mutationSeqContext6mutType}{'Tr'}++; } } } #### WebLogo-3 if(($tab[$ref_value] eq $ref1) && ($tab[$alt_value] eq $alt1)) { # For the logo all the sequences must have the same length if(scalar(@tempContextSequence) == 2) { next; } my ($contextTemp1, $contextTemp2) = ("", ""); for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= $tempContextSequence[$i]; } for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= $tempContextSequence[$i]; } my $context = $contextTemp1.$ref1.$contextTemp2; push(@{$refH_file->{$filename}{'WebLogo3'}{$mutationSeqContext6mutType}}, $context); } else { if(scalar(@tempContextSequence) == 2) { next; } my ($contextTemp1, $contextTemp2) = ("", ""); for(my $i=0; $i<$midlle_totalNbBaseContext; $i++) { $contextTemp1 .= complement($tempContextSequence[$i]); } for(my $i=$midlle_totalNbBaseContext+1; $i<=$#tempContextSequence; $i++) { $contextTemp2 .= complement($tempContextSequence[$i]); } my $context = $contextTemp1.$ref1.$contextTemp2; $context = reverse $context; push(@{$refH_file->{$filename}{'WebLogo3'}{$mutationSeqContext6mutType}}, $context); } } # Calculate the correlation between the number of SBS and the size of the chromosome sub PearsonCoefficient { our ($refH_file, $filename) = @_; #### Calculate the Pearson coefficient my @total_SBS = (); # Pearson for all mutation types # Create a 2D array foreach my $k_mutation (sort keys %{$refH_file->{$filename}{'SBSPerChr'}}) { my $x = []; my $correlation = 0; if($k_mutation eq "AllMutType") { next; } elsif($k_mutation eq "TotalPerChr") { next; } elsif($k_mutation eq "ChrSize") { next; } else { my $testZero = 0; # The correlation function doesn't works if all the variables are equal to zero # generate an anonymous 2D array where $x->[1] is the row # $x->[1][1] is the value in row 1 column 1 and $x->[1][2] is the value of row 1 column 2 # once you build the entire array, pass it to the correlation subroutine my $i=1; while ( my ($chromosome, $lenght) = each (%chromosomes)) { $x->[$i][1] = $lenght; # First column contains the chromosome size $x->[$i][2] = $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'}; # Second column contains the count of SBS if($refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}{$chromosome}{'chr'}==0) { $testZero++; } $i++; } if( $testZero == keys %{$refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'CHR'}} ) { $correlation = 0; } # Pass the 2D array to the correlation subroutine else { $correlation = correlation($x); } $refH_file->{$filename}{'SBSPerChr'}{$k_mutation}{'Pearson'} = $correlation; # Pearson per mutation type } } #generate an anonymous 2D array for all mutation type my $testZero = 0; my $x = []; my $correlation = 0; my $i=1; while ( my ($chromosome, $lenght) = each (%chromosomes)) { $x->[$i][1] = $lenght; $x->[$i][2] = $refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}; $i++; } if($testZero == keys %{$refH_file->{$filename}{'SBSPerChr'}{'TotalPerChr'}} ) { $correlation = 0; } else { $correlation = correlation($x); } # Pass the 2D array to the correlation subroutine $refH_file->{$filename}{'SBSPerChr'}{'AllMutType'} = $correlation; sub correlation { my ($x) = @_; my ($mean_x,$mean_y) = mean($x); my $ssxx=ss($x,$mean_x,$mean_y,1,1); my $ssyy=ss($x,$mean_x,$mean_y,2,2); my $ssxy=ss($x,$mean_x,$mean_y,1,2); my $correl=correl($ssxx,$ssyy,$ssxy);; my $xcorrel=sprintf("%.2f",$correl); return($xcorrel); sub mean { my ($x)=@_; my $num = scalar(@{$x}) - 2; my $sum_x = '0'; my $sum_y = '0'; for (my $i = 2; $i < scalar(@{$x}); ++$i) { $sum_x += $x->[$i][1]; $sum_y += $x->[$i][2]; } my $mu_x = $sum_x / $num; my $mu_y = $sum_y / $num; return($mu_x,$mu_y); } ### ss = sum of squared (deviations to the mean) sub ss { my ($x,$mean_x,$mean_y,$one,$two)=@_; my $sum = '0'; for (my $i=2;$i<scalar(@{$x});++$i) { $sum += ($x->[$i][$one]-$mean_x)*($x->[$i][$two]-$mean_y); } return $sum; } sub correl { my($ssxx,$ssyy,$ssxy)=@_; my ($sign, $correl) = (0,0); if(abs($ssxy) == 0) { $sign = 0 } else { $sign=$ssxy/abs($ssxy); } if( ($ssxx==0) || ($ssyy==0) ) { $correl = 0 } else { $correl=$sign*sqrt($ssxy*$ssxy/($ssxx*$ssyy)); } return $correl; } } } # Save the output of the chi2 into a hash table for writing the results in the Excel file sub chi2hash { my ($outputChi2, $k_file) = @_; open(F1, $outputChi2) or die "$!: $outputChi2\n"; my $header = <F1>; # Strand_Bias($tab[0]) NonTr-Tr($tab[1]) Proportion($tab[2]) P-val-Chi2($tab[3]) FDR($tab[4]) Confidence Interval($tab[5]) Mutation_Type($tab[6]) SampleName($tab[7]) while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); if($tab[7] eq $k_file) { if($tab[1] eq "NA") { $h_chi2{$tab[7]}{$tab[6]}{'NonTr'} = "NA"; $h_chi2{$tab[7]}{$tab[6]}{'Tr'} = "NA"; } else { my ($nonTr, $tr) = split("-", $tab[1]); $h_chi2{$tab[7]}{$tab[6]}{'NonTr'} = $nonTr; $h_chi2{$tab[7]}{$tab[6]}{'Tr'} = $tr; } $h_chi2{$tab[7]}{$tab[6]}{'p-value'} = $tab[3]; $h_chi2{$tab[7]}{$tab[6]}{'ConfInt'} = $tab[5]; # For the pool data the FDR isn't calculated so replace the NA (=Missing values) by "-" if($tab[7] eq "Pool_Data") { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = "-"; } else { $h_chi2{$tab[7]}{$tab[6]}{'FDR'} = $tab[4]; } } } close F1; } ### Complement bases (for the sequence context) sub complement { if($_[0] eq "A") { return "T"; } if($_[0] eq "C") { return "G"; } if($_[0] eq "G") { return "C"; } if($_[0] eq "T") { return "A"; } } ### Recover the functional region for all the samples. Allows to thave the same annotations for the pie chart "Impact on protein sequence" sub recoverAnnovarAnnotation { my ($func_name) = @_; my %hash = (); # The input is a folder foreach my $file (`ls $folderCheckedForStat/*`) { chomp($file); my $AV_annotation_value = recoverNumCol($file, $func_name); open(F1, $file) or die "$!: $file\n"; my $header = <F1>; while(<F1>) { $_ =~ s/[\r\n]+$//; my @tab = split("\t", $_); # Some files can have an empty line at the end and WE DON'T WANT to consider it if(! defined $tab[0]) { next; } # Some func value are repeated and separated by ";" my $funcSegment = ""; if($tab[$AV_annotation_value] =~ /;/) { my @temp = split(";", $tab[$AV_annotation_value]); $funcSegment = $temp[0]; } else { $funcSegment = $tab[$AV_annotation_value]; } $hash{$funcSegment} = ""; } close F1; } my @tab_AVAnnotation = (); foreach my $k (sort keys %hash) { push(@tab_AVAnnotation, $k); } return @tab_AVAnnotation; } sub recoverNumCol { my ($input, $name_of_column) = @_; open(F1,$input) or die "recoverNumCol: $!: $input\n"; # For having the name of the columns my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); close F1; # The number of the column my $name_of_column_NB = "toto"; for(my $i=0; $i<=$#tab_search_header; $i++) { if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; } } if($name_of_column_NB eq "toto") { print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit 3; } else { return $name_of_column_NB; } } ###################################################################################################################################################### # Functions for writing in the Excel report # ###################################################################################################################################################### # Write the header for the six mutation types sub WriteHeaderSection { our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_; our ($format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2); Format_Header($wb, \$format_CA, \$format_CG, \$format_CT, \$format_TA, \$format_TC, \$format_TG, \$format_TG2, \$format_LeftHeader, \$format_RightHeader, \$format_LeftHeader2); our ($format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG); Format_HeaderSBSDistrBySegAndFunc($wb, \$format_LeftCA, \$format_LeftCG, \$format_LeftCT, \$format_LeftTA, \$format_LeftTC, \$format_LeftTG, \$format_RightCA, \$format_RightCG, \$format_RightCT, \$format_RightTA, \$format_RightTC, \$format_RightTG); our $format_A11Bold = ""; Format_A11Bold($wb, \$format_A11Bold); # Arial 11 bold and center our $format_A11BoldLeft = ""; Format_A11BoldLeft($wb, \$format_A11BoldLeft); # Arial 11 bold and left our ($format_header12CA, $format_header12CG, $format_header12CT, $format_header12TA, $format_header12TC, $format_header12TG); Format_Header12MutType($wb, \$format_header12CA, \$format_header12CG, \$format_header12CT, \$format_header12TA, \$format_header12TC, \$format_header12TG); ## Header for SBS distribution by segment HeaderMutTypeSBSDistrBySeg(); ## Header for strand bias by function $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 11); my $row = $rowStart_SBSdistrBySeg+$nb_func+10; my $col = $colStart_SBSdistrBySeg; $ws->write($row, $col+1, ' ', $format_CA); $ws->write($row, $col+2, "C>A", $format_CA); $ws->write($row, $col+3, ' ', $format_CA); $ws->write($row, $col+5, ' ', $format_CG); $ws->write($row, $col+6, "C>G", $format_CG); $ws->write($row, $col+7, ' ', $format_CG); $ws->write($row, $col+9, ' ', $format_CT); $ws->write($row, $col+10, "C>T", $format_CT); $ws->write($row, $col+11, ' ', $format_RightCT); $row = $rowStart_SBSdistrBySeg+($nb_func*2)+14; $ws->write($row, $col+1, ' ', $format_TA); $ws->write($row, $col+2, "T>A", $format_TA); $ws->write($row, $col+3, ' ', $format_TA); $ws->write($row, $col+5, ' ', $format_TC); $ws->write($row, $col+6, "T>C", $format_TC); $ws->write($row, $col+7, ' ', $format_TC); $ws->write($row, $col+9, ' ', $format_TG2); $ws->write($row, $col+10, "T>G", $format_TG2); $ws->write($row, $col+11, ' ', $format_RightTG); $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+11, 18); $ws->set_row($rowStart_SBSdistrBySeg+($nb_func*2)+15, 18); $ws->set_column($colStart_SBSdistrBySeg+5, $colStart_SBSdistrBySeg+5, 13); $ws->set_column($colStart_SBSdistrBySeg+9, $colStart_SBSdistrBySeg+9, 13); for(my $i=$rowStart_SBSdistrBySeg+$nb_func+10; $i<=$rowStart_SBSdistrBySeg+($nb_func*2)+14; $i+=$nb_func+4) { $ws->write($i+1, $colStart_SBSdistrBySeg, 'Segment', $format_LeftHeader); $ws -> write($i+1, $colStart_SBSdistrBySeg+1, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+2, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+3, 'Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+5, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+6, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+7, 'Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+9, 'Non-Tr/Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+10, 'Non-Tr', $format_A11Bold); $ws -> write($i+1, $colStart_SBSdistrBySeg+11, 'Tr', $format_RightHeader); } ## Header for Counts of SBS per chromosome and mutation type HeaderCountSBSPerChr(); ## Header for the short sequence context HeaderShortTriNtContext(); ## Header for the 12 mutation types with the sequence context (coding strand) HeaderLongTriNtContext(); sub HeaderMutTypeSBSDistrBySeg { $ws->set_row($rowStart_SBSdistrBySeg+2, 18); $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+2, "C:G>A:T", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+3, $format_CA); $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+5, $format_CG); $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+6, "C:G>T:A", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+7, $format_CT); $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+8, "T:A>A:T", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+9, $format_TA); $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+10, "T:A>C:G", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+11, $format_TC); $ws->write($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+12, "T:A>G:C", $format_TG); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg+13, $format_TG); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg, "Segment", $format_LeftHeader); $ws->set_column($colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, 13); $ws->set_row($rowStart_SBSdistrBySeg+3, 18); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+1, "Total SBS", $format_A11Bold); $ws->set_column($colStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+1, 11); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+2, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+3, "#", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+4, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+5, "#", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+6, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+7, "#", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+8, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+9, "#", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+10, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+11, "#", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_SBSdistrBySeg+12, "%", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+3, 13, "#", $format_RightHeader); } sub HeaderCountSBSPerChr { $ws->set_column(3,3, 10); $ws->set_column(4,4, 10); $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, 18); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg+1, "Pearson", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg, "Chr", $format_LeftHeader); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+1, "Size", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+2, "All SBS", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+3, "C:G>A:T", $format_CA); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+4, "C:G>G:C", $format_CG); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+5, "C:G>T:A", $format_CT); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+6, "T:A>A:T", $format_TA); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+7, "T:A>C:G", $format_TC); $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+8, $colStart_SBSdistrBySeg+8, "T:A>G:C", $format_TG); } sub HeaderShortTriNtContext { ### GENOMIC STRAND $ws->write(2, $colStart_matrixSeqContext, 'Count matrix', $format_LeftHeader2); $ws->write(3, $colStart_matrixSeqContext+4, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+5, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+6, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+7, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+8, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+9, 'T>G', $format_TG2); $ws->write(2, $colStart_matrixSeqContext+11, 'Frequency matrix', $format_A11BoldLeft); $ws->write(3, $colStart_matrixSeqContext+14, 'C>A', $format_CA); $ws->write(3, $colStart_matrixSeqContext+15, 'C>G', $format_CG); $ws->write(3, $colStart_matrixSeqContext+16, 'C>T', $format_CT); $ws->write(3, $colStart_matrixSeqContext+17, 'T>A', $format_TA); $ws->write(3, $colStart_matrixSeqContext+18, 'T>C', $format_TC); $ws->write(3, $colStart_matrixSeqContext+19, 'T>G', $format_TG2); ### sequence context with a bar graph $ws->write(25, $colStart_matrixSeqContext+10, "Mutation spectra frequency", $format_A11Bold); } sub HeaderLongTriNtContext { $ws->set_row($rowStart_SBSdistrBySeg+3, 15); $ws->set_row($rowStart_SBSdistrBySeg+4, 15); $ws->set_row($rowStart_SBSdistrBySeg+5, 15); $ws->write($rowStart_SBSdistrBySeg+3, $colStart_matrixSeqContext, "Count matrix", $format_LeftHeader2); $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold); $ws->set_row($rowStart_SBSdistrBySeg+24, 15); $ws->set_row($rowStart_SBSdistrBySeg+25, 15); $ws->set_row($rowStart_SBSdistrBySeg+26, 15); $ws->write($rowStart_SBSdistrBySeg+24, $colStart_matrixSeqContext, "Frequency matrix", $format_LeftHeader2); $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+1, "C>A", $format_CA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+2, $format_CA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+1, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+2, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+3, "C>G", $format_CG); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+4, $format_CG); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+3, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+4, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+5, "C>T", $format_CT); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+6, $format_CT); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+5, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+6, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+7, "T>A", $format_TA); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+8, $format_TA); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+7, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+8, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+9, "T>C", $format_TC); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+10, $format_TC); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+9, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+10, "Tr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+11, "T>G", $format_TG2); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext+12, $format_TG2); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+11, "NonTr", $format_A11Bold); $ws->write($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext+12, "Tr", $format_A11Bold); } } # Write the titles of the different sections of the report sub WriteBorderSection { our ($wb, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $colStart_matrixSeqContext) = @_; our ($format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left); Format_section($wb, \$format_topLeft, \$format_topRight, \$format_bottomLeft, \$format_bottomRight, \$format_top, \$format_right, \$format_bottom, \$format_left); TableSBSDistrBySeg(); TableStrandBiasBySegment(); CountSBSPerChr(); ShortTriNtContext(); # 6 mut type LongTriNtContext(); # 12 mut type sub TableSBSDistrBySeg { # Top-Left $ws->write($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, "Table 4. SBS distribution by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg, 18); # Set the height of the row to 0.25" # Top for(my $i=1; $i<=13; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+$i, $format_top); } # Top-Right $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $format_topRight); # Right $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg+13, $format_right); # Bottom-left $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+5, $colStart_SBSdistrBySeg, $format_bottomLeft); # Left $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_SBSdistrBySeg, $format_left); } sub TableStrandBiasBySegment { # Top-Left $ws->write($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg, "Table 5. Strand bias by functional region", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+$nb_func+8, 18); # Set the height of the row to 0.25" # Top for(my $i=1; $i<=10; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+$i, $format_top); } # Top-Right $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+8, $colStart_SBSdistrBySeg+11, $format_topRight); # Right $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg+11, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg+11, $format_right); # Left $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+9, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+$nb_func+10, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+13, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*2)+14, $colStart_SBSdistrBySeg, $format_left); # Bottom $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+4, $format_bottom); $ws->write_blank($rowStart_SBSdistrBySeg+($nb_func*3)+16, $colStart_SBSdistrBySeg+8, $format_bottom); } sub CountSBSPerChr { #### Top-Left $ws->write($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg, "Table 6. SBS distribution per chromosome", $format_topLeft); $ws->set_row($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, 18); # Set the height of the row to 0.25" #### Top for(my $i=1; $i<8; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+$i, $format_top); } #### Top-Right $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+4, $colStart_SBSdistrBySeg+8, $format_topRight); #### Right $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg+8, $format_right); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg+8, $format_right); #### Bottom-Right # Human genome = 24 chromosomes if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+8, $format_bottomRight); } # Mouse genome = 21 chromosomes if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+8, $format_bottomRight); } # Rat genome = 22 chromosomes if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+8, $format_bottomRight); } #### Bottom if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+1, $format_bottom); for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg+$i, $format_bottom); } } if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+1, $format_bottom); for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg+$i, $format_bottom); } } if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+1, $format_bottom); for(my $i=3; $i<=7; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg+$i, $format_bottom); } } #### Left $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+5, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+6, $colStart_SBSdistrBySeg, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+7, $colStart_SBSdistrBySeg, $format_left); #### Bottom-left if($refGenome =~ /hg/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+33, $colStart_SBSdistrBySeg, $format_bottomLeft); } if($refGenome =~ /mm/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+30, $colStart_SBSdistrBySeg, $format_bottomLeft); } if($refGenome =~ /rn/) { $ws->write_blank($rowStart_SBSdistrBySeg+8+$nb_func+(($nb_func+4)*2)+31, $colStart_SBSdistrBySeg, $format_bottomLeft); } } sub ShortTriNtContext { my $format_headerSection = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); $format_headerSection->set_left(2); $format_headerSection->set_left_color('blue'); # Top-left $ws->write(0, $colStart_matrixSeqContext, "Panel 1. Trinucleotide sequence context of SBS on the genomic sequence", $format_topLeft); # Top for(my $i=1; $i<=19; $i++) { $ws->write_blank(0, $colStart_matrixSeqContext+$i, $format_top); } # Top-right $ws->write_blank(0, $colStart_matrixSeqContext+20, $format_topRight); # Right for(my $i=1; $i<=37; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext+20, $format_right); } # Bottom-right $ws->write_blank(37, $colStart_matrixSeqContext+20, $format_bottomRight); # Bottom for(my $i=1; $i<=19; $i++) { $ws->write_blank(38, $colStart_matrixSeqContext+$i, $format_top); } # Bottom-left $ws->write_blank(37, $colStart_matrixSeqContext, $format_bottomLeft); # Left $ws->write(1, $colStart_matrixSeqContext, "", $format_left); for(my $i=3; $i<=36; $i++) { $ws->write_blank($i, $colStart_matrixSeqContext, $format_left); } } sub LongTriNtContext { # Top-left $ws->write($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext, "Panel 2. Stranded analysis of trinucleotide sequence context of SBS", $format_topLeft); # Top for(my $i=1; $i<=28; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+$i, $format_top); } # Top-right $ws->write_blank($rowStart_SBSdistrBySeg, $colStart_matrixSeqContext+29, $format_topRight); # Right for(my $i=1; $i<=42; $i++) { $ws->write_blank($rowStart_SBSdistrBySeg+$i, $colStart_matrixSeqContext+29, $format_right); } # Bottom-right $ws->write_blank(91, $colStart_matrixSeqContext+29, $format_bottomRight); # Bottom for(my $i=13; $i<=28; $i++) { $ws->write_blank(92, $colStart_matrixSeqContext+$i, $format_top); } # Bottom-left $ws->write_blank(91, $colStart_matrixSeqContext, $format_bottomLeft); # Left $ws->write_blank($rowStart_SBSdistrBySeg+1, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+2, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+4, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+5, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+22, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+23, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+25, $colStart_matrixSeqContext, $format_left); $ws->write_blank($rowStart_SBSdistrBySeg+26, $colStart_matrixSeqContext, $format_left); } } ############################################################################################################ # Write count of SBS by functional impact on the protein (Table 2) + Create the input for ggplot2 (pie chart with functional impact) + Create the input for ggplot2 (pie chart of SBS vs. Indels) sub writeDistrFuncImpact { my ($ws, $refH_file, $sample, $funcImpactggplot2, $overallDistrggplot2) = @_; # Set the row for the table 2 my $lImpactSBS = 31; my ($deletion, $insertion) = (0, 0); $ws->write(29, 6, "Table 2. Frequency and counts of functional impact", $format_A10Boldleft); $ws->set_column(6, 6, 13); $ws->set_column(10, 10, 15); $ws->write(30, 6, "RefSeq gene", $table_topleft); $ws->write(30, 7, "", $table_top); $ws->write(30, 8, "Percent", $table_top); $ws->write(30, 9, "Count", $table_topRight); open(IMPACTSBS, ">", $funcImpactggplot2) or die "$!: $funcImpactggplot2\n"; print IMPACTSBS "AA_Change\tCount\tPercent\n"; # Pie chart with the distribution of SBS vs Indel open(SBSINDEL, ">", $overallDistrggplot2) or die "$!: $overallDistrggplot2\n"; print SBSINDEL "Variant_type\tCount\tPercent\n"; foreach my $k_exoFunc(sort keys %{$refH_file->{$sample}{'ImpactSBS'}}) { my $percent = ($refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc} / $refH_file->{$sample}{'TotalMutGenomic'})*100; $percent = sprintf("%.2f", $percent); if($k_exoFunc eq "NA") { print IMPACTSBS "Not_Applicable\t$percent\t$refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}\n"; } else { my $temp = $k_exoFunc; $temp =~ s/ /_/g; print IMPACTSBS "$temp\t$percent\t$refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}\n"; } $ws->write($lImpactSBS, 6, $k_exoFunc, $table_left2); $ws->write($lImpactSBS, 8, $percent, $format_A10); $ws->write($lImpactSBS, 9, $refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}, $table_right); $lImpactSBS++; # Pie chart with the distribution of SBS vs Indel if($k_exoFunc =~ /deletion/i) { $deletion += $refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}; } elsif($k_exoFunc =~ /insertion/i) { $insertion += $refH_file->{$sample}{'ImpactSBS'}{$k_exoFunc}; } } close IMPACTSBS; $ws->write($lImpactSBS, 9, $refH_file->{$sample}{'TotalMutGenomic'}, $table_bottomrightHeader); $ws->write($lImpactSBS, 6, "", $table_bottomleft); $ws->write($lImpactSBS, 7, "", $table_bottom); $ws->write($lImpactSBS, 8, "", $table_bottom); # Pie chart with the distribution of SBS vs Indel my $percentSBSIndel = ($deletion/$refH_file->{$sample}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel); print SBSINDEL "Deletion\t$deletion\t$percentSBSIndel\n"; $percentSBSIndel = ($insertion/$refH_file->{$sample}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel); print SBSINDEL "Insertion\t$insertion\t$percentSBSIndel\n"; $percentSBSIndel = ($refH_file->{$sample}{TotalSBSGenomic}/$refH_file->{$sample}{'TotalMutGenomic'})*100; $percentSBSIndel = sprintf("%.2f", $percentSBSIndel); print SBSINDEL "SBS\t$refH_file->{$sample}{TotalSBSGenomic}\t$percentSBSIndel\n"; close SBSINDEL; } ############################################################################################################ # Write the result of the chi2 for the strand bias (Table 3) sub writeChi2result { my ($wb, $ws, $strandBiasggplot2, $refH_file, $k_file) = @_; # Define the header of the table 3 $ws->write(28, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft); $ws->set_column(11, 11, 13); $ws->set_column(16, 16, 15); $ws->set_column(17, 17, 10); $ws->write(29, 11, "Mutation Type", $table_topleft); $ws->write(29, 12, "Non-Tr/Tr", $table_top); $ws->write(29, 13, "Non-Tr", $table_top); $ws->write(29, 14, "Tr", $table_top); $ws->write(29, 15, "P-value", $table_top); $ws->write(29, 16, "FDR q value", $table_top); $ws->write(29, 17, "95% CI", $table_topRight); $ws->write(39, 11, "Table 3. Significance of the strand biases", $format_A10Boldleft); $ws->write(40, 11, "Mutation Type", $table_topleft); $ws->write(40, 12, "Non-Tr/Tr", $table_top); $ws->write(40, 13, "Non-Tr", $table_top); $ws->write(40, 14, "Tr", $table_top); $ws->write(40, 15, "P-value", $table_top); $ws->write(40, 16, "FDR q value", $table_top); $ws->write(40, 17, "95% CI", $table_topRight); # Define the count on non-transcribed and transcribed strand # C>A my ($ca_NonTr, $ca_Tr) = (0,0); if( ($h_chi2{$k_file}{"C>A"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"C>A"}{'Tr'} eq "NA") ) { $ca_NonTr = 0; $ca_Tr = 0; } else { $ca_NonTr = $h_chi2{$k_file}{"C>A"}{'NonTr'}; $ca_Tr = $h_chi2{$k_file}{"C>A"}{'Tr'}; } # C>G my ($cg_NonTr, $cg_Tr) = (0,0); if( ($h_chi2{$k_file}{"C>G"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"C>G"}{'Tr'} eq "NA") ) { $cg_NonTr = 0; $cg_Tr = 0; } else { $cg_NonTr = $h_chi2{$k_file}{"C>G"}{'NonTr'}; $cg_Tr = $h_chi2{$k_file}{"C>G"}{'Tr'}; } # C>T my ($ct_NonTr, $ct_Tr) = (0,0); if( ($h_chi2{$k_file}{"C>T"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"C>T"}{'Tr'} eq "NA") ) { $ct_NonTr = 0; $ct_Tr = 0; } else { $ct_NonTr = $h_chi2{$k_file}{"C>T"}{'NonTr'}; $ct_Tr = $h_chi2{$k_file}{"C>T"}{'Tr'}; } # T>A my ($ta_NonTr, $ta_Tr) = (0,0); if( ($h_chi2{$k_file}{"T>A"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"T>A"}{'Tr'} eq "NA") ) { $ta_NonTr = 0; $ta_Tr = 0; } else { $ta_NonTr = $h_chi2{$k_file}{"T>A"}{'NonTr'}; $ta_Tr = $h_chi2{$k_file}{"T>A"}{'Tr'}; } # T>C my ($tc_NonTr, $tc_Tr) = (0,0); if( ($h_chi2{$k_file}{"T>C"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"T>C"}{'Tr'} eq "NA") ) { $tc_NonTr = 0; $tc_Tr = 0; } else { $tc_NonTr = $h_chi2{$k_file}{"T>C"}{'NonTr'}; $tc_Tr = $h_chi2{$k_file}{"T>C"}{'Tr'}; } # T>G my ($tg_NonTr, $tg_Tr) = (0,0); if( ($h_chi2{$k_file}{"T>G"}{'NonTr'} eq "NA") || ($h_chi2{$k_file}{"T>G"}{'Tr'} eq "NA") ) { $tg_NonTr = 0; $tg_Tr = 0; } else { $tg_NonTr = $h_chi2{$k_file}{"T>G"}{'NonTr'}; $tg_Tr = $h_chi2{$k_file}{"T>G"}{'Tr'}; } # Create an input for representing the strand bias with ggplot2 open(SB, ">", $strandBiasggplot2) or die "$!: $strandBiasggplot2\n"; print SB "Alteration\tStrand\tCount\n"; print SB "C>A\tNonTranscribed\t".$ca_NonTr."\n"."C>A\tTranscribed\t".$ca_Tr."\n"; print SB "C>G\tNonTranscribed\t".$cg_NonTr."\n"."C>G\tTranscribed\t".$cg_Tr."\n"; print SB "C>T\tNonTranscribed\t".$ct_NonTr."\n"."C>T\tTranscribed\t".$ct_Tr."\n"; print SB "T>A\tNonTranscribed\t".$ta_NonTr."\n"."T>A\tTranscribed\t".$ta_Tr."\n"; print SB "T>C\tNonTranscribed\t".$tc_NonTr."\n"."T>C\tTranscribed\t".$tc_Tr."\n"; print SB "T>G\tNonTranscribed\t".$tg_NonTr."\n"."T>G\tTranscribed\t".$tg_Tr."\n"; close SB; ### Calcul the ratio for the strand bias and write it on the Excel file (Table 3) writeRatioSB($ca_NonTr, $ca_Tr, $ws, 30, $refH_file, $k_file, "C>A", "G>T", $format_A10, $table_left, $table_middleHeader, $table_right); writeRatioSB($cg_NonTr, $cg_Tr, $ws, 31, $refH_file, $k_file, "C>G", "G>C", $format_A10, $table_left, $table_middleHeader, $table_right); writeRatioSB($ct_NonTr, $ct_Tr, $ws, 32, $refH_file, $k_file, "C>T", "G>A", $format_A10, $table_left, $table_middleHeader, $table_right); writeRatioSB($ta_NonTr, $ta_Tr, $ws, 33, $refH_file, $k_file, "T>A", "A>T", $format_A10, $table_left, $table_middleHeader, $table_right); writeRatioSB($tc_NonTr, $tc_Tr, $ws, 34, $refH_file, $k_file, "T>C", "A>G", $format_A10, $table_left, $table_middleHeader, $table_right); writeRatioSB($tg_NonTr, $tg_Tr, $ws, 35, $refH_file, $k_file, "T>G", "A>C", $table_bottom, $table_bottomleft, $table_middleHeader2, $table_bottomRight); ### Write a warning message when NonTr+Tr < 10 my $format_italic_red = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red'); if( (($ca_NonTr+$ca_Tr)< 10) || (($cg_NonTr+$cg_Tr)< 10) || (($ct_NonTr+$ct_Tr)< 10) || (($ta_NonTr+$ta_Tr)< 10) || (($tc_NonTr+$tc_Tr)< 10) || (($tg_NonTr+$tg_Tr)< 10) ) { $ws->write(36, 11, "Warning message: chi-squared approximation may be incorrect because the number of SBS", $format_italic_red); $ws->write(37, 11, "on Non-transcribed and transcribed strand is lower than 10", $format_italic_red); } } # Write values in Table 3 (Sub function of writeChi2result) sub writeRatioSB { my ($count_NonTr, $count_Tr, $ws, $row, $refH_file, $k_file, $mut1, $mut2, $formatText, $formatTextMut1, $formatTextRatio, $formatTable) = @_; my ($ratio_mut1, $ratio_mut2, $percent_NonTr, $percent_Tr) = (0, 0, 0, 0, 0); if( ($count_NonTr==0) || ($count_Tr==0) ) { $ratio_mut1 = 0; $ratio_mut2 = 0; $percent_NonTr = 0; $percent_Tr = 0; } else { $ratio_mut1 = $count_NonTr/$count_Tr; $ratio_mut1 = sprintf("%.2f", $ratio_mut1); $ratio_mut2 = $count_Tr/$count_NonTr; $ratio_mut2 = sprintf("%.2f", $ratio_mut2); $percent_NonTr = ($count_NonTr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; $percent_Tr = ($count_Tr/$refH_file->{$k_file}{'TotalSBSGenomic'})*100; } # C>N and T>N $ws->write($row, 11, $mut1, $formatTextMut1); $ws->write($row, 12, $ratio_mut1, $formatTextRatio); $ws->write($row, 13, $count_NonTr, $formatText); $ws->write($row, 14, $count_Tr, $formatText); # Write in italic and red (= warning message) when the count of NonTr + Tr is lower than 10 if( ($count_NonTr+$count_Tr) < 10 ) { if($mut1 eq "T>G") { if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row, 15, ""); } elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottom); } else { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottomItalicRed); } } else { if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row, 15, ""); } elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText); } else { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $format_A10ItalicRed); } } } else { $ws->write_string($row, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText); } $ws->write($row, 16, $h_chi2{$k_file}{$mut1}{'FDR'}, $formatText); $ws->write($row, 17, $h_chi2{$k_file}{$mut1}{'ConfInt'}, $formatTable); # G>N and A>N $ws->write($row+11, 11, $mut2, $formatTextMut1); $ws->write($row+11, 12, $ratio_mut2, $formatTextRatio); $ws->write($row+11, 13, $count_Tr, $formatText); $ws->write($row+11, 14, $count_NonTr, $formatText); if( ($count_NonTr+$count_Tr) < 10 ) { if($mut1 eq "T>G") { if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row+11, 15, ""); } elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottom); } else { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $table_bottomItalicRed); } } else { if(! exists $h_chi2{$k_file}{$mut1}{'p-value'}) { $ws->write_string($row+11, 15, ""); } elsif($h_chi2{$k_file}{$mut1}{'p-value'} eq "NA") { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText); } else { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $format_A10ItalicRed); } } } else { $ws->write_string($row+11, 15, $h_chi2{$k_file}{$mut1}{'p-value'}, $formatText); } $ws->write($row+11, 16, $h_chi2{$k_file}{$mut1}{'FDR'}, $formatText); $ws->write($row+11, 17, $h_chi2{$k_file}{$mut1}{'ConfInt'}, $formatTable); } ############################################################################################################ # SBS distribution by functional region (Table 4) & Strand bias by functional region (Table 5) & Overall count and percent of SBS (Table 1) sub writeStatbyFuncRegion { my ($refH_file, $sample, $ws, $rowStart_SBSdistrBySeg, $colStart_SBSdistrBySeg, $nb_func, $ref_RowSBSDistrBySegAndFuncCG, $mutDistrggplot2) = @_; my $row_SBSdistrBySeg = $rowStart_SBSdistrBySeg+4; my $row_SBSDistrBySegAndFunc_CA = $rowStart_SBSdistrBySeg + $nb_func + 12; my $rowEndCG_SBSDistrBySegAndFunc_CG = $$ref_RowSBSDistrBySegAndFuncCG + $nb_func; my $row_SBSDistrBySegAndFunc_CT = $rowStart_SBSdistrBySeg + ($nb_func*3) + 20; my $colTable4 = 0; my ($count_ca, $count_cg, $count_ct, $count_ta, $count_tc, $count_tg) = (0,0,0,0,0,0); ## 6 mutation types by segment foreach my $k_func (sort keys %{$refH_file->{$sample}{'6mutType'}}) { my $totalSBS_bySegment = 0; # Write the functional region for the section SBS distribution by segment (Table 4) $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg, $k_func, $formatT_left); # Write the exonic func for the section strand bias by segment (Table 5) $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $k_func, $formatT_left); # Write the last functional element in the table if($$ref_RowSBSDistrBySegAndFuncCG == $rowEndCG_SBSDistrBySegAndFunc_CG) { $ws->write($$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg, $k_func, $formatT_bottomLeft); } else { $ws->write($$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg, $k_func, $formatT_left); } foreach my $k_mutation (sort keys %{$refH_file->{$sample}{'6mutType'}{$k_func}}) { ### Write the count of SBS per mutation on genomic (Table 4) and coding strand (Table 5) if($k_mutation eq "C:G>A:T") { writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_ca); } if($k_mutation eq "C:G>G:C") { writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+4, $colStart_SBSdistrBySeg+5, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_cg); } if($k_mutation eq "C:G>T:A") { writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+8, $colStart_SBSdistrBySeg+7, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_ct); } if($k_mutation eq "T:A>A:T") { writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_ta); } if($k_mutation eq "T:A>C:G") { writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg+4, $colStart_SBSdistrBySeg+11, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_tc); } if($k_mutation eq "T:A>G:C") { writeCountSBS($refH_file, $sample, $k_func, $k_mutation, $ws, $$ref_RowSBSDistrBySegAndFuncCG, $colStart_SBSdistrBySeg+8, $colStart_SBSdistrBySeg+13, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, \$count_tg); } # Calculate the total number of SBS on the genomic strand for each mutation types by exonic region $totalSBS_bySegment += $refH_file->{$sample}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } # End $k_mutation $row_SBSDistrBySegAndFunc_CA++; $$ref_RowSBSDistrBySegAndFuncCG++; #$row_SBSDistrBySegAndFunc_CT++; # Write the percent by exonic region writePercentSBS($refH_file, $sample, $k_func, "C:G>A:T", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $ws, $totalSBS_bySegment); writePercentSBS($refH_file, $sample, $k_func, "C:G>G:C", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $ws, $totalSBS_bySegment); writePercentSBS($refH_file, $sample, $k_func, "C:G>T:A", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $ws, $totalSBS_bySegment); writePercentSBS($refH_file, $sample, $k_func, "T:A>A:T", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $ws, $totalSBS_bySegment); writePercentSBS($refH_file, $sample, $k_func, "T:A>C:G", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $ws, $totalSBS_bySegment); writePercentSBS($refH_file, $sample, $k_func, "T:A>G:C", $row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $ws, $totalSBS_bySegment); # Write the count of SBS by segment $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $totalSBS_bySegment, $format_A10); $row_SBSdistrBySeg++; } # End $k_func # Write the total number of SBS on the genomic strand (Table 4) $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+1, $refH_file->{$sample}{'TotalSBSGenomic'}, $formatT_bottomHeader); ##### Calculate the total percentages by mutation type my $percent_ca = ($count_ca / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_ca = sprintf("%.2f", $percent_ca); my $percent_cg = ($count_cg / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_cg = sprintf("%.2f", $percent_cg); my $percent_ct = ($count_ct / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_ct = sprintf("%.2f", $percent_ct); my $percent_ta = ($count_ta / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_ta = sprintf("%.2f", $percent_ta); my $percent_tc = ($count_tc / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_tc = sprintf("%.2f", $percent_tc); my $percent_tg = ($count_tg / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent_tg = sprintf("%.2f", $percent_tg); # Write the total percentage (Table 4) $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+2, $percent_ca, $formatT_bottom); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+3, $count_ca, $formatT_bottomHeader); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+4, $percent_cg, $formatT_bottom); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+5, $count_cg, $formatT_bottomHeader); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+6, $percent_ct, $formatT_bottom); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+7, $count_ct, $formatT_bottomHeader); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+8, $percent_ta, $formatT_bottom); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+9, $count_ta, $formatT_bottomHeader); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+10, $percent_tc, $formatT_bottom); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+11, $count_tc, $formatT_bottomHeader); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+12, $percent_tg, $formatT_bottom); $ws->write($row_SBSdistrBySeg, $colStart_SBSdistrBySeg+13, $count_tg, $formatT_bottomRightHeader); # Overall distribution of SBS (Table 1) $ws->write(0, 0, "Graph 1. SBS distribution", $formatT_graphTitle); $ws->set_row(0, 18); $ws->write(29, 0, "Table 1. Frequency and counts of all SBS", $format_A10Boldleft); $ws->write(30, 0, "Mutation type", $table_topleft); $ws->write(30, 1, "Percentage", $table_top); $ws->write(30, 2, "Count", $table_topRight); $ws->write(31, 0, "C:G>A:T", $table_left); $ws->write(31, 1, $percent_ca, $format_A10); $ws->write(31, 2, $count_ca, $table_right); $ws->write(32, 0, "C:G>G:C", $table_left); $ws->write(32, 1, $percent_cg, $format_A10); $ws->write(32, 2, $count_cg, $table_right); $ws->write(33, 0, "C:G>T:A", $table_left); $ws->write(33, 1, $percent_ct, $format_A10); $ws->write(33, 2, $count_ct, $table_right); $ws->write(34, 0, "T:A>A:T", $table_left); $ws->write(34, 1, $percent_ta, $format_A10); $ws->write(34, 2, $count_ta, $table_right); $ws->write(35, 0, "T:A>C:G", $table_left); $ws->write(35, 1, $percent_tc, $format_A10); $ws->write(35, 2, $count_tc, $table_right); $ws->write(36, 0, "T:A>G:C", $table_left); $ws->write(36, 1, $percent_tg, $format_A10); $ws->write(36, 2, $count_tg, $table_right); $ws->write(37, 0, "", $table_bottomleft); $ws->write(37, 1, "", $table_bottom); $ws->write(37, 2, $refH_file->{$sample}{'TotalSBSGenomic'}, $table_bottomrightHeader); # Create an input for ggplot2 for representing the distribution of SBS for each mutation types (Figure 1) open(DISTRSBS, ">", $mutDistrggplot2) or die "$!: $mutDistrggplot2\n"; print DISTRSBS "Mutation_Type\tCount\tPercentage\tSample\n"; print DISTRSBS "C:G>A:T\t$count_ca\t$percent_ca\t$sample\n"; print DISTRSBS "C:G>G:C\t$count_cg\t$percent_cg\t$sample\n"; print DISTRSBS "C:G>T:A\t$count_ct\t$percent_ct\t$sample\n"; print DISTRSBS "T:A>A:T\t$count_ta\t$percent_ta\t$sample\n"; print DISTRSBS "T:A>C:G\t$count_tc\t$percent_tc\t$sample\n"; print DISTRSBS "T:A>G:C\t$count_tg\t$percent_tg\t$sample\n"; close DISTRSBS; } # Write the percentage in table 4 of the Excel report (Sub function of writeStatbyFuncRegion) sub writePercentSBS { my ($refH_file, $k_file, $k_func, $mutation, $row, $col, $ws, $totalSBS) = @_; my $percent = 0; if($refH_file->{$k_file}{'6mutType'}{$k_func}{$mutation}{'TotalMutG'} == 0) { $percent = 0; } else { $percent = ($refH_file->{$k_file}{'6mutType'}{$k_func}{$mutation}{'TotalMutG'} / $totalSBS ) * 100; $percent = sprintf("%.2f", $percent); } $ws->write($row, $col, $percent, $format_A10); } # Write the count in table 4 and table 5 of the Excel report (Sub function of writeStatbyFuncRegion) sub writeCountSBS { my ($refH_file, $k_file, $k_func, $k_mutation, $ws, $row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg, $colTable4, $row_SBSdistrBySeg, $rowEndCG_SBSDistrBySegAndFunc_CG, $refCount) = @_; my $ratioSB = 0; if( ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} == 0) || ($refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'} == 0) ) { $ratioSB = 0; } else { $ratioSB = $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'} / $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}; } $ratioSB = sprintf("%.2f", $ratioSB); if($row_SBSDistrBySegAndFunc_CA == $rowEndCG_SBSDistrBySegAndFunc_CG) { # Write the ratio of NonTr / Tr (Table 5) $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+1, $ratioSB, $formatT_bottom); # Write the count of SBS in the NonTr and Tr strand $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $formatT_bottom); $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottom); if( ($k_mutation eq "C:G>T:A") || ($k_mutation eq "T:A>G:C") ) { $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_bottomRight); } } else { # Write the ratio of NonTr / Tr (Table 5) $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+1, $ratioSB, $format_A10); # Write the count of SBS in the NonTr and Tr strand $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+2, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'NonTr'}, $format_A10); $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $format_A10); if( ($k_mutation eq "C:G>T:A") || ($k_mutation eq "T:A>G:C") ) { $ws->write($row_SBSDistrBySegAndFunc_CA, $colStart_SBSdistrBySeg+3, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'Tr'}, $formatT_right); } } if($k_mutation eq "C:G>A:T") { # Calculate the total number of SBS per mut type (genomic strand) $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } elsif($k_mutation eq "C:G>G:C") { $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } elsif($k_mutation eq "C:G>T:A") { $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } elsif($k_mutation eq "T:A>A:T") { $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } elsif($k_mutation eq "T:A>C:G") { $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } elsif($k_mutation eq "T:A>G:C") { $$refCount += $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}; } # Write the count by exonic region (Table 4) $ws->write($row_SBSdistrBySeg, $colTable4, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $format_A10); if($k_mutation eq "T:A>G:C") { $ws->write($row_SBSdistrBySeg, $colTable4, $refH_file->{$k_file}{'6mutType'}{$k_func}{$k_mutation}{'TotalMutG'}, $formatT_right); } } ############################################################################################################ # SBS distribution by chromosomes (Table 6) sub writeDistrByChr { my ($ws, $refH_file, $sample, $row, $col, $distrByChrggplot2) = @_; # For the HTML report open(SBSPerChr, ">", $distrByChrggplot2) or die "$!: $distrByChrggplot2\n"; print SBSPerChr "\tPearson\t$refH_file->{$sample}{'SBSPerChr'}{'AllMutType'}\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'},"\n"; print SBSPerChr "Chr\tSize\tAll SBS\tC:G>A:T\tC:G>G:C\tC:G>T:A\tT:A>A:T\tT:A>C:G\tT:A>G:C\n"; my $row_SBSPerChr = $row + 8; # Write the Pearson coefficient $ws->write($row+6, $col+3, $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'Pearson'}, $format_A10); $ws->write($row+6, $col+4, $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'Pearson'}, $format_A10); $ws->write($row+6, $col+5, $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'Pearson'}, $format_A10); $ws->write($row+6, $col+6, $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'Pearson'}, $format_A10); $ws->write($row+6, $col+7, $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'Pearson'}, $format_A10); $ws->write($row+6, $col+8, $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'Pearson'}, $formatT_right); # Write the chromosome number and their sizes / Write count SBS per chromosomes my $line=0; foreach my $chromosome (sort keys %chromosomes) { $ws->write($row_SBSPerChr+($line), $col, $chromosome, $formatT_left); $ws->write($row_SBSPerChr+($line), $col+1, $chromosomes{$chromosome}, $format_A10); $ws->write($row_SBSPerChr+($line), $col+2, $refH_file->{$sample}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'}, $format_A10); # Write the count per mutation $ws->write($row_SBSPerChr+($line), $col+3, $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'CHR'}{$chromosome}{'chr'}, $format_A10); $ws->write($row_SBSPerChr+($line), $col+4, $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'CHR'}{$chromosome}{'chr'}, $format_A10); $ws->write($row_SBSPerChr+($line), $col+5, $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'CHR'}{$chromosome}{'chr'}, $format_A10); $ws->write($row_SBSPerChr+($line), $col+6, $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'CHR'}{$chromosome}{'chr'}, $format_A10); $ws->write($row_SBSPerChr+($line), $col+7, $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'CHR'}{$chromosome}{'chr'}, $format_A10); $ws->write($row_SBSPerChr+($line), $col+8, $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'CHR'}{$chromosome}{'chr'}, $formatT_right); # For the HTML report print SBSPerChr "$chromosome\t", $chromosomes{$chromosome},"\t", $refH_file->{$sample}{'SBSPerChr'}{'TotalPerChr'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>A:T"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>G:C"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"C:G>T:A"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>A:T"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>C:G"}{'CHR'}{$chromosome}{'chr'},"\t", $refH_file->{$sample}{'SBSPerChr'}{"T:A>G:C"}{'CHR'}{$chromosome}{'chr'},"\n"; $line++; } # Write the Pearson coefficient for the total number of SBS $ws->write($row+6, $col+2, $refH_file->{$sample}{'SBSPerChr'}{'AllMutType'}, $format_A10); $ws->write($row_SBSPerChr+(keys %chromosomes), $col+2, $refH_file->{$sample}{'TotalSBSGenomic'}, $formatT_bottomHeader); print SBSPerChr "\t\t$refH_file->{$sample}{'TotalSBSGenomic'}\n"; close SBSPerChr; } ############################################################################################################ # Trinucleotide sequence context on genomic strand (Panel 1) sub writeTriNtGenomic { my ($ws, $refH_file, $sample, $col, $heatmapCountggplot2, $heatmapPercentggplot2, $triNtBarChartggplot2, $ref_c_ca6_g, $ref_c_cg6_g, $ref_c_ct6_g, $ref_c_ta6_g, $ref_c_tc6_g, $ref_c_tg6_g) = @_; # Initialise the row of the panel 1 my $row_SeqContext6 = 4; # Percent total of mutations for 6 mutation types on genomic strand my ($p_ca6_g, $p_cg6_g, $p_ct6_g, $p_ta6_g, $p_tc6_g, $p_tg6_g) = (0,0,0, 0,0,0); my $maxValue = 0; # For the heatmap # For checking if the total number of SBS is correct my $total_SBS_genomic = 0; open(HEATMAPCGENOMIC, ">", $heatmapCountggplot2) or die "$!: $heatmapCountggplot2\n"; print HEATMAPCGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n"; open(HEATMAPPGENOMIC, ">", $heatmapPercentggplot2) or die "$!: $heatmapPercentggplot2\n"; print HEATMAPPGENOMIC "\tC>A\tC>G\tC>T\tT>A\tT>C\tT>G\n"; ## Bar plot NMF like open(BARPLOTNMFLIKE, ">", $triNtBarChartggplot2) or die "$!: $triNtBarChartggplot2\n"; print BARPLOTNMFLIKE "alteration\tcontext\tvalue\n"; foreach my $k_context (sort keys %{$refH_file->{$sample}{'SeqContextG'}}) { if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; } # Write the context: 6 mut type on genomic strand $ws->write($row_SeqContext6 , $col+3, $k_context, $format_A10); $ws->write($row_SeqContext6 , $col+13, $k_context, $format_A10); # Count for the heatmap print HEATMAPCGENOMIC $k_context."\t"; print HEATMAPPGENOMIC $k_context."\t"; foreach my $k_mutation (sort keys %{$refH_file->{$sample}{'SeqContextG'}{$k_context}}) { # For checking the total number of SBS $total_SBS_genomic += $refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation}; # Calculate the percentages my $percent = 0; if($refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation} == 0) { $percent = 0; } else { $percent = ($refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation} / $refH_file->{$sample}{'TotalSBSGenomic'}) * 100; $percent = sprintf("%.2f", $percent); } # For representing the sequence context with a bar plot (NMF like style) print BARPLOTNMFLIKE $k_mutation,"\t", $k_context,"\t", $percent,"\n"; # Write the count for the heatmap print HEATMAPCGENOMIC $refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation}."\t"; print HEATMAPPGENOMIC "$percent\t"; # For NMF input my $count = $refH_file->{$sample}{'SeqContextG'}{$k_context}{$k_mutation}; if($sample ne "Pool_Data") { push(@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}, $count); } if($sample ne "Pool_Data") { push(@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}, $percent); } if($k_mutation eq "C>A") { triNtByMut($ws, $row_SeqContext6, $col+4, $col+14, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_ca6_g, \$p_ca6_g); } elsif($k_mutation eq "C>G") { triNtByMut($ws, $row_SeqContext6, $col+5, $col+15, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_cg6_g, \$p_cg6_g); } elsif($k_mutation eq "C>T") { triNtByMut($ws, $row_SeqContext6, $col+6, $col+16, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_ct6_g, \$p_ct6_g); } elsif($k_mutation eq "T>A") { triNtByMut($ws, $row_SeqContext6, $col+7, $col+17, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_ta6_g, \$p_ta6_g); } elsif($k_mutation eq "T>C") { triNtByMut($ws, $row_SeqContext6, $col+8, $col+18, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_tc6_g, \$p_tc6_g); } elsif($k_mutation eq "T>G") { triNtByMut($ws, $row_SeqContext6, $col+9, $col+19, $refH_file, $sample, $k_context, $k_mutation, $percent, $maxValue, $ref_c_tg6_g, \$p_tg6_g); } else { print STDERR "Error: Mutation type not considered for: $k_mutation\n"; exit; } } $row_SeqContext6++; print HEATMAPCGENOMIC "\n"; print HEATMAPPGENOMIC "\n"; } close HEATMAPCGENOMIC; close HEATMAPPGENOMIC; close BARPLOTNMFLIKE; # Write the total number of SBS per mutation type: COUNT $ws->write($row_SeqContext6, $col+4, $$ref_c_ca6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+5, $$ref_c_cg6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+6, $$ref_c_ct6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+7, $$ref_c_ta6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+8, $$ref_c_tc6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+9, $$ref_c_tg6_g, $formatT_bottomHeader2); if($total_SBS_genomic != $refH_file->{$sample}{'TotalSBSGenomic'}) { print STDERR "Error in the calculation of the total number of SBS on the genomic strand!!!!\n"; print STDERR "From hash table $refH_file->{$sample}{'TotalSBSGenomic'}\tVS\t$total_SBS_genomic\n"; exit; } # Write the total number of SBS per mutation type: PERCENT $ws->write($row_SeqContext6, $col+14, $p_ca6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+15, $p_cg6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+16, $p_ct6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+17, $p_ta6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+18, $p_tc6_g, $formatT_bottomHeader2); $ws->write($row_SeqContext6, $col+19, $p_tg6_g, $formatT_bottomHeader2); my $totalPercent_genomic = $p_ca6_g + $p_cg6_g + $p_ct6_g + $p_ta6_g + $p_tc6_g + $p_tg6_g; $totalPercent_genomic = sprintf("%.0f", $totalPercent_genomic); if($totalPercent_genomic != 100) { print STDERR "Error in the calculation of the total percentages on the genomic strand!!!\n"; print STDERR "The total is equal to=\t$totalPercent_genomic\n"; exit; } } # Trinucleotide count and percentage by mutation type (Sub function of writeTriNtGenomic) sub triNtByMut { my ($ws, $row, $colC, $colP, $refH_file, $sample, $context, $mutation, $percent, $maxValue, $refCountG, $refPercentG) = @_; ### COUNT $ws->write($row, $colC, $refH_file->{$sample}{'SeqContextG'}{$context}{$mutation}, $format_A10); ### PERCENTAGE $ws->write($row, $colP, $percent, $format_A10); # For the heatmap if($percent >= $maxValue) { $maxValue = $percent; } # For the total amount per mutation types $$refCountG += $refH_file->{$sample}{'SeqContextG'}{$context}{$mutation}; $$refPercentG += $percent; } ############################################################################################################ # Trinucleotide sequence context on coding strand (Panel 2) sub writeTriNtCoding { my ($ws, $row, $col, $refH_file, $sample, $triNtBarChartCodingCountggplot2, $triNtBarChartCodingPercentggplot2) = @_; # Initialise the row my $row_SeqContext12 = $row+6; my $row_SeqContext12Percent = $row+27; # Total count and percent calculated for the strand bias my ($ca_NonTr, $ca_Tr, $cg_NonTr, $cg_Tr, $ct_NonTr, $ct_Tr, $ta_NonTr, $ta_Tr, $tc_NonTr, $tc_Tr, $tg_NonTr, $tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0); my ($percent_ca_NonTr, $percent_ca_Tr, $percent_cg_NonTr, $percent_cg_Tr, $percent_ct_NonTr, $percent_ct_Tr, $percent_ta_NonTr, $percent_ta_Tr, $percent_tc_NonTr, $percent_tc_Tr, $percent_tg_NonTr, $percent_tg_Tr) = (0,0,0, 0,0,0, 0,0,0, 0,0,0); # For checking if the total number of SBS is correct my $total_SBS_coding = 0; open(COUNT, ">", $triNtBarChartCodingCountggplot2) or die "$!: $triNtBarChartCodingCountggplot2\n"; print COUNT "MutationTypeContext\tStrand\tValue\tSample\n"; open(PERCENT, ">", $triNtBarChartCodingPercentggplot2) or die "$!: $triNtBarChartCodingPercentggplot2\n"; print PERCENT "MutationTypeContext\tStrand\tValue\tSample\n"; foreach my $k_context (sort keys %{$refH_file->{$sample}{'SeqContextC'}}) { if( ($k_context =~ /N/) || (length($k_context) != 3) ) { next; } # Write the context: 12 mut type on coding strand $ws->write($row_SeqContext12 , $col, $k_context, $formatT_left); $ws->write($row_SeqContext12Percent , $col, $k_context, $formatT_left); foreach my $k_mutation (sort keys %{$refH_file->{$sample}{'SeqContextC'}{$k_context}}) { # Percent: 12 mut type on coding strand my ($percent_NonTr, $percent_Tr) = (0, 0); if($refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} != 0) { $percent_NonTr = ( $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} / $refH_file->{$sample}{'TotalSBSCoding'} ) * 100; } if($refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} != 0) { $percent_Tr = ( $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'} / $refH_file->{$sample}{'TotalSBSCoding'} ) * 100; } # Counts print COUNT "$k_mutation:$k_context\tNonTranscribed\t$refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'}\t$sample\n"; print COUNT "$k_mutation:$k_context\tTranscribed\t$refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}\t$sample\n"; # Percentages $percent_NonTr = sprintf("%.2f", $percent_NonTr); $percent_Tr = sprintf("%.2f", $percent_Tr); print PERCENT "$k_mutation:$k_context\tNonTranscribed\t$percent_NonTr\t$sample\n"; print PERCENT "$k_mutation:$k_context\tTranscribed\t$percent_Tr\t$sample\n"; # Calculate the total number for each mutation types if($k_mutation eq "C>A") { triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+1, $row_SeqContext12Percent, \$ca_NonTr, \$ca_Tr, $percent_NonTr, $percent_Tr); $percent_ca_NonTr += $percent_NonTr; $percent_ca_Tr += $percent_Tr; } if($k_mutation eq "C>G") { triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+3, $row_SeqContext12Percent, \$cg_NonTr, \$cg_Tr, $percent_NonTr, $percent_Tr); $percent_cg_NonTr += $percent_NonTr; $percent_cg_Tr += $percent_Tr; } if($k_mutation eq "C>T") { triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+5, $row_SeqContext12Percent, \$ct_NonTr, \$ct_Tr, $percent_NonTr, $percent_Tr); $percent_ct_NonTr += $percent_NonTr; $percent_ct_Tr += $percent_Tr; } if($k_mutation eq "T>A") { triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+7, $row_SeqContext12Percent, \$ta_NonTr, \$ta_Tr, $percent_NonTr, $percent_Tr); $percent_ta_NonTr += $percent_NonTr; $percent_ta_Tr += $percent_Tr; } if($k_mutation eq "T>C") { triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+9, $row_SeqContext12Percent, \$tc_NonTr, \$tc_Tr, $percent_NonTr, $percent_Tr); $percent_tc_NonTr += $percent_NonTr; $percent_tc_Tr += $percent_Tr; } if($k_mutation eq "T>G") { triNtByMutCoding($refH_file, $sample, $k_context, $k_mutation, $ws, $row_SeqContext12, $col+11, $row_SeqContext12Percent, \$tg_NonTr, \$tg_Tr, $percent_NonTr, $percent_Tr); $percent_tg_NonTr += $percent_NonTr; $percent_tg_Tr += $percent_Tr; } # For checking if the total number of SBS is correct $total_SBS_coding += $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'NonTr'} + $refH_file->{$sample}{'SeqContextC'}{$k_context}{$k_mutation}{'Tr'}; } $row_SeqContext12++; $row_SeqContext12Percent++; } close COUNT; close PERCENT; ## Write the total of each mutation types: 12 mut type on coding strand $ws->write($row_SeqContext12, $col+1, $ca_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+2, $ca_Tr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+3, $cg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+4, $cg_Tr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+5, $ct_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+6, $ct_Tr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+7, $ta_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+8, $ta_Tr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+9, $tc_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+10, $tc_Tr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+11, $tg_NonTr, $formatT_bottomHeader2); $ws->write($row_SeqContext12, $col+12, $tg_Tr, $formatT_bottomHeader2); # Write the total percentages of each mutation types: 12 mut type on coding strand $ws->write($row_SeqContext12Percent, $col+1, $percent_ca_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+2, $percent_ca_Tr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+3, $percent_cg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+4, $percent_cg_Tr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+5, $percent_ct_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+6, $percent_ct_Tr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+7, $percent_ta_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+8, $percent_ta_Tr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+9, $percent_tc_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+10, $percent_tc_Tr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+11, $percent_tg_NonTr, $formatT_bottomHeader); $ws->write($row_SeqContext12Percent, $col+12, $percent_tg_Tr, $formatT_bottomHeader); if($total_SBS_coding == $refH_file->{$sample}{'TotalSBSCoding'}) { $ws->write($row_SeqContext12, $col+13, $refH_file->{$sample}{'TotalSBSCoding'}, $formatT_bottomHeader2) } else { print STDERR "Error: in the calculation of the total number of SBS on the coding strand!!!!\n"; print STDERR "From hash table $refH_file->{$sample}{'TotalSBSCoding'}\tVS\t$total_SBS_coding\n"; exit; } my $totalP_SBS_coding = $percent_ca_NonTr + $percent_ca_Tr + $percent_cg_NonTr + $percent_cg_Tr + $percent_ct_NonTr + $percent_ct_Tr + $percent_ta_NonTr + $percent_ta_Tr + $percent_tc_NonTr + $percent_tc_Tr + $percent_tg_NonTr + $percent_tg_Tr; $totalP_SBS_coding = sprintf("%.0f", $totalP_SBS_coding); if($totalP_SBS_coding != 100) { print STDERR "Error: The percentages for the trinucleotide sequence context on the coding strand for 12 mutation types is not equal to 100!!!\n$totalP_SBS_coding\n"; exit; } } # Trinucleotide count and percentage by mutation type on Coding strand (Sub function of writeTriNtCoding) sub triNtByMutCoding { my ($refH_file, $sample, $context, $mutation, $ws, $row, $col, $rowP, $refNonTr, $refTr, $percent_NonTr, $percent_Tr) = @_; $$refNonTr += $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'NonTr'}; $$refTr += $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'Tr'}; # COUNT : 12 mutation type (stranded bar graph) $ws->write($row, $col, $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'NonTr'}, $format_A10); $ws->write($row, $col+1, $refH_file->{$sample}{'SeqContextC'}{$context}{$mutation}{'Tr'}, $format_A10); ## PERCENT : 12 mutation type (stranded bar graph) $ws->write($rowP, $col, $percent_NonTr, $format_A10); $ws->write($rowP, $col+1, $percent_Tr, $format_A10); } ############################################################################################################ # Create and write the figures on the Excel report sub createWriteFigs { my ($ws, $row, $col, $folderFigure, $sample, $c_ca6_g, $c_cg6_g, $c_ct6_g, $c_ta6_g, $c_tc6_g, $c_tg6_g) = @_; ######## Create figures # Bar char for SBS distribution (Figure 1) # Pie char for Impact on protein sequence (Figure 2) # Stranded distribution of SBS (Figure 3) # Heatmaps for trinucleotide context `Rscript $pathRScriptFigs --folderFigure $folderFigure --folderTemp $folder_temp --filename $sample`; # Bar chart for trinucleotide context on coding strand `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignatureCount.txt $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignatureCount $folder_temp/$sample-StrandedSignatureCount Count`; `Rscript $pathRScriptTxnSB $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignaturePercent.txt $folderFigure/Stranded_Analysis/$sample/$sample-StrandedSignaturePercent $folder_temp/$sample-StrandedSignaturePercent Percent`; # Bar plot for representing the sequence context (NMF like style) `Rscript $pathRScriptMutSpectrum $folderFigure/Trinucleotide_Sequence_Context/$sample/$sample-MutationSpectraPercent-Genomic.txt $sample $folderFigure/Trinucleotide_Sequence_Context/$sample $folder_temp $c_ca6_g $c_cg6_g $c_ct6_g $c_ta6_g $c_tc6_g $c_tg6_g`; ######## Write the figures in the Excel report # Bar char for SBS distribution (Figure 1) $ws->insert_image(1, 0, "$folder_temp/$sample-SBS_distribution-Report.png", 0, 0, .2, .2); # Impact of the SBS on the protein (Figure 2) $ws->write(0, 6, "Graph 2. Impact on protein sequence", $formatT_graphTitle); $ws->insert_image(1, 6, "$folder_temp/$sample-DistributionExoFunc-Report.png", 0, 0, .2, .2); # Stranded distribution of SBS (Figure 3) $ws->write(0, 11, "Graph 3. Stranded distribution of SBS", $formatT_graphTitle); $ws->insert_image(1, 11, "$folder_temp/$sample-StrandBias-Report.png", 0, 0, .2, .2); ## Trinucleotide context on coding strand (Scale the inserted image: width x 0.7, height x 0.8) $ws->insert_image($row+3, $col+15, "$folder_temp/$sample-StrandedSignatureCount-Report.png", 0, 0, .16, .16); $ws->insert_image($row+24, $col+15, "$folder_temp/$sample-StrandedSignaturePercent-Report.png", 0, 0, .16, .16); # Heatamp for the sequence context on the genomic strand (6 mutation types) $ws->insert_image(4, $col, "$folder_temp/$sample-HeatmapCount-Genomic-Report.png"); $ws->insert_image(4, $col+10, "$folder_temp/$sample-HeatmapPercent-Genomic-Report.png"); # Bar plot for the sequence context on the genomic strand (6 mutation types) $ws->insert_image(27, $col+3, "$folder_temp/$sample-MutationSpectraPercent-Genomic-Report.png"); } ############################################################################################################ # Write NMF input for count and percentages in the Excel report sub writeInputNMF { my ($ws_inputNMF_count, $ws_inputNMF_percent, $outCount, $outPercent) = @_; open(OUTINPUTNMFC, ">", $outCount) or die "$!: $outCount\n"; # with the count open(OUTINPUTNMFP, ">", $outPercent) or die "$!: $outPercent\n"; # With the frequency un-normalized foreach my $k_sample (@{$h_inputNMF{'Sample'}}) { print OUTINPUTNMFC "\t$k_sample"; print OUTINPUTNMFP "\t$k_sample"; } print OUTINPUTNMFC "\n"; print OUTINPUTNMFP "\n"; my $row_inputNMF = 1; foreach my $k_context (sort keys %{$h_inputNMF{'Count'}}) { $k_context =~ /(\w)_(\w)/; my ($base5, $base3) = ($1, $2); foreach my $k_mutation (sort keys %{$h_inputNMF{'Count'}{$k_context}}) { my ($col_inputNMF_Count, $col_inputNMF_Percent) = (1, 1); my $contextNMF = $base5."[$k_mutation]".$base3; # Write the input in the Excel report, only when all the samples are in the same workbook if($oneReportPerSample == 2) { $ws_inputNMF_count->write($row_inputNMF, 0, $contextNMF); $ws_inputNMF_percent->write($row_inputNMF, 0, $contextNMF); } print OUTINPUTNMFC $contextNMF,"\t"; print OUTINPUTNMFP $contextNMF,"\t"; foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFC "$_\t"; } print OUTINPUTNMFC "\n"; foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { print OUTINPUTNMFP "$_\t"; } print OUTINPUTNMFP "\n"; foreach (@{$h_inputNMF{'Count'}{$k_context}{$k_mutation}}) { if($oneReportPerSample == 2) { $ws_inputNMF_count->write($row_inputNMF, $col_inputNMF_Count, $_); } $col_inputNMF_Count++; } foreach (@{$h_inputNMF{'Percent'}{$k_context}{$k_mutation}}) { if($oneReportPerSample == 2) { $ws_inputNMF_percent->write($row_inputNMF, $col_inputNMF_Percent, $_); } $col_inputNMF_Percent++; } $row_inputNMF++; } } close OUTINPUTNMFP; close OUTINPUTNMFC; } ###################################################################################################################################################### # Define format and background colors for the Excel report # ###################################################################################################################################################### # Font: Arial size 10 sub Format_A10 { my ($wb, $format) = @_; $$format = $wb->add_format(font=>'Arial', size=>10); $$format->set_align('center'); } # Font: Arial size 11 bold and center sub Format_A11Bold { my ($wb, $format) = @_; $$format = $wb->add_format(font=>'Arial', size=>11, bold=>1); $$format->set_align('center'); } # Font: Arial size 10 italic red and center sub Format_A10ItalicRed { my ($wb, $format) = @_; $$format = $wb->add_format(font=>'Arial', size=>10, italic=>1, color => 'red'); $$format->set_align('center'); } # Format: Arialt size 11 bold and left sub Format_A11BoldLeft { my ($wb, $format) = @_; $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>11, bold=>1); } # Font: Arialt size 10 bold and left sub Format_A10BoldLeft { my ($wb, $format) = @_; $$format = $wb->add_format(valign =>'left', font=>'Arial', size=>10, bold=>1); } # Define the format of the border of the section (for delimiting the different section of the report) sub Format_section { my ($wb, $format_topLeft, $format_topRight, $format_bottomLeft, $format_bottomRight, $format_top, $format_right, $format_bottom, $format_left) = @_; $$format_topLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); $$format_topLeft->set_top(2); $$format_topLeft->set_top_color('blue'); $$format_topLeft->set_left(2); $$format_topLeft->set_left_color('blue'); $$format_topRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); $$format_topRight->set_top(2); $$format_topRight->set_top_color('blue'); $$format_topRight->set_right(2); $$format_topRight->set_right_color('blue'); $$format_bottomLeft = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); $$format_bottomLeft->set_bottom(2); $$format_bottomLeft->set_bottom_color('blue'); $$format_bottomLeft->set_left(2); $$format_bottomLeft->set_left_color('blue'); $$format_bottomRight = $wb->add_format(valign => 'left', bold => 1, font => 'Arial', size => 12); $$format_bottomRight->set_bottom(2); $$format_bottomRight->set_bottom_color('blue'); $$format_bottomRight->set_right(2); $$format_bottomRight->set_right_color('blue'); $$format_top = $wb->add_format(); $$format_top->set_top(2); $$format_top->set_top_color('blue'); $$format_right = $wb->add_format(); $$format_right->set_right(2); $$format_right->set_right_color('blue'); $$format_bottom = $wb->add_format(); $$format_bottom->set_bottom(2); $$format_bottom->set_bottom_color('blue'); $$format_left = $wb->add_format(); $$format_left->set_left(2); $$format_left->set_left_color('blue'); } # Define the header sub Format_Header { my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG, $format_TG2, $format_LeftHeader, $format_RightHeader, $format_LeftHeader2) = @_; my ($blue, $black, $red, $gray, $green, $pink); Color($wb, \$blue, \$black, \$red, \$gray, \$green, \$pink); my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink); BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink); $$format_CA = $wb->add_format(bg_color => $blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center'); $$format_CA->set_center_across(); $$format_CG = $wb->add_format(bg_color => $black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center'); $$format_CG->set_center_across(); $$format_CT = $wb->add_format(bg_color => $red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center'); $$format_CT->set_center_across(); $$format_TA = $wb->add_format(bg_color => $gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center'); $$format_TA->set_center_across(); $$format_TC = $wb->add_format(bg_color => $green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center'); $$format_TC->set_center_across(); $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center'); $$format_TG->set_center_across(); $$format_TG->set_right(2); $$format_TG->set_right_color('blue'); $$format_TG2 = $wb->add_format(bg_color => $pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG2->set_align('center'); $$format_TG2->set_center_across(); $$format_LeftHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader->set_align('center'); $$format_LeftHeader->set_left(2); $$format_LeftHeader->set_left_color('blue'); $$format_LeftHeader2 = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_LeftHeader2->set_left(2); $$format_LeftHeader2->set_left_color('blue'); $$format_RightHeader = $wb->add_format(bold=>1, font=>'Arial', size=>11); $$format_RightHeader->set_align('center'); $$format_RightHeader->set_right(2); $$format_RightHeader->set_right_color('blue'); } # Define the header for the part "Strand bias by segment" sub Format_HeaderSBSDistrBySegAndFunc { my ($wb, $format_LeftCA, $format_LeftCG, $format_LeftCT, $format_LeftTA, $format_LeftTC, $format_LeftTG, $format_RightCA, $format_RightCG, $format_RightCT, $format_RightTA, $format_RightTC, $format_RightTG) = @_; my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink); BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink); $$format_LeftCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCA->set_align('center'); $$format_LeftCA->set_left(2); $$format_LeftCA->set_left_color('blue'); $$format_LeftCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCG->set_align('center'); $$format_LeftCG->set_left(2); $$format_LeftCG->set_left_color('blue'); $$format_LeftCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftCT->set_align('center'); $$format_LeftCT->set_left(2); $$format_LeftCT->set_left_color('blue'); $$format_LeftTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTA->set_align('center'); $$format_LeftTA->set_left(2); $$format_LeftTA->set_left_color('blue'); $$format_LeftTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTC->set_align('center'); $$format_LeftTC->set_left(2); $$format_LeftTC->set_left_color('blue'); $$format_LeftTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_LeftTG->set_align('center'); $$format_LeftTG->set_left(2); $$format_LeftTG->set_left_color('blue'); $$format_RightCA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCA->set_align('center'); $$format_RightCA->set_right(2); $$format_RightCA->set_right_color('blue'); $$format_RightCG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCG->set_align('center'); $$format_RightCG->set_right(2); $$format_RightCG->set_right_color('blue'); $$format_RightCT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightCT->set_align('center'); $$format_RightCT->set_right(2); $$format_RightCT->set_right_color('blue'); $$format_RightTA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTA->set_align('center'); $$format_RightTA->set_right(2); $$format_RightTA->set_right_color('blue'); $$format_RightTC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTC->set_align('center'); $$format_RightTC->set_right(2); $$format_RightTC->set_right_color('blue'); $$format_RightTG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_RightTG->set_align('center'); $$format_RightTG->set_right(2); $$format_RightTG->set_right_color('blue'); } # Define the header for the part "Trinucleotide sequence context on the coding strand" sub Format_Header12MutType { my ($wb, $format_CA, $format_CG, $format_CT, $format_TA, $format_TC, $format_TG) = @_; my ($bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink); BackgroundColor($wb, \$bgColor_blue, \$bgColor_black, \$bgColor_red, \$bgColor_gray, \$bgColor_green, \$bgColor_pink); $$format_CA = $wb->add_format(bg_color=>$bgColor_blue, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CA->set_align('center'); $$format_CG = $wb->add_format(bg_color=>$bgColor_black, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CG->set_align('center'); $$format_CT = $wb->add_format(bg_color=>$bgColor_red, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_CT->set_align('center'); $$format_TA = $wb->add_format(bg_color=>$bgColor_gray, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TA->set_align('center'); $$format_TC = $wb->add_format(bg_color=>$bgColor_green, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TC->set_align('center'); $$format_TG = $wb->add_format(bg_color=>$bgColor_pink, font=>'Arial', bold=>1, size=>11, color=>'white'); $$format_TG->set_align('center'); } # Define the format for the text that needs a section border sub Format_TextSection { my ($wb, $formatT_left, $formatT_right, $formatT_bottomRight, $formatT_bottomLeft, $formatT_bottom, $formatT_bottomHeader, $formatT_bottomRightHeader, $formatT_bottomHeader2, $formatT_rightHeader) = @_; $$formatT_left = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$formatT_left->set_left(2); $$formatT_left->set_left_color('blue'); $$formatT_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$formatT_right->set_right(2); $$formatT_right->set_right_color('blue'); $$formatT_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$formatT_bottomRight->set_bottom(2); $$formatT_bottomRight->set_bottom_color('blue'); $$formatT_bottomRight->set_right(2); $$formatT_bottomRight->set_right_color('blue'); $$formatT_bottomLeft = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$formatT_bottomLeft->set_bottom(2); $$formatT_bottomLeft->set_bottom_color('blue'); $$formatT_bottomLeft->set_left(2); $$formatT_bottomLeft->set_left_color('blue'); $$formatT_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$formatT_bottom->set_bottom(2); $$formatT_bottom->set_bottom_color('blue'); my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230); $$formatT_bottomHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader->set_align('center'); $$formatT_bottomHeader->set_bottom(2); $$formatT_bottomHeader->set_bottom_color('blue'); $$formatT_bottomRightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomRightHeader->set_align('center'); $$formatT_bottomRightHeader->set_bottom(2); $$formatT_bottomRightHeader->set_bottom_color('blue'); $$formatT_bottomRightHeader->set_right(2); $$formatT_bottomRightHeader->set_right_color('blue'); $$formatT_bottomHeader2 = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_bottomHeader2->set_align('center'); $$formatT_rightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>11); $$formatT_rightHeader->set_align('center'); $$formatT_rightHeader->set_right(2); $$formatT_rightHeader->set_right_color('blue'); } # Define the format for the graphs titles sub Format_GraphTitle { my ($wb, $formatT_graphTitle) = @_; $$formatT_graphTitle = $wb->add_format(font=>'Arial', size=>12, bold=>1); } # Define the format of the border of the tables sub Format_Table { my ($wb, $table_topleft, $table_topRight, $table_bottomleft, $table_bottomRight, $table_top, $table_right, $table_bottom, $table_bottomItalicRed, $table_left, $table_bottomrightHeader, $table_left2, $table_middleHeader, $table_middleHeader2) = @_; $$table_topleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topleft->set_top(1); $$table_topleft->set_left(1); $$table_topRight = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_topRight->set_top(1); $$table_topRight->set_right(1); $$table_bottomleft = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_bottomleft->set_bottom(1); $$table_bottomleft->set_left(1); $$table_bottomRight = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottomRight->set_bottom(1); $$table_bottomRight->set_right(1); $$table_top = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_top->set_top(1); $$table_right = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_right->set_right(1); $$table_bottom = $wb->add_format(valign=>'center', font=>'Arial', size=>10); $$table_bottom->set_bottom(1); $$table_bottomItalicRed = $wb->add_format(valign=>'center', font=>'Arial', size=>10, italic=>1, color => 'red'); $$table_bottomItalicRed->set_bottom(1); $$table_left = $wb->add_format(valign=>'center', bold=>1, font=>'Arial', size=>10); $$table_left->set_left(1); my $bgColor_totallighGray = $wb->set_custom_color(54, 230, 230, 230); $$table_bottomrightHeader = $wb->add_format(bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_bottomrightHeader->set_bottom(1); $$table_bottomrightHeader->set_right(1); $$table_left2 = $wb->add_format(valign=>'left', font=>'Arial', size=>10); $$table_left2->set_left(1); $$table_middleHeader = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_middleHeader2 = $wb->add_format(valign=>'center', bg_color=>$bgColor_totallighGray, font=>'Arial', bold=>1, size=>10); $$table_middleHeader2->set_bottom(1); } # Define the color sub Color { my ($wb, $blue, $black, $red, $gray, $green, $pink) = @_; $$blue = $wb->set_custom_color(40, 0, 0, 204);# C:G>A:T in blue $$black = $wb->set_custom_color(41, 0, 0, 0);# C:G>G:C in black $$red = $wb->set_custom_color(42, 255, 0, 0);# C:G>T:A in red $$gray = $wb->set_custom_color(43, 205, 205, 205); # T:A>A:T in light gray $$green = $wb->set_custom_color(44, 0, 204, 51);# T:A>C:G in green $$pink = $wb->set_custom_color(45, 255, 192, 203);# T:A>G:C in pink } sub BackgroundColor { my ($wb, $bgColor_blue, $bgColor_black, $bgColor_red, $bgColor_gray, $bgColor_green, $bgColor_pink) = @_; $$bgColor_blue = $wb->set_custom_color(48, 0, 0, 204); $$bgColor_black = $wb->set_custom_color(49, 0, 0, 0); $$bgColor_red = $wb->set_custom_color(50, 255, 0, 0); $$bgColor_gray = $wb->set_custom_color(51, 205, 205, 205); $$bgColor_green = $wb->set_custom_color(52, 0, 204, 51); $$bgColor_pink = $wb->set_custom_color(53, 255, 192, 203); } =head1 NAME mutSpec-Stat =head1 SYNOPSIS mutSpecstat.pl [arguments] <query-file> <query-file> a folder with one or multiple VCFs Arguments: -h, --help print help message -m, --man print complete documentation -v, --verbose use verbose output --refGenome the reference genome to use (human, mouse or rat genomes) -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file --temp <string> the path for saving the temporary files --pathSeqRefGenome the path to the fasta reference sequences --poolData generate the pool of all the samples (optional) --reportSample generate a report for each sample (optional) Function: automatically run a pipeline and calculate various statistics on mutations Example: mutSpecstat.pl --refGenome hg19 --outfile output_directory --temp path_to_temporary_directory --pathRscript path_to_R_scripts --pathSeqRefGenome path_fasta_ref_seq --poolData --reportSample inputFolder Version: 02-2017 (February 2016) =head1 OPTIONS =over 8 =item B<--help> print a brief usage message and detailed explanation of options. =item B<--man> print the complete manual of the program. =item B<--verbose> use verbose output. =item B<--refGenome> the reference genome to use, could be human, mouse or rat genomes. =item B<--outfile> the directory of output file names. If it is nor specify the same directory as the input file is used. =item B<--temp> the path for saving temporary files generated by the script. If any is specify a temporary folder is created in the same directory where the script is running. Deleted when the script is finish =item B<--pathSeqRefGenome> The path to the fasta reference sequences =item B<--poolData only for the report> calculate the statistics on the pool of all the data pass in input =item B<--reportSample only for the report> generate a report for each samples =head1 DESCRIPTION mutSpecstat is a perl script for calculated various statistics on mutations An Excel report containing the mutation type distribution per functional region, the strand bias and the sequence context on genomic and coding sequence is created. The different statistics are illustrated using ggplot2. =cut