Mercurial > repos > dereeper > pangenome_explorer
diff Perl/GenerateHeatmapFromPAV.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Perl/GenerateHeatmapFromPAV.pl Thu May 30 11:52:25 2024 +0000 @@ -0,0 +1,264 @@ +#!/usr/bin/perl + +use strict; +use File::Basename; +my $dirname = dirname(__FILE__); + +my $infile = $ARGV[0]; +my $heatmap = $ARGV[1]; + +my %core_cluster; +my %samples; +my %sequences; +my %matrix; +my %hash; +my $cl_num = 0; +my $nb_strains = 1; +open(M,">$heatmap.accessory_01matrix.txt"); +open(PAN,">$heatmap.pangenes_01matrix.txt"); +open(F,"$infile"); +my $firstline = <F>; +$firstline =~s/\n//g;$firstline =~s/\r//g; +#my @infos = split(/","/,$firstline); +my @infos = split(/\t/,$firstline); +print M "Gene"; +print PAN "Gene"; +#for (my $j=14; $j <= $#infos; $j++){ +for (my $j=1; $j <= $#infos; $j++){ + my $gbfile = $infos[$j]; + $gbfile =~s/\"//g; + $gbfile =~s/\.gb\.filt//g; + + my $strain = $gbfile; + my @words = split(/_/,$strain); + my $genus = $words[0]; + my $species = $words[1]; + my $shortname = substr($genus,0,3) . "_". substr($species,0,2); + for (my $j = 2; $j <= $#words; $j++){ + $shortname.="_".$words[$j]; + } + #$shortname = substr($shortname,0,25); + + print M "\t".$strain; + print PAN "\t".$strain; + $samples{$j} = $strain; + $nb_strains++; +} +print M "\n"; +print PAN "\n"; +while(<F>){ + $cl_num++; + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + #my @infos = split(/","/,$line); + my @infos = split(/\t/,$line); + my $concat_accessory = ""; + #for (my $i = 14; $i <= $#infos; $i++){ + for (my $i = 1; $i <= $#infos; $i++){ + my $val = $infos[$i]; + my $sample = $samples{$i}; + $matrix{$sample}{$cl_num} = $val; + $val =~s/\"//g; + if ($val =~/\w+/){ + $concat_accessory .= "\t1"; + $hash{$sample}{$cl_num} = 1; + $sequences{$sample}.= "T"; + } + else{ + $concat_accessory .= "\t0"; + $hash{$sample}{$cl_num} = 0; + $sequences{$sample}.= "A"; + } + } + if ($concat_accessory =~/0/){ + print M $cl_num.$concat_accessory."\n"; + } + else{ + $core_cluster{$cl_num}=1; + } + print PAN $cl_num.$concat_accessory."\n"; + #if ($cl_num > 1000){last;} +} +close(F); +close(M); +close(PAN); + + +system("ulimit -s 163840;Rscript $dirname/../R/heatmap.R -f $heatmap.accessory_01matrix.txt -o $heatmap.complete.pdf"); +#system("ulimit -s 163840;Rscript $dirname/../R/heatmaply.R -f $heatmap.accessory_01matrix.txt -o $heatmap.complete2.svg"); +system("pdf2svg $heatmap.complete.pdf $heatmap.complete.svg"); + + +open(A,">$heatmap.alignment.fa"); +foreach my $sample(keys(%sequences)){ + my $seq = $sequences{$sample}; + print A ">$sample\n"; + print A "$seq\n"; +} +close(A); + + + +my $min_y = 10000000; +my $min_x = 10000000; +my $max_y = 0; +my $max_x = 0; +my $step_x; +my $step_y; +open(SVG,"$heatmap.complete.svg"); +open(SVGNEW,">$heatmap.complete.new.svg"); +while(<SVG>){ + if (/rgb\(100%,0%,0%\)/){ + $_ =~s/rgb\(100%,0%,0%\)/rgb\(100%,100%,100%\)/g; + } + if (/rgb\(100%,63.529968%,0%\)/){ + $_ =~s/rgb\(100%,63.529968%,0%\)/rgb\(41%,72%,64%\)/g; + } + print SVGNEW $_; +} +close(SVG); +close(SVGNEW); + + + +my @clusters; +open(F,"$heatmap.complete.pdf.cols.csv"); +<F>; +while(<F>){ + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + push(@clusters,$line); + print $line; +} +close(F); + +my @strains; +open(F,"$heatmap.complete.pdf.rows.csv"); +<F>; +while(<F>){ + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + push(@strains,$line); + print $line; +} +close(F); + + +open(O,">$infile.sorted"); +open(HP,">$infile.sorted.for_heatmap_plotly.txt"); +print O "ClutserID"."\t".join("\t",@strains)."\n"; +print HP "ClutserID"."\t".join("\t",@strains)."\n"; +foreach my $cl(reverse(@clusters)){ + print O $cl; + my $clnb = $cl; + if (length($clnb) == 1){$clnb = "000".$clnb;} + elsif (length($clnb) == 2){$clnb = "00".$clnb;} + elsif (length($clnb) == 3){$clnb = "0".$clnb;} + my $name = "CLUSTER".$clnb; + + print HP $name; + foreach my $strain(@strains){ + my $val = $matrix{$strain}{$cl}; + print O "\t$val"; + if ($val =~/\w/){print HP "\t1";} + else{print HP "\t0";} + } + print O "\n"; + print HP "\n"; +} +foreach my $cl(keys(%core_cluster)){ + print O $cl; + my $clnb = $cl; + if (length($clnb) == 1){$clnb = "000".$clnb;} + elsif (length($clnb) == 2){$clnb = "00".$clnb;} + elsif (length($clnb) == 3){$clnb = "0".$clnb;} + my $name = "CLUSTER".$clnb; + print HP $name; + foreach my $strain(@strains){ + my $val = $matrix{$strain}{$cl}; + print O "\t$val"; + if ($val =~/\w/){print HP "\t1";} + else{print HP "\t0";} + + } + print O "\n"; + print HP "\n"; +} +close(O); +close(HP); + +my $svg_section = ""; + +if (scalar @clusters == 0){ + system("touch $heatmap.upsetr.svg"); + if (!-e "$heatmap"){ + system("touch $heatmap"); + } + exit; +} +system("cp -rf $infile $infile.bkp"); +system("cp -rf $infile.sorted $infile"); + +my %combinaisons; +my $x = $min_x; +my $length_matrix = ($max_x-$min_x); +my $height_matrix = ($max_y-$min_y); +my $step_x = $length_matrix / scalar @clusters; +#my $step_y = $height_matrix / scalar @strains; + +print "$height_matrix $length_matrix $step_x $step_y\n"; +my $previous_combinaison = ""; +my $combinaison_id = 0; +my %combinaison_ids; +open(M,">$heatmap.upsetr.txt"); +print M "ClutserID"; +foreach my $strain(@strains){ + my @words = split(/_/,$strain); + my $genus = $words[0]; + my $species = $words[1]; + my $shortname = substr($genus,0,3) . "_". substr($species,0,2); + for (my $j = 2; $j <= $#words; $j++){ + $shortname.="_".$words[$j]; + } + $shortname = substr($shortname,0,25); + print M "\t".$shortname; +} +print M "\n"; +foreach my $cluster(@clusters){ + $x = $x+$step_x; + my $combinaison = ""; + my $numstrain; + print M "$cluster"; + foreach my $strain(@strains){ + $numstrain++; + my $val = $hash{$strain}{$cluster}; + print M "\t".$val; + if ($val == 1){ + $combinaison .= ",$numstrain"; + } + } + if ($combinaison ne $previous_combinaison){ + $combinaison_id++; + } + print M "\n"; + $previous_combinaison = $combinaison; + $combinaison_ids{$combinaison_id}=$combinaison; + #print "Cluster:$cluster Combinaison: $combinaison X:$x $combinaison_id\n"; + $combinaisons{$combinaison_id} .= "|".$x; + +} +close(M); + + +my $nb_strains = scalar @strains; +system("Rscript $dirname/../R/upsetr.R $heatmap.upsetr.txt $heatmap.upsetr.pdf $nb_strains >>$heatmap.upsetr.log 2>&1"); +system("pdf2svg $heatmap.upsetr.pdf $heatmap.upsetr.svg 2"); + +system("python3 $dirname/../Python/Heatmap.py -i $infile.sorted.for_heatmap_plotly.txt -o $heatmap.heatmap_plotly.html >>$heatmap.heatmap_plotly.log 2>&1"); + +if (!-e "$heatmap.upsetr.svg"){ + system("touch $heatmap.upsetr.svg"); +} +system("perl $dirname/../Perl/reformatHeatmapSVG.pl $heatmap.complete.svg $heatmap.complete.new.svg $infile.sorted.for_heatmap_plotly.txt"); +system("cp -rf $heatmap.complete.new.svg $heatmap"); +system("gzip $heatmap");