comparison Perl/GenerateHeatmapFromPAV.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 use strict;
4 use File::Basename;
5 my $dirname = dirname(__FILE__);
6
7 my $infile = $ARGV[0];
8 my $heatmap = $ARGV[1];
9
10 my %core_cluster;
11 my %samples;
12 my %sequences;
13 my %matrix;
14 my %hash;
15 my $cl_num = 0;
16 my $nb_strains = 1;
17 open(M,">$heatmap.accessory_01matrix.txt");
18 open(PAN,">$heatmap.pangenes_01matrix.txt");
19 open(F,"$infile");
20 my $firstline = <F>;
21 $firstline =~s/\n//g;$firstline =~s/\r//g;
22 #my @infos = split(/","/,$firstline);
23 my @infos = split(/\t/,$firstline);
24 print M "Gene";
25 print PAN "Gene";
26 #for (my $j=14; $j <= $#infos; $j++){
27 for (my $j=1; $j <= $#infos; $j++){
28 my $gbfile = $infos[$j];
29 $gbfile =~s/\"//g;
30 $gbfile =~s/\.gb\.filt//g;
31
32 my $strain = $gbfile;
33 my @words = split(/_/,$strain);
34 my $genus = $words[0];
35 my $species = $words[1];
36 my $shortname = substr($genus,0,3) . "_". substr($species,0,2);
37 for (my $j = 2; $j <= $#words; $j++){
38 $shortname.="_".$words[$j];
39 }
40 #$shortname = substr($shortname,0,25);
41
42 print M "\t".$strain;
43 print PAN "\t".$strain;
44 $samples{$j} = $strain;
45 $nb_strains++;
46 }
47 print M "\n";
48 print PAN "\n";
49 while(<F>){
50 $cl_num++;
51 my $line = $_;
52 $line =~s/\n//g;$line =~s/\r//g;
53 #my @infos = split(/","/,$line);
54 my @infos = split(/\t/,$line);
55 my $concat_accessory = "";
56 #for (my $i = 14; $i <= $#infos; $i++){
57 for (my $i = 1; $i <= $#infos; $i++){
58 my $val = $infos[$i];
59 my $sample = $samples{$i};
60 $matrix{$sample}{$cl_num} = $val;
61 $val =~s/\"//g;
62 if ($val =~/\w+/){
63 $concat_accessory .= "\t1";
64 $hash{$sample}{$cl_num} = 1;
65 $sequences{$sample}.= "T";
66 }
67 else{
68 $concat_accessory .= "\t0";
69 $hash{$sample}{$cl_num} = 0;
70 $sequences{$sample}.= "A";
71 }
72 }
73 if ($concat_accessory =~/0/){
74 print M $cl_num.$concat_accessory."\n";
75 }
76 else{
77 $core_cluster{$cl_num}=1;
78 }
79 print PAN $cl_num.$concat_accessory."\n";
80 #if ($cl_num > 1000){last;}
81 }
82 close(F);
83 close(M);
84 close(PAN);
85
86
87 system("ulimit -s 163840;Rscript $dirname/../R/heatmap.R -f $heatmap.accessory_01matrix.txt -o $heatmap.complete.pdf");
88 #system("ulimit -s 163840;Rscript $dirname/../R/heatmaply.R -f $heatmap.accessory_01matrix.txt -o $heatmap.complete2.svg");
89 system("pdf2svg $heatmap.complete.pdf $heatmap.complete.svg");
90
91
92 open(A,">$heatmap.alignment.fa");
93 foreach my $sample(keys(%sequences)){
94 my $seq = $sequences{$sample};
95 print A ">$sample\n";
96 print A "$seq\n";
97 }
98 close(A);
99
100
101
102 my $min_y = 10000000;
103 my $min_x = 10000000;
104 my $max_y = 0;
105 my $max_x = 0;
106 my $step_x;
107 my $step_y;
108 open(SVG,"$heatmap.complete.svg");
109 open(SVGNEW,">$heatmap.complete.new.svg");
110 while(<SVG>){
111 if (/rgb\(100%,0%,0%\)/){
112 $_ =~s/rgb\(100%,0%,0%\)/rgb\(100%,100%,100%\)/g;
113 }
114 if (/rgb\(100%,63.529968%,0%\)/){
115 $_ =~s/rgb\(100%,63.529968%,0%\)/rgb\(41%,72%,64%\)/g;
116 }
117 print SVGNEW $_;
118 }
119 close(SVG);
120 close(SVGNEW);
121
122
123
124 my @clusters;
125 open(F,"$heatmap.complete.pdf.cols.csv");
126 <F>;
127 while(<F>){
128 my $line = $_;
129 $line =~s/\n//g;$line =~s/\r//g;
130 push(@clusters,$line);
131 print $line;
132 }
133 close(F);
134
135 my @strains;
136 open(F,"$heatmap.complete.pdf.rows.csv");
137 <F>;
138 while(<F>){
139 my $line = $_;
140 $line =~s/\n//g;$line =~s/\r//g;
141 push(@strains,$line);
142 print $line;
143 }
144 close(F);
145
146
147 open(O,">$infile.sorted");
148 open(HP,">$infile.sorted.for_heatmap_plotly.txt");
149 print O "ClutserID"."\t".join("\t",@strains)."\n";
150 print HP "ClutserID"."\t".join("\t",@strains)."\n";
151 foreach my $cl(reverse(@clusters)){
152 print O $cl;
153 my $clnb = $cl;
154 if (length($clnb) == 1){$clnb = "000".$clnb;}
155 elsif (length($clnb) == 2){$clnb = "00".$clnb;}
156 elsif (length($clnb) == 3){$clnb = "0".$clnb;}
157 my $name = "CLUSTER".$clnb;
158
159 print HP $name;
160 foreach my $strain(@strains){
161 my $val = $matrix{$strain}{$cl};
162 print O "\t$val";
163 if ($val =~/\w/){print HP "\t1";}
164 else{print HP "\t0";}
165 }
166 print O "\n";
167 print HP "\n";
168 }
169 foreach my $cl(keys(%core_cluster)){
170 print O $cl;
171 my $clnb = $cl;
172 if (length($clnb) == 1){$clnb = "000".$clnb;}
173 elsif (length($clnb) == 2){$clnb = "00".$clnb;}
174 elsif (length($clnb) == 3){$clnb = "0".$clnb;}
175 my $name = "CLUSTER".$clnb;
176 print HP $name;
177 foreach my $strain(@strains){
178 my $val = $matrix{$strain}{$cl};
179 print O "\t$val";
180 if ($val =~/\w/){print HP "\t1";}
181 else{print HP "\t0";}
182
183 }
184 print O "\n";
185 print HP "\n";
186 }
187 close(O);
188 close(HP);
189
190 my $svg_section = "";
191
192 if (scalar @clusters == 0){
193 system("touch $heatmap.upsetr.svg");
194 if (!-e "$heatmap"){
195 system("touch $heatmap");
196 }
197 exit;
198 }
199 system("cp -rf $infile $infile.bkp");
200 system("cp -rf $infile.sorted $infile");
201
202 my %combinaisons;
203 my $x = $min_x;
204 my $length_matrix = ($max_x-$min_x);
205 my $height_matrix = ($max_y-$min_y);
206 my $step_x = $length_matrix / scalar @clusters;
207 #my $step_y = $height_matrix / scalar @strains;
208
209 print "$height_matrix $length_matrix $step_x $step_y\n";
210 my $previous_combinaison = "";
211 my $combinaison_id = 0;
212 my %combinaison_ids;
213 open(M,">$heatmap.upsetr.txt");
214 print M "ClutserID";
215 foreach my $strain(@strains){
216 my @words = split(/_/,$strain);
217 my $genus = $words[0];
218 my $species = $words[1];
219 my $shortname = substr($genus,0,3) . "_". substr($species,0,2);
220 for (my $j = 2; $j <= $#words; $j++){
221 $shortname.="_".$words[$j];
222 }
223 $shortname = substr($shortname,0,25);
224 print M "\t".$shortname;
225 }
226 print M "\n";
227 foreach my $cluster(@clusters){
228 $x = $x+$step_x;
229 my $combinaison = "";
230 my $numstrain;
231 print M "$cluster";
232 foreach my $strain(@strains){
233 $numstrain++;
234 my $val = $hash{$strain}{$cluster};
235 print M "\t".$val;
236 if ($val == 1){
237 $combinaison .= ",$numstrain";
238 }
239 }
240 if ($combinaison ne $previous_combinaison){
241 $combinaison_id++;
242 }
243 print M "\n";
244 $previous_combinaison = $combinaison;
245 $combinaison_ids{$combinaison_id}=$combinaison;
246 #print "Cluster:$cluster Combinaison: $combinaison X:$x $combinaison_id\n";
247 $combinaisons{$combinaison_id} .= "|".$x;
248
249 }
250 close(M);
251
252
253 my $nb_strains = scalar @strains;
254 system("Rscript $dirname/../R/upsetr.R $heatmap.upsetr.txt $heatmap.upsetr.pdf $nb_strains >>$heatmap.upsetr.log 2>&1");
255 system("pdf2svg $heatmap.upsetr.pdf $heatmap.upsetr.svg 2");
256
257 system("python3 $dirname/../Python/Heatmap.py -i $infile.sorted.for_heatmap_plotly.txt -o $heatmap.heatmap_plotly.html >>$heatmap.heatmap_plotly.log 2>&1");
258
259 if (!-e "$heatmap.upsetr.svg"){
260 system("touch $heatmap.upsetr.svg");
261 }
262 system("perl $dirname/../Perl/reformatHeatmapSVG.pl $heatmap.complete.svg $heatmap.complete.new.svg $infile.sorted.for_heatmap_plotly.txt");
263 system("cp -rf $heatmap.complete.new.svg $heatmap");
264 system("gzip $heatmap");