Mercurial > repos > cpt > cpt_psm_plotter
comparison cpt_psm_1_plot.pl @ 1:8691c1c61a8e draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:48:47 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:54c7a3ea81e2 | 1:8691c1c61a8e |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use Storable; | |
5 use CPT::GalaxyGetOpt; | |
6 use CPT::Bio::NW_MSA; | |
7 use Data::Dumper; | |
8 use CPT::Circos::Conf; | |
9 use POSIX; | |
10 | |
11 | |
12 my $ggo = CPT::GalaxyGetOpt->new(); | |
13 my $options = $ggo->getOptions( | |
14 'options' => [ | |
15 [ 'file', 'PSM2 Data File', { validate => 'File/Input', required => 1 } ], | |
16 [ 'user_ordering', 'List of genome IDs used in the analysis, can be comma/space/newline separated.', { validate => 'String', required => 1 }], | |
17 [], | |
18 ['Plot Options'], | |
19 ['percent_filled' , 'Percentage of a whole block that an individual gene is' , { validate => 'Float', default=>'0.8', min => '0.1', max => '1.0' }], | |
20 ['ig_dist' , 'Maximum length of links between genome comparisons' , { validate => 'Int', default => 100 }], | |
21 ['stroke_thickness' , 'Thickness of inter-genome links' , { validate => 'Int', default => '2', min => 1, max => 10 } ], | |
22 ['every_nth' , 'Plot every Nth gene a modified version of the main color for that genome', { validate => 'Int', default => '20'}], | |
23 [], | |
24 ['Cutoffs'], | |
25 ['evalue' , 'Evalue cutoff' , { validate => 'Float' , default => 1e-4 } ] , | |
26 ['dice' , 'Dice cutoff' , { validate => 'Float' , default => 50 } ] , | |
27 [], | |
28 ['Alignment Options'], | |
29 ['mismatch' , 'Mismatch Score' , { validate => 'Float' , default => -0.1 } ] , | |
30 ['gap_penalty' , 'Gap Penalty' , { validate => 'Float' , default => '0.0' } ] , | |
31 ['match' , 'Match Score' , { validate => 'Float' , default => 5 } ] , | |
32 ], | |
33 'outputs' => [ | |
34 [ | |
35 'output_circos_confs', | |
36 'Output Circos Conf Object', | |
37 { | |
38 validate => 'File/Output', | |
39 required => 1, | |
40 default => 'psm3', | |
41 data_format => 'archive', | |
42 default_format => 'tar.gz', | |
43 } | |
44 ], | |
45 ], | |
46 'defaults' => [ | |
47 'appid' => 'PSM.Plot', | |
48 'appname' => 'PSM Plotter', | |
49 'appdesc' => 'plots data from PSM Prep', | |
50 'appvers' => '1.94.2', | |
51 ], | |
52 'tests' => [ | |
53 ], | |
54 ); | |
55 | |
56 my $percent_filled = $options->{percent_filled}; | |
57 my $width = 1000*$percent_filled; | |
58 my $spacing = 1000-$width; | |
59 | |
60 | |
61 | |
62 my $offset = ($width+$spacing)/2; | |
63 my $full_increment = $width+$spacing; | |
64 my $halfwidth = $width/2; | |
65 | |
66 #my %option_map = ( | |
67 #'offset' => ($width+$spacing)/2, | |
68 #'full_increment' => $width+$spacing, | |
69 #'halfwidth' => $width/2, | |
70 #'gap_penalty' => $options->{gap_penalty}, | |
71 #'match' => $options->{match}, | |
72 #'heatmap' => 1, | |
73 #'heatmap_low' => hex("0xCCCCCC"), | |
74 #'heatmap_high' => hex("0x333333"), | |
75 #'heatmap_bucket' => 8, | |
76 #'every_nth' => $options->{every_nth}, | |
77 #'user_ordering' => $options->{user_ordering}, | |
78 #'dice' => $options->{dice}, | |
79 #'evalue' => $options->{evalue}, | |
80 #); | |
81 #Color/name correspondance, to be used in writing the circos-0.63-pre1 files | |
82 | |
83 my @user_ordering; | |
84 push(@user_ordering, split(/[,\n\r\s]+/, $options->{user_ordering})); | |
85 | |
86 my @aligned_results; | |
87 my %fh_relationship; | |
88 my %precomputed_colour_hash; | |
89 my %protein_position_quicklookup; | |
90 my %response = (); | |
91 my %data_file = %{retrieve($options->{file})}; | |
92 | |
93 my %uo_idx; | |
94 for(my $i=0;$i<scalar @user_ordering;$i++){ | |
95 $uo_idx{$user_ordering[$i]} = $i; | |
96 } | |
97 | |
98 align(); | |
99 my %compmap_proteins = %{compmap_proteins()}; | |
100 # a => "link text" | |
101 my %linkages = %{linkages()}; | |
102 # a => b => "link text" | |
103 | |
104 sub align{ | |
105 print STDERR "Aliging genomes\n"; | |
106 my $msa = CPT::Bio::NW_MSA->new( | |
107 gap_penalty => $options->{'gap_penalty'}, | |
108 match_score => $options->{'match'}, | |
109 mismatch_score => $options->{'mismatch'}, | |
110 bidi => 1, | |
111 ); | |
112 | |
113 my @hits = @{$data_file{hit_table}}; | |
114 | |
115 foreach my $hit(@hits){ | |
116 my ($from, $to, $evalue, $dice) = @{$hit}; | |
117 if($evalue < $options->{evalue} && $dice > $options->{dice}){ | |
118 if($options->{verbose}){ | |
119 print "$from $to\n"; | |
120 } | |
121 $msa->add_relationship($from, $to); | |
122 } | |
123 } | |
124 | |
125 foreach my $genome(@user_ordering){ | |
126 print STDERR "\tAligning $genome\n"; | |
127 my $gi_list_ref = $data_file{gbk}{$genome}{gi};#"GI" list | |
128 if(! defined $gi_list_ref){ | |
129 warn "Could not find $genome genome in the data file. Please be sure you have correctly specified the name of a genome from a genbank file. (See the LOCUS line for the name)."; | |
130 }else{ | |
131 $msa->align_list($gi_list_ref); | |
132 } | |
133 } | |
134 @aligned_results = $msa->merged_array(); | |
135 } | |
136 sub compmap_proteins{ | |
137 my @Narr = ();#Keep count of how many items we've had in a single | |
138 #column, so the modulus whe we're colouring them in will work properly, | |
139 #rather than being on every Nth radial colum in the plot | |
140 # | |
141 my $_max = scalar @aligned_results; | |
142 my %protein_files; | |
143 | |
144 #for(my $i = scalar @aligned_results - 1; $i >= 0; $i--){ | |
145 for(my $i = 0; $i < scalar @aligned_results; $i++){ | |
146 # Get the current row from the PSM result object | |
147 my @current_row = @{$aligned_results[$i]}; | |
148 if($options->{verbose}){ | |
149 print join("\t", @current_row) , "\n"; | |
150 } | |
151 | |
152 for(my $j = 0; $j < scalar @current_row; $j++){ | |
153 if($current_row[$j] ne "-"){ | |
154 $protein_position_quicklookup{$current_row[$j]} = [$i,$j]; | |
155 $Narr[$j]++; | |
156 my $color_str = ''; | |
157 if($Narr[$j] % $options->{every_nth} == 0){ | |
158 $color_str = 'fill_color=accent-8-qual-inv-' . ($j+1) . ','; | |
159 } | |
160 my $str = join(' ', | |
161 'compmap ', | |
162 (($i+1) * $full_increment - $halfwidth ), | |
163 (($i+1) * $full_increment + $halfwidth ), | |
164 "${color_str}f=". $current_row[$j] | |
165 ); | |
166 $protein_files{$user_ordering[$j]} .= $str . "\n"; | |
167 } | |
168 } | |
169 } | |
170 return \%protein_files; | |
171 } | |
172 sub linkages{ | |
173 my @hits = @{$data_file{hit_table}}; | |
174 my %links; | |
175 foreach my $hit (@hits){ | |
176 my ($from, $to, $evalue, $dice) = @{$hit}; | |
177 if($evalue < $options->{evalue} && $dice > $options->{dice}){ | |
178 if(defined $protein_position_quicklookup{$from} && defined $protein_position_quicklookup{$to}){ | |
179 my ($theta0,$radius0) = @{$protein_position_quicklookup{$from}}; | |
180 my ($theta1,$radius1) = @{$protein_position_quicklookup{$to}}; | |
181 # If this is a self-self link, disable plotting because we don't care. | |
182 # If ig_dist is disabled or distance is between them is less than our minimum | |
183 if($radius0 != $radius1 | |
184 && ($options->{'ig_dist'} == "-1" || abs($theta0-$theta1) <= $options->{'ig_dist'}) | |
185 ){ | |
186 # Create the dataset | |
187 my @row_data = ('compmap', | |
188 ); | |
189 # We work under the assumption that all hits | |
190 # are bi-directional, so we swap them to be | |
191 # smallest first no matter what. | |
192 if($radius1 < $radius0){ | |
193 my $tmp = $radius1; | |
194 $radius1 = $radius0; | |
195 $radius0 = $tmp; | |
196 # We also want to add in reverse order | |
197 push(@row_data, | |
198 (($theta0+1)*$full_increment), | |
199 (($theta1+1)*$full_increment), | |
200 ); | |
201 }else{ | |
202 push(@row_data, | |
203 (($theta1+1)*$full_increment), | |
204 (($theta0+1)*$full_increment), | |
205 ); | |
206 } | |
207 | |
208 # If it's a link with the same theta | |
209 # value, then we'll go ahead and hide | |
210 # behind the track to make it a little | |
211 # prettier. | |
212 my $zstr; | |
213 if($theta0 == $theta1){ | |
214 $zstr="z=0"; | |
215 }else{ | |
216 $zstr="z=100"; | |
217 } | |
218 | |
219 # Create the additional row data | |
220 push(@row_data, | |
221 join(',', "dice=$dice", "color=" . colorstr($dice)) | |
222 ); | |
223 $links{$radius0}{$radius1} .= join(' ', @row_data) . "\n"; | |
224 } | |
225 } | |
226 } | |
227 } | |
228 return \%links; | |
229 } | |
230 sub colorstr { | |
231 my ($dice) = @_; | |
232 if($dice > 90) { | |
233 return 'black'; | |
234 }else{ | |
235 return 'greys-9-seq-' . floor($dice / 10); | |
236 } | |
237 } | |
238 sub circosconf { | |
239 my $cc = CPT::Circos::Conf->new(); | |
240 $cc->include('etc/colors_fonts_patterns.conf'); | |
241 $cc->start_block('colors'); | |
242 $cc->set('accent-8-qual-inv-1', '42, 135, 42'); | |
243 $cc->set('accent-8-qual-inv-2', '111, 83, 150'); | |
244 $cc->set('accent-8-qual-inv-3', '182, 112, 46'); | |
245 $cc->set('accent-8-qual-inv-4', '178, 178, 53'); | |
246 $cc->set('accent-8-qual-inv-5', '13, 63, 128'); | |
247 $cc->set('accent-8-qual-inv-6', '183, 0, 96'); | |
248 $cc->set('accent-8-qual-inv-7', '116, 47, 0'); | |
249 $cc->set('accent-8-qual-inv-8', '45, 45, 45'); | |
250 $cc->end_block(); | |
251 # markings indicating position along genome | |
252 $cc->include('example/etc/ideogram.conf'); | |
253 #$cc->include('rules.conf'); | |
254 # Genome data | |
255 $cc->set('karyotype', 'karyotype.conf'); | |
256 # Default image params are fine | |
257 $cc->start_block('image'); | |
258 $cc->include('etc/image.conf'); | |
259 $cc->end_block(); | |
260 #$cc->include('highlights.conf'); | |
261 $cc->include('plots.conf'); | |
262 #$cc->include('rules.conf'); | |
263 | |
264 $cc->include('etc/housekeeping.conf'); | |
265 my $result = $cc->finalize(); | |
266 $cc = CPT::Circos::Conf->new(); | |
267 return $result; | |
268 } | |
269 sub karyotype { | |
270 my @karyotype_data = ( | |
271 "chr - compmap compmap 0 ".((scalar @aligned_results)*1000+500)." white" | |
272 ); | |
273 return join("\n", @karyotype_data); | |
274 } | |
275 | |
276 my @files = (); | |
277 | |
278 my $number_of_tracks = 0; | |
279 sub register_track { | |
280 my ($r0,$r1) = calculate_individual_track($number_of_tracks); | |
281 $number_of_tracks++; | |
282 return ($r0, $r1); | |
283 } | |
284 sub calculate_individual_track { | |
285 my ($idx) = @_; | |
286 my $r0 = ( 90 - (10 * $idx - 1)/1) / 100; | |
287 my $r1 = ( 90 - (10 * $idx - 9)/1) / 100; | |
288 return ($r0, $r1); | |
289 } | |
290 sub genome_data { | |
291 my $cc = CPT::Circos::Conf->new(); | |
292 # Map string back to position in array. | |
293 #$cc->set('z',10); | |
294 # Loop across all our protein data sets | |
295 $cc->start_block('plots'); | |
296 foreach my $genome(@user_ordering){ | |
297 # Add protein file | |
298 my $filename = sprintf('org.features.%s.txt', $genome); | |
299 push(@files, [ 'data/'.$filename, $compmap_proteins{$genome}]); | |
300 # Create associated tracks | |
301 | |
302 my ($r0,$r1) = register_track(); | |
303 $cc->start_block('plot'); | |
304 $cc->set('type','highlight'); | |
305 $cc->set('file', $filename); | |
306 $cc->set('r0', $r0 .'r'); | |
307 $cc->set('r1', $r1 .'r'); | |
308 $cc->set('z', '50'); | |
309 $cc->set('fill_color','accent-8-qual-' . ($uo_idx{$genome} + 1)); | |
310 $cc->set('stroke_thickness', '1'); | |
311 $cc->set('stroke_color', 'black'); | |
312 $cc->end_block(); | |
313 } | |
314 | |
315 | |
316 foreach my $from(@user_ordering){ | |
317 foreach my $to(@user_ordering){ | |
318 next if($from eq $to || $uo_idx{$from} > $uo_idx{$to}); | |
319 | |
320 if($linkages{$uo_idx{$from}}{$uo_idx{$to}}){ | |
321 my $filename = sprintf('links.%s.%s.txt', $from, $to); | |
322 push(@files, [ 'data/'.$filename, $linkages{$uo_idx{$from}}{$uo_idx{$to}}]); | |
323 #push(@files, [ 'data/'.$filename, 'blaaaaaaah']); | |
324 | |
325 my ($r0a, $r0b) = calculate_individual_track($uo_idx{$to}); | |
326 my ($r1a, $r1b) = calculate_individual_track($uo_idx{$from}); | |
327 | |
328 # If they're in this ordering, they will be pointing at | |
329 # the "outsides" of each genome/protein track, so we | |
330 # swap with the internal ones. | |
331 if($r1b > $r0a){ | |
332 $r0a = $r1a; | |
333 $r1b = $r0b; | |
334 } | |
335 | |
336 $cc->start_block('plot'); | |
337 $cc->set('type','connector'); | |
338 $cc->set('thickness', $options->{stroke_thickness}); | |
339 $cc->set('file', $filename); | |
340 if($r1b<$r0a){ | |
341 $cc->set('r0', $r1b .'r'); | |
342 $cc->set('r1', $r0a .'r'); | |
343 }else{ | |
344 $cc->set('r0', $r0a .'r'); | |
345 $cc->set('r1', $r1b .'r'); | |
346 } | |
347 $cc->set('connector_dims', '0,0.3,0.4,0.3,0'); | |
348 $cc->set('color','black'); | |
349 $cc->end_block(); | |
350 } | |
351 } | |
352 } | |
353 | |
354 | |
355 $cc->end_block(); | |
356 return $cc->finalize(); | |
357 } | |
358 sub rulesconf { | |
359 my ($self) = @_; | |
360 my $cc = CPT::Circos::Conf->new(); | |
361 $cc->start_block('rules'); | |
362 for(my $i = 0; $i < 10; $i++){ | |
363 $cc->start_block('rule'); | |
364 $cc->set('importance', 10 - $i); | |
365 $cc->set('condition', 'var(dice) > ' . (10*$i)); | |
366 if($i == 9){ | |
367 $cc->set('color', 'black'); | |
368 }else{ | |
369 $cc->set('color', 'gray' . (10 * ($i+1))); | |
370 } | |
371 #$cc->set('z',10-$i); | |
372 $cc->end_block(); | |
373 } | |
374 $cc->end_block(); | |
375 } | |
376 | |
377 push(@files, [ 'etc/karyotype.conf', karyotype() ]); | |
378 push(@files, [ 'etc/circos.conf', circosconf() ]); | |
379 #push(@files, [ 'etc/rules.conf', rulesconf() ]); | |
380 push(@files, [ 'etc/plots.conf', genome_data() ]); | |
381 | |
382 | |
383 | |
384 use Archive::Any::Create; | |
385 my $archive = Archive::Any::Create->new(); | |
386 $archive->container('conf'); | |
387 foreach(@files){ | |
388 my ($location, $content) = @{$_}; | |
389 $archive->add_file($location, $content); | |
390 } | |
391 | |
392 use CPT::OutputFiles; | |
393 my $crr_output = CPT::OutputFiles->new( | |
394 name => 'output_circos_confs', | |
395 GGO => $ggo, | |
396 ); | |
397 $crr_output->CRR(data => $archive); | |
398 | |
399 | |
400 =head1 NAME | |
401 | |
402 PSM Plotter | |
403 | |
404 =head1 DESCRIPTION | |
405 | |
406 Following the execution of the PSM Prep tool, this tool plots a subset of those genomes as ciruclar tracks with protein-protein relationships plotted as links between the boxes representing proteins. | |
407 | |
408 =head2 IMPORTANT PARAMETERS | |
409 | |
410 =over 4 | |
411 | |
412 =item C<user_ordering> | |
413 | |
414 This parameter controls the order in which genomes are aligned and then plotted. This MUST contain (comma/space) separated values listing the order (outside to in) in which you want your genomes to appear. In this field, type the name of each genome. The name can be found on the first line of the file under "LOCUS NC_00000001", where NC_00000001 would be the genome's name. | |
415 | |
416 =item C<evalue>, C<dice> | |
417 | |
418 Adjusting these parameters will affect which links are plotted. Links are heatmapped into bins based on dice score as that is the easiest measure to work with, and scales nicely from 0 to 100. For example, a link with a dice score of 20-29 would be plotted as 20% black (grey20), whereas a dice score of 90+ would be plotted as solid black | |
419 | |
420 =item C<mismatch>, C<gap_penalty>, C<match> | |
421 | |
422 These parameters control the Needleman-Wunsch Multiple Sequence Alignment library's scoring scheme. Mismatch scores are generally negative and discourage unrelated proteins from being plotted in a line together. Match scores encourage related proteins to line up. Gap penalty is set at zero as we generally prefer gaps to mismatches in this tool; phage genomes are small and gaps are "cheap" to use, whereas mismatches can sometimes give an incorrect impression of relatedness. That said, how your plots look is completely up to you and we encourage experimentation! | |
423 | |
424 =item C<every_nth> | |
425 | |
426 Every Nth gene in a genome will be plotted a slightly different color. | |
427 | |
428 =back | |
429 | |
430 =head2 Why Can't I Control Colors? | |
431 | |
432 Brewer colors compose Brewer palettes which have been manually defined by | |
433 Cynthia Brewer for their perceptual properties. | |
434 | |
435 http://circos.ca/tutorials/lessons/configuration/colors/ | |
436 | |
437 Color palette choice is very important to creating an attractive and easy to read graphic. In the words of Dr. Krzywinski, L<Color palettes matter|http://mkweb.bcgsc.ca/jclub/biovis/brewer/colorpalettes.pdf>. Humans selecting from an RGB/HSV color palette tend to make poor choices, so we've removed the option in lieu of using the very attractive L<Brewer Palettes|http://colorbrewer2.org/>. Specifically, I've selected the 8 class qualtitative "Accent" color set, which has produced some very nice maps. If you would like the option of selecting amongst the other 8-class qualitative color sets, please file a bug report and let me know. | |
438 | |
439 | |
440 =cut |