comparison cpt_psm_1_plot.pl @ 1:8691c1c61a8e draft default tip

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:48:47 +0000
parents
children
comparison
equal deleted inserted replaced
0:54c7a3ea81e2 1:8691c1c61a8e
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Storable;
5 use CPT::GalaxyGetOpt;
6 use CPT::Bio::NW_MSA;
7 use Data::Dumper;
8 use CPT::Circos::Conf;
9 use POSIX;
10
11
12 my $ggo = CPT::GalaxyGetOpt->new();
13 my $options = $ggo->getOptions(
14 'options' => [
15 [ 'file', 'PSM2 Data File', { validate => 'File/Input', required => 1 } ],
16 [ 'user_ordering', 'List of genome IDs used in the analysis, can be comma/space/newline separated.', { validate => 'String', required => 1 }],
17 [],
18 ['Plot Options'],
19 ['percent_filled' , 'Percentage of a whole block that an individual gene is' , { validate => 'Float', default=>'0.8', min => '0.1', max => '1.0' }],
20 ['ig_dist' , 'Maximum length of links between genome comparisons' , { validate => 'Int', default => 100 }],
21 ['stroke_thickness' , 'Thickness of inter-genome links' , { validate => 'Int', default => '2', min => 1, max => 10 } ],
22 ['every_nth' , 'Plot every Nth gene a modified version of the main color for that genome', { validate => 'Int', default => '20'}],
23 [],
24 ['Cutoffs'],
25 ['evalue' , 'Evalue cutoff' , { validate => 'Float' , default => 1e-4 } ] ,
26 ['dice' , 'Dice cutoff' , { validate => 'Float' , default => 50 } ] ,
27 [],
28 ['Alignment Options'],
29 ['mismatch' , 'Mismatch Score' , { validate => 'Float' , default => -0.1 } ] ,
30 ['gap_penalty' , 'Gap Penalty' , { validate => 'Float' , default => '0.0' } ] ,
31 ['match' , 'Match Score' , { validate => 'Float' , default => 5 } ] ,
32 ],
33 'outputs' => [
34 [
35 'output_circos_confs',
36 'Output Circos Conf Object',
37 {
38 validate => 'File/Output',
39 required => 1,
40 default => 'psm3',
41 data_format => 'archive',
42 default_format => 'tar.gz',
43 }
44 ],
45 ],
46 'defaults' => [
47 'appid' => 'PSM.Plot',
48 'appname' => 'PSM Plotter',
49 'appdesc' => 'plots data from PSM Prep',
50 'appvers' => '1.94.2',
51 ],
52 'tests' => [
53 ],
54 );
55
56 my $percent_filled = $options->{percent_filled};
57 my $width = 1000*$percent_filled;
58 my $spacing = 1000-$width;
59
60
61
62 my $offset = ($width+$spacing)/2;
63 my $full_increment = $width+$spacing;
64 my $halfwidth = $width/2;
65
66 #my %option_map = (
67 #'offset' => ($width+$spacing)/2,
68 #'full_increment' => $width+$spacing,
69 #'halfwidth' => $width/2,
70 #'gap_penalty' => $options->{gap_penalty},
71 #'match' => $options->{match},
72 #'heatmap' => 1,
73 #'heatmap_low' => hex("0xCCCCCC"),
74 #'heatmap_high' => hex("0x333333"),
75 #'heatmap_bucket' => 8,
76 #'every_nth' => $options->{every_nth},
77 #'user_ordering' => $options->{user_ordering},
78 #'dice' => $options->{dice},
79 #'evalue' => $options->{evalue},
80 #);
81 #Color/name correspondance, to be used in writing the circos-0.63-pre1 files
82
83 my @user_ordering;
84 push(@user_ordering, split(/[,\n\r\s]+/, $options->{user_ordering}));
85
86 my @aligned_results;
87 my %fh_relationship;
88 my %precomputed_colour_hash;
89 my %protein_position_quicklookup;
90 my %response = ();
91 my %data_file = %{retrieve($options->{file})};
92
93 my %uo_idx;
94 for(my $i=0;$i<scalar @user_ordering;$i++){
95 $uo_idx{$user_ordering[$i]} = $i;
96 }
97
98 align();
99 my %compmap_proteins = %{compmap_proteins()};
100 # a => "link text"
101 my %linkages = %{linkages()};
102 # a => b => "link text"
103
104 sub align{
105 print STDERR "Aliging genomes\n";
106 my $msa = CPT::Bio::NW_MSA->new(
107 gap_penalty => $options->{'gap_penalty'},
108 match_score => $options->{'match'},
109 mismatch_score => $options->{'mismatch'},
110 bidi => 1,
111 );
112
113 my @hits = @{$data_file{hit_table}};
114
115 foreach my $hit(@hits){
116 my ($from, $to, $evalue, $dice) = @{$hit};
117 if($evalue < $options->{evalue} && $dice > $options->{dice}){
118 if($options->{verbose}){
119 print "$from $to\n";
120 }
121 $msa->add_relationship($from, $to);
122 }
123 }
124
125 foreach my $genome(@user_ordering){
126 print STDERR "\tAligning $genome\n";
127 my $gi_list_ref = $data_file{gbk}{$genome}{gi};#"GI" list
128 if(! defined $gi_list_ref){
129 warn "Could not find $genome genome in the data file. Please be sure you have correctly specified the name of a genome from a genbank file. (See the LOCUS line for the name).";
130 }else{
131 $msa->align_list($gi_list_ref);
132 }
133 }
134 @aligned_results = $msa->merged_array();
135 }
136 sub compmap_proteins{
137 my @Narr = ();#Keep count of how many items we've had in a single
138 #column, so the modulus whe we're colouring them in will work properly,
139 #rather than being on every Nth radial colum in the plot
140 #
141 my $_max = scalar @aligned_results;
142 my %protein_files;
143
144 #for(my $i = scalar @aligned_results - 1; $i >= 0; $i--){
145 for(my $i = 0; $i < scalar @aligned_results; $i++){
146 # Get the current row from the PSM result object
147 my @current_row = @{$aligned_results[$i]};
148 if($options->{verbose}){
149 print join("\t", @current_row) , "\n";
150 }
151
152 for(my $j = 0; $j < scalar @current_row; $j++){
153 if($current_row[$j] ne "-"){
154 $protein_position_quicklookup{$current_row[$j]} = [$i,$j];
155 $Narr[$j]++;
156 my $color_str = '';
157 if($Narr[$j] % $options->{every_nth} == 0){
158 $color_str = 'fill_color=accent-8-qual-inv-' . ($j+1) . ',';
159 }
160 my $str = join(' ',
161 'compmap ',
162 (($i+1) * $full_increment - $halfwidth ),
163 (($i+1) * $full_increment + $halfwidth ),
164 "${color_str}f=". $current_row[$j]
165 );
166 $protein_files{$user_ordering[$j]} .= $str . "\n";
167 }
168 }
169 }
170 return \%protein_files;
171 }
172 sub linkages{
173 my @hits = @{$data_file{hit_table}};
174 my %links;
175 foreach my $hit (@hits){
176 my ($from, $to, $evalue, $dice) = @{$hit};
177 if($evalue < $options->{evalue} && $dice > $options->{dice}){
178 if(defined $protein_position_quicklookup{$from} && defined $protein_position_quicklookup{$to}){
179 my ($theta0,$radius0) = @{$protein_position_quicklookup{$from}};
180 my ($theta1,$radius1) = @{$protein_position_quicklookup{$to}};
181 # If this is a self-self link, disable plotting because we don't care.
182 # If ig_dist is disabled or distance is between them is less than our minimum
183 if($radius0 != $radius1
184 && ($options->{'ig_dist'} == "-1" || abs($theta0-$theta1) <= $options->{'ig_dist'})
185 ){
186 # Create the dataset
187 my @row_data = ('compmap',
188 );
189 # We work under the assumption that all hits
190 # are bi-directional, so we swap them to be
191 # smallest first no matter what.
192 if($radius1 < $radius0){
193 my $tmp = $radius1;
194 $radius1 = $radius0;
195 $radius0 = $tmp;
196 # We also want to add in reverse order
197 push(@row_data,
198 (($theta0+1)*$full_increment),
199 (($theta1+1)*$full_increment),
200 );
201 }else{
202 push(@row_data,
203 (($theta1+1)*$full_increment),
204 (($theta0+1)*$full_increment),
205 );
206 }
207
208 # If it's a link with the same theta
209 # value, then we'll go ahead and hide
210 # behind the track to make it a little
211 # prettier.
212 my $zstr;
213 if($theta0 == $theta1){
214 $zstr="z=0";
215 }else{
216 $zstr="z=100";
217 }
218
219 # Create the additional row data
220 push(@row_data,
221 join(',', "dice=$dice", "color=" . colorstr($dice))
222 );
223 $links{$radius0}{$radius1} .= join(' ', @row_data) . "\n";
224 }
225 }
226 }
227 }
228 return \%links;
229 }
230 sub colorstr {
231 my ($dice) = @_;
232 if($dice > 90) {
233 return 'black';
234 }else{
235 return 'greys-9-seq-' . floor($dice / 10);
236 }
237 }
238 sub circosconf {
239 my $cc = CPT::Circos::Conf->new();
240 $cc->include('etc/colors_fonts_patterns.conf');
241 $cc->start_block('colors');
242 $cc->set('accent-8-qual-inv-1', '42, 135, 42');
243 $cc->set('accent-8-qual-inv-2', '111, 83, 150');
244 $cc->set('accent-8-qual-inv-3', '182, 112, 46');
245 $cc->set('accent-8-qual-inv-4', '178, 178, 53');
246 $cc->set('accent-8-qual-inv-5', '13, 63, 128');
247 $cc->set('accent-8-qual-inv-6', '183, 0, 96');
248 $cc->set('accent-8-qual-inv-7', '116, 47, 0');
249 $cc->set('accent-8-qual-inv-8', '45, 45, 45');
250 $cc->end_block();
251 # markings indicating position along genome
252 $cc->include('example/etc/ideogram.conf');
253 #$cc->include('rules.conf');
254 # Genome data
255 $cc->set('karyotype', 'karyotype.conf');
256 # Default image params are fine
257 $cc->start_block('image');
258 $cc->include('etc/image.conf');
259 $cc->end_block();
260 #$cc->include('highlights.conf');
261 $cc->include('plots.conf');
262 #$cc->include('rules.conf');
263
264 $cc->include('etc/housekeeping.conf');
265 my $result = $cc->finalize();
266 $cc = CPT::Circos::Conf->new();
267 return $result;
268 }
269 sub karyotype {
270 my @karyotype_data = (
271 "chr - compmap compmap 0 ".((scalar @aligned_results)*1000+500)." white"
272 );
273 return join("\n", @karyotype_data);
274 }
275
276 my @files = ();
277
278 my $number_of_tracks = 0;
279 sub register_track {
280 my ($r0,$r1) = calculate_individual_track($number_of_tracks);
281 $number_of_tracks++;
282 return ($r0, $r1);
283 }
284 sub calculate_individual_track {
285 my ($idx) = @_;
286 my $r0 = ( 90 - (10 * $idx - 1)/1) / 100;
287 my $r1 = ( 90 - (10 * $idx - 9)/1) / 100;
288 return ($r0, $r1);
289 }
290 sub genome_data {
291 my $cc = CPT::Circos::Conf->new();
292 # Map string back to position in array.
293 #$cc->set('z',10);
294 # Loop across all our protein data sets
295 $cc->start_block('plots');
296 foreach my $genome(@user_ordering){
297 # Add protein file
298 my $filename = sprintf('org.features.%s.txt', $genome);
299 push(@files, [ 'data/'.$filename, $compmap_proteins{$genome}]);
300 # Create associated tracks
301
302 my ($r0,$r1) = register_track();
303 $cc->start_block('plot');
304 $cc->set('type','highlight');
305 $cc->set('file', $filename);
306 $cc->set('r0', $r0 .'r');
307 $cc->set('r1', $r1 .'r');
308 $cc->set('z', '50');
309 $cc->set('fill_color','accent-8-qual-' . ($uo_idx{$genome} + 1));
310 $cc->set('stroke_thickness', '1');
311 $cc->set('stroke_color', 'black');
312 $cc->end_block();
313 }
314
315
316 foreach my $from(@user_ordering){
317 foreach my $to(@user_ordering){
318 next if($from eq $to || $uo_idx{$from} > $uo_idx{$to});
319
320 if($linkages{$uo_idx{$from}}{$uo_idx{$to}}){
321 my $filename = sprintf('links.%s.%s.txt', $from, $to);
322 push(@files, [ 'data/'.$filename, $linkages{$uo_idx{$from}}{$uo_idx{$to}}]);
323 #push(@files, [ 'data/'.$filename, 'blaaaaaaah']);
324
325 my ($r0a, $r0b) = calculate_individual_track($uo_idx{$to});
326 my ($r1a, $r1b) = calculate_individual_track($uo_idx{$from});
327
328 # If they're in this ordering, they will be pointing at
329 # the "outsides" of each genome/protein track, so we
330 # swap with the internal ones.
331 if($r1b > $r0a){
332 $r0a = $r1a;
333 $r1b = $r0b;
334 }
335
336 $cc->start_block('plot');
337 $cc->set('type','connector');
338 $cc->set('thickness', $options->{stroke_thickness});
339 $cc->set('file', $filename);
340 if($r1b<$r0a){
341 $cc->set('r0', $r1b .'r');
342 $cc->set('r1', $r0a .'r');
343 }else{
344 $cc->set('r0', $r0a .'r');
345 $cc->set('r1', $r1b .'r');
346 }
347 $cc->set('connector_dims', '0,0.3,0.4,0.3,0');
348 $cc->set('color','black');
349 $cc->end_block();
350 }
351 }
352 }
353
354
355 $cc->end_block();
356 return $cc->finalize();
357 }
358 sub rulesconf {
359 my ($self) = @_;
360 my $cc = CPT::Circos::Conf->new();
361 $cc->start_block('rules');
362 for(my $i = 0; $i < 10; $i++){
363 $cc->start_block('rule');
364 $cc->set('importance', 10 - $i);
365 $cc->set('condition', 'var(dice) > ' . (10*$i));
366 if($i == 9){
367 $cc->set('color', 'black');
368 }else{
369 $cc->set('color', 'gray' . (10 * ($i+1)));
370 }
371 #$cc->set('z',10-$i);
372 $cc->end_block();
373 }
374 $cc->end_block();
375 }
376
377 push(@files, [ 'etc/karyotype.conf', karyotype() ]);
378 push(@files, [ 'etc/circos.conf', circosconf() ]);
379 #push(@files, [ 'etc/rules.conf', rulesconf() ]);
380 push(@files, [ 'etc/plots.conf', genome_data() ]);
381
382
383
384 use Archive::Any::Create;
385 my $archive = Archive::Any::Create->new();
386 $archive->container('conf');
387 foreach(@files){
388 my ($location, $content) = @{$_};
389 $archive->add_file($location, $content);
390 }
391
392 use CPT::OutputFiles;
393 my $crr_output = CPT::OutputFiles->new(
394 name => 'output_circos_confs',
395 GGO => $ggo,
396 );
397 $crr_output->CRR(data => $archive);
398
399
400 =head1 NAME
401
402 PSM Plotter
403
404 =head1 DESCRIPTION
405
406 Following the execution of the PSM Prep tool, this tool plots a subset of those genomes as ciruclar tracks with protein-protein relationships plotted as links between the boxes representing proteins.
407
408 =head2 IMPORTANT PARAMETERS
409
410 =over 4
411
412 =item C<user_ordering>
413
414 This parameter controls the order in which genomes are aligned and then plotted. This MUST contain (comma/space) separated values listing the order (outside to in) in which you want your genomes to appear. In this field, type the name of each genome. The name can be found on the first line of the file under "LOCUS NC_00000001", where NC_00000001 would be the genome's name.
415
416 =item C<evalue>, C<dice>
417
418 Adjusting these parameters will affect which links are plotted. Links are heatmapped into bins based on dice score as that is the easiest measure to work with, and scales nicely from 0 to 100. For example, a link with a dice score of 20-29 would be plotted as 20% black (grey20), whereas a dice score of 90+ would be plotted as solid black
419
420 =item C<mismatch>, C<gap_penalty>, C<match>
421
422 These parameters control the Needleman-Wunsch Multiple Sequence Alignment library's scoring scheme. Mismatch scores are generally negative and discourage unrelated proteins from being plotted in a line together. Match scores encourage related proteins to line up. Gap penalty is set at zero as we generally prefer gaps to mismatches in this tool; phage genomes are small and gaps are "cheap" to use, whereas mismatches can sometimes give an incorrect impression of relatedness. That said, how your plots look is completely up to you and we encourage experimentation!
423
424 =item C<every_nth>
425
426 Every Nth gene in a genome will be plotted a slightly different color.
427
428 =back
429
430 =head2 Why Can't I Control Colors?
431
432 Brewer colors compose Brewer palettes which have been manually defined by
433 Cynthia Brewer for their perceptual properties.
434
435 http://circos.ca/tutorials/lessons/configuration/colors/
436
437 Color palette choice is very important to creating an attractive and easy to read graphic. In the words of Dr. Krzywinski, L<Color palettes matter|http://mkweb.bcgsc.ca/jclub/biovis/brewer/colorpalettes.pdf>. Humans selecting from an RGB/HSV color palette tend to make poor choices, so we've removed the option in lieu of using the very attractive L<Brewer Palettes|http://colorbrewer2.org/>. Specifically, I've selected the 8 class qualtitative "Accent" color set, which has produced some very nice maps. If you would like the option of selecting amongst the other 8-class qualitative color sets, please file a bug report and let me know.
438
439
440 =cut