Mercurial > repos > cpt > cpt_psm_prep
comparison cpt_psm_2_gentable.pl @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:50:07 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:e4de0a0e90c8 | 1:d724f34e671d |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use Storable; | |
5 use CPT::GalaxyGetOpt; | |
6 use CPT::Bio::NW_MSA; | |
7 use Data::Dumper; | |
8 use CPT::Circos::Conf; | |
9 use POSIX; | |
10 | |
11 | |
12 my $ggo = CPT::GalaxyGetOpt->new(); | |
13 my $options = $ggo->getOptions( | |
14 'options' => [ | |
15 [ 'file', 'PSM2 Data File', { validate => 'File/Input', required => 1 } ], | |
16 [], | |
17 ['Cutoffs'], | |
18 ['evalue' , 'Evalue cutoff' , { validate => 'Float' , default => 1e-4 } ] , | |
19 ['dice' , 'Dice cutoff' , { validate => 'Float' , default => 50 } ] , | |
20 [], | |
21 ['Alignment Options'], | |
22 ['mismatch' , 'Mismatch Score' , { validate => 'Float' , default => -1} ] , | |
23 ['gap_penalty' , 'Gap Penalty' , { validate => 'Float' , default => '0.0' } ] , | |
24 ['match' , 'Match Score' , { validate => 'Float' , default => 5 } ] , | |
25 ], | |
26 'outputs' => [ | |
27 [ | |
28 'diff_table', | |
29 'Output Comparison Table', | |
30 { | |
31 validate => 'File/Output', | |
32 required => 1, | |
33 default => 'genome_comp', | |
34 data_format => 'text/tabular', | |
35 default_format => 'TSV_U', | |
36 }, | |
37 ], | |
38 [ | |
39 'blastclust', | |
40 'Output Blastclust Table', | |
41 { | |
42 validate => 'File/Output', | |
43 required => 1, | |
44 default => 'blastclust', | |
45 data_format => 'text/tabular', | |
46 default_format => 'TSV_U', | |
47 } | |
48 ], | |
49 ], | |
50 'defaults' => [ | |
51 'appid' => 'PSM.Comp', | |
52 'appname' => 'PSM Comparison Table', | |
53 'appdesc' => 'aligns and lists data from PSM Prep', | |
54 'appvers' => '1.94', | |
55 ], | |
56 'tests' => [ | |
57 ], | |
58 ); | |
59 | |
60 | |
61 my %data_file = %{retrieve($options->{file})}; | |
62 | |
63 print STDERR "Aliging genomes\n"; | |
64 my $msa = CPT::Bio::NW_MSA->new( | |
65 gap_penalty => $options->{'gap_penalty'}, | |
66 match_score => $options->{'match'}, | |
67 mismatch_score => $options->{'mismatch'}, | |
68 bidi => 1, | |
69 ); | |
70 | |
71 my @hits = @{$data_file{hit_table}}; | |
72 my @clusters; | |
73 | |
74 foreach my $hit(@hits){ | |
75 my ($from, $to, $evalue, $dice) = @{$hit}; | |
76 if($evalue < $options->{evalue} && $dice > $options->{dice}){ | |
77 if($options->{verbose}){ | |
78 print "$from $to\n"; | |
79 } | |
80 | |
81 my $foundmatch = 0; | |
82 foreach my $cluster(@clusters){ | |
83 if($from ~~ @{$cluster} || $to ~~ @{$cluster}){ | |
84 $foundmatch = 1; | |
85 if(!($from ~~ @{$cluster})){ | |
86 push(@{$cluster}, $from); | |
87 } | |
88 if(!($to ~~ @{$cluster})){ | |
89 push(@{$cluster}, $to); | |
90 } | |
91 } | |
92 } | |
93 if($foundmatch == 0){ | |
94 push(@clusters, ["".($#clusters+2), $from, $to]); | |
95 } | |
96 $msa->add_relationship($from, $to); | |
97 } | |
98 } | |
99 my @fixed_clusters; | |
100 foreach my $cluster (@clusters) { | |
101 my ($idx, @values) = @{$cluster}; | |
102 push(@fixed_clusters, [$idx, join(',', @values)]); | |
103 } | |
104 | |
105 my @user_ordering = keys($data_file{gbk}); | |
106 | |
107 foreach my $genome(@user_ordering){ | |
108 print STDERR "\tAligning $genome\n"; | |
109 my $gi_list_ref = $data_file{gbk}{$genome}{gi};#"GI" list | |
110 if(! defined $gi_list_ref){ | |
111 warn "Could not find $genome genome in the data file. Please be sure you have correctly specified the name of a genome from a genbank file. (See the LOCUS line for the name)."; | |
112 }else{ | |
113 $msa->align_list($gi_list_ref); | |
114 } | |
115 } | |
116 | |
117 my @aligned_results = $msa->merged_array(); | |
118 # Remove CRC64 hashes from sequences | |
119 foreach my $row(@aligned_results){ | |
120 $row = [map { s/_[A-F0-9]{16}$//; $_ } @{$row}]; | |
121 #my $key = ${$row}[0]; | |
122 #foreach my $cluster(@clusters){ | |
123 | |
124 #} | |
125 } | |
126 | |
127 my %table = ( | |
128 'Sheet1' => { | |
129 header => \@user_ordering, | |
130 data => \@aligned_results, | |
131 } | |
132 ); | |
133 | |
134 | |
135 use CPT::OutputFiles; | |
136 my $crr_output = CPT::OutputFiles->new( | |
137 name => 'diff_table', | |
138 GGO => $ggo, | |
139 ); | |
140 $crr_output->CRR(data => \%table); | |
141 | |
142 my %table2 = ( | |
143 'Sheet1' => { | |
144 header => ['Cluster ID', 'Contents'], | |
145 data => \@fixed_clusters, | |
146 } | |
147 ); | |
148 my $crr_output2 = CPT::OutputFiles->new( | |
149 name => 'blastclust', | |
150 GGO => $ggo, | |
151 ); | |
152 $crr_output2->CRR(data => \%table2); | |
153 | |
154 =head1 DESCRIPTION | |
155 | |
156 Following the execution of the PSM Prep tool, this tool simply aligns the genomes and generates a table comparison the positions of all proteins. It can be very useful to figure out which genes are missing in which genomes. | |
157 | |
158 =head2 IMPORTANT PARAMETERS | |
159 | |
160 =over 4 | |
161 | |
162 =item C<mismatch>, C<gap_penalty>, C<match> | |
163 | |
164 These parameters control the Needleman-Wunsch Multiple Sequence Alignment library's scoring scheme. Mismatch scores are generally negative and discourage unrelated proteins from being plotted in a line together. Match scores encourage related proteins to line up. Gap penalty is set at zero as we generally prefer gaps to mismatches in this tool; phage genomes are small and gaps are "cheap" to use, whereas mismatches can sometimes give an incorrect impression of relatedness. That said, how your plots look is completely up to you and we encourage experimentation! | |
165 | |
166 =back | |
167 | |
168 =cut |