3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<po2group_stats.pl> - categorize orthologs from Proteinortho5
|
|
12 output according to genome groups
|
|
13
|
|
14 =head1 SYNOPSIS
|
|
15
|
|
16 C<perl po2group_stats.pl -i matrix.proteinortho -d genome_fasta_dir/
|
|
17 -g group_file.tsv -p E<gt> overall_stats.tsv>
|
|
18
|
|
19 =head1 DESCRIPTION
|
|
20
|
|
21 Categorize the genomes in an ortholog/paralog output matrix (option
|
|
22 B<-i>) from a
|
|
23 L<B<Proteinortho5>|http://www.bioinf.uni-leipzig.de/Software/proteinortho/>
|
|
24 calculation according to group affiliations. The group
|
|
25 affiliations of the genomes are intended to get overall
|
|
26 presence/absence statistics for groups of genomes and not simply
|
|
27 single genomes (e.g. comparing 'marine', 'earth', 'commensal',
|
|
28 'pathogenic' etc. genome groups). Percentage inclusion (option
|
|
29 B<-cut_i>) and exclusion (option B<-cut_e>) cutoffs can be set to
|
|
30 define how strict the presence/absence of genome groups within an
|
|
31 orthologous group (OG) are defined. Of course groups can also hold
|
|
32 only single genomes to get single genome statistics. Group
|
|
33 affiliations are defined in a mandatory B<tab-delimited> group input
|
|
34 file (option B<-g>) with B<minimal two> and B<maximal four> groups.
|
|
35
|
|
36 Only alphanumeric (a-z, A-Z, 0-9), underscore (_), dash (-), and
|
|
37 period (.) characters are allowed for the B<group names> in the
|
|
38 group file to avoid downstream problems with the operating/file
|
|
39 system. As a consequence, also no whitespaces are allowed in these!
|
|
40 Additionally, B<group names>, B<genome filenames> (should be
|
|
41 enforced by the file system), and B<FASTA IDs> considering B<all>
|
|
42 genome files (mostly locus tags; should be enforced by
|
|
43 Proteinortho5) need to be B<unique>.
|
|
44
|
|
45 B<Proteinortho5> (PO) has to be run with option B<-singles> to
|
|
46 include also genes without orthologs, so-called singletons/ORFans,
|
|
47 for each genome in the PO matrix (see the
|
|
48 L<PO manual|http://www.bioinf.uni-leipzig.de/Software/proteinortho/manual.html>).
|
|
49 Additionally, option B<-selfblast> is recommended to enhance
|
|
50 paralog detection by PO.
|
|
51
|
|
52 To explain the logic behind the categorization, the following
|
|
53 annotation for example groups will be used. A '1' exemplifies a
|
|
54 group genome count in a respective OG E<gt>= the rounded inclusion
|
|
55 cutoff, a '0' a group genome count E<lt>= the rounded exclusion
|
|
56 cutoff. The presence and absence of OGs for the group affiliations
|
|
57 are structured in different categories depending on the number of
|
|
58 groups. For B<two groups> (e.g. A and B) there are five categories:
|
|
59 'A specific' (A:B = 1:0), 'B specific' (0:1), 'cutoff core' (1:1),
|
|
60 'underrepresented' (0:0), and 'unspecific'. Unspecific OGs have a
|
|
61 genome count for at least B<one> group outside the cutoffs
|
|
62 (exclusion cutoff E<lt> genome count E<lt> inclusion cutoff) and
|
|
63 thus cannot be categorized. These 'unspecific' OGs will only be
|
|
64 printed to a final annotation result file with option B<-u>. Overall
|
|
65 stats for all categories are printed to C<STDOUT> in a final
|
|
66 tab-delimited output matrix.
|
|
67
|
|
68 B<Three groups> (A, B, and C) have the following nine categories: 'A
|
|
69 specific' (A:B:C = 1:0:0), 'B specific' (0:1:0), 'C specific'
|
|
70 (0:0:1), 'A absent' (0:1:1), 'B absent' (1:0:1), 'C absent' (1:1:0),
|
|
71 'cutoff core' (1:1:1), 'underrepresented' (0:0:0), and 'unspecific'.
|
|
72
|
|
73 B<Four groups> (A, B, C, and D) are classified in 17 categories: 'A
|
|
74 specific' (A:B:C:D = 1:0:0:0), 'B specific' (0:1:0:0), 'C specific'
|
|
75 (0:0:1:0), 'D specific' (0:0:0:1), 'A-B specific' (1:1:0:0), 'A-C
|
|
76 specific' (1:0:1:0), 'A-D specific' (1:0:0:1), 'B-C specific'
|
|
77 (0:1:1:0), 'B-D specific' (0:1:0:1), 'C-D specific' (0:0:1:1), 'A
|
|
78 absent' (0:1:1:1), 'B absent' (1:0:1:1), 'C absent' (1:1:0:1), 'D
|
|
79 absent' (1:1:1:0), 'cutoff core' (1:1:1:1), 'underrepresented'
|
|
80 (0:0:0:0), and 'unspecific'.
|
|
81
|
|
82 The resulting group presence/absence (according to the cutoffs) can
|
|
83 also be printed to a binary matrix (option B<-b>) in the result
|
|
84 directory (option B<-r>), excluding the 'unspecific' category. Since
|
|
85 the categories are the logics underlying venn diagrams, you can also
|
|
86 plot the results in a venn diagram using the binary matrix (option
|
|
87 B<-p>). The 'underrepresented' category is exempt from the venn
|
|
88 diagram, because it is outside of venn diagram logics.
|
|
89
|
|
90 For an illustration of the logics have a look at the example venn
|
|
91 diagrams F<./pics/venn_diagram_logics.[svg|png]>.
|
|
92
|
|
93 There are two optional categories (which are only considered for the
|
|
94 final print outs and in the final stats matrix, not for the binary
|
|
95 matrix and the venn diagram): 'strict core' (option B<-co>) for OGs
|
|
96 where B<all> genomes have an ortholog. Of course all the 'strict
|
|
97 core' OGs are also included in the 'cutoff core' category ('strict
|
|
98 core' is identical to 'cutoff core' with B<-cut_i> 1 and B<-cut_e>
|
|
99 0). Option B<-s> activates the detection of 'singleton/ORFan' OGs
|
|
100 present in only B<one> genome. Depending on the cutoffs and number
|
|
101 of genomes in the groups, category 'underrepresented' includes most
|
|
102 of these singletons.
|
|
103
|
|
104 Additionally, annotation is retrieved from multi-FASTA files created
|
|
105 with L<C<cds_extractor.pl>|/cds_extractor>. See
|
|
106 L<C<cds_extractor.pl>|/cds_extractor> for a description of the
|
|
107 format. These files are used as input for the PO analysis and with
|
|
108 option B<-d> for C<po2group_stats.pl>. The annotations are printed
|
|
109 in category output files in the result directory.
|
|
110
|
|
111 Annotations are only pulled from one representative genome for each
|
|
112 category present in the current OG. With option B<-co> you can set a
|
|
113 specific genome for the representative annotation for category
|
|
114 'strict core'. For all other categories the representative genome is
|
|
115 chosen according to the order of the genomes in the group files,
|
|
116 depending on the presence in each OG. Thus, the best annotated
|
|
117 genome should be in the first group at the topmost position
|
|
118 (especially for 'cutoff core'), as well as the best annotated ones
|
|
119 at the top in all other groups.
|
|
120
|
|
121 In the result files, each orthologous group (OG) is listed in a row
|
|
122 of the resulting category files, the first column holds the OG
|
|
123 numbers from the PO input matrix (i.e. line number minus one). The
|
|
124 following columns specify the ID for each CDS, gene, EC number(s),
|
|
125 product, and organism are shown depending on their presence in the
|
|
126 CDS's annotation. The ID is in most cases the locus tag (see
|
|
127 L<C<cds_extractor.pl>|/cds_extractor>). If several EC numbers exist
|
|
128 for a single CDS they are separated by a ';'. If the representative
|
|
129 genome within an OG includes paralogs (co-orthologs) these will be
|
|
130 printed in the following row(s) B<without> a new OG number in the
|
|
131 first column.
|
|
132
|
|
133 The number of OGs in the category annotation result files are the
|
|
134 same as listed in the venn diagram and the final stats matrix.
|
|
135 However, since only annotation from one representative annotation is
|
|
136 used the CDS number will be different to the final stats. The final
|
|
137 stats include B<all> the CDS in this category in B<all> genomes
|
|
138 present in the OG in groups E<gt>= the inclusion cutoff (i.e. for
|
|
139 'strict core' the CDS for all genomes in this OG are counted). Two
|
|
140 categories are different, for 'unspecific' all unspecific groups are
|
|
141 included, for 'underrepresented' all groups E<lt>= the exclusion
|
|
142 cutoffs. This is also the reason, the 'pangenome' CDS count is
|
|
143 greater than the 'included in categories' CDS count in the final
|
|
144 stats matrix, as genomes in excluded groups are exempt from the CDS
|
|
145 counts for most categories. 'Included in categories' is the OG/CDS
|
|
146 sum of all non-optional categories ('*specific', '*absent', 'cutoff
|
|
147 core', 'underrepresented', and 'unspecific'), since the optional
|
|
148 categories are included in non-optionals. An exception to the
|
|
149 difference in CDS counts are the 'singletons' category where OG and
|
|
150 CDS counts are identical in the result files and in the overall
|
|
151 final output matrix (as there is only one genome), as well as in
|
|
152 group-'specific' categories for groups including only one genome.
|
|
153
|
|
154 At last, if you want the respective representative sequences for a
|
|
155 category you can first filter the locus tags from the result file
|
|
156 with Unix command-line tools:
|
|
157
|
|
158 C<grep -v "^#" result_file.tsv | cut -f 2 E<gt> locus_tags.txt>
|
|
159
|
|
160 And then feed the locus tag list to
|
|
161 L<C<cds_extractor.pl>|/cds_extractor> with option B<-l>.
|
|
162
|
|
163 As a final note, in the L<F<prot_finder>|/prot_finder> workflow is a
|
|
164 script, C<binary_group_stats.pl>, based upon C<po2group_stats.pl>,
|
|
165 which can calculate overall presence/absence statistics for column
|
|
166 groups in a delimited TEXT binary matrix (as with genomes here).
|
|
167
|
|
168 =head1 OPTIONS
|
|
169
|
|
170 =head2 Mandatory options
|
|
171
|
|
172 =over 20
|
|
173
|
|
174 =item B<-i>=I<str>, B<-input>=I<str>
|
|
175
|
|
176 Proteinortho (PO) result matrix (*.proteinortho or *.poff)
|
|
177
|
|
178 =item B<-d>=I<str>, B<-dir_genome>=I<str>
|
|
179
|
|
180 Path to the directory including the genome multi-FASTA PO input
|
|
181 files (*.faa or *.ffn), created with
|
|
182 L<C<cds_extractor.pl>|/cds_extractor>
|
|
183
|
|
184 =item B<-g>=I<str>, B<-groups_file>=I<str>
|
|
185
|
|
186 Tab-delimited file with group affiliation for the genomes with
|
|
187 B<minimal two> and B<maximal four> groups (easiest to create in a
|
|
188 spreadsheet software and save in tab-separated format). B<All>
|
|
189 genomes from the PO matrix need to be included. Group names can only
|
|
190 include alphanumeric (a-z, A-Z, 0-9), underscore (_), dash (-), and
|
|
191 period (.) characters (no whitespaces allowed either). Example
|
|
192 format with two genomes in group A, three in group B and D, and one
|
|
193 in group C:
|
|
194
|
|
195 group_A\tgroup_B\tgroup_C\tgroup_D
|
|
196 genome1.faa\tgenome2.faa\tgenome3.faa\tgenome4.faa
|
|
197 genome5.faa\tgenome6.faa\t\tgenome7.faa
|
|
198 \tgenome8.faa\t\tgenome9.faa
|
|
199
|
|
200 =back
|
|
201
|
|
202 =head2 Optional options
|
|
203
|
|
204 =over 20
|
|
205
|
|
206 =item B<-h>, B<-help>
|
|
207
|
|
208 Help (perldoc POD)
|
|
209
|
|
210 =item B<-r>=I<str>, B<-result_dir>=I<str>
|
|
211
|
|
212 Path to result folder [default = inclusion and exclusion percentage
|
|
213 cutoff, './results_i#_e#']
|
|
214
|
|
215 =item B<-cut_i>=I<float>, B<-cut_inclusion>=I<float>
|
|
216
|
|
217 Percentage inclusion cutoff for genomes in a group per OG, has to be
|
|
218 E<gt> 0 and E<lt>= 1. Cutoff will be rounded according to the genome
|
|
219 number in each group and has to be E<gt> the rounded exclusion
|
|
220 cutoff in this group. [default = 0.9]
|
|
221
|
|
222 =item B<-cut_e>=I<float>, B<-cut_exclusion>=I<float>
|
|
223
|
|
224 Percentage exclusion cutoff, has to be E<gt>= 0 and E<lt> 1. Rounded
|
|
225 cutoff has to be E<lt> rounded inclusion cutoff. [default = 0.1]
|
|
226
|
|
227 =item B<-b>, B<-binary_matrix>
|
|
228
|
|
229 Print a binary matrix with the presence/absence genome group results
|
|
230 according to the cutoffs (excluding 'unspecific' category OGs)
|
|
231
|
|
232 =item B<-p>, B<-plot_venn>
|
|
233
|
|
234 Plot venn diagram from the binary matrix (except 'unspecific' and
|
|
235 'underrepresented' categories) with function C<venn> from B<R>
|
|
236 package B<gplots>, requires option B<-b>
|
|
237
|
|
238 =item B<-co>=(I<str>), B<-core_strict>=(I<str>)
|
|
239
|
|
240 Include 'strict core' category in output. Optionally, give a genome
|
|
241 name from the PO matrix to use for the representative output
|
|
242 annotation. [default = topmost genome in first group]
|
|
243
|
|
244 =item B<-s>, B<-singletons>
|
|
245
|
|
246 Include singletons/ORFans for each genome in the output, activates
|
|
247 also overall genome OG/CDS stats in final stats matrix for genomes
|
|
248 with singletons
|
|
249
|
|
250 =item B<-u>, B<-unspecific>
|
|
251
|
|
252 Include 'unspecific' category representative annotation file in
|
|
253 result directory
|
|
254
|
|
255 =item B<-a>, B<-all_genomes_overall>
|
|
256
|
|
257 Report overall stats for all genomes (appended to the final stats
|
|
258 matrix), also those without singletons; will include all overall
|
|
259 genome stats without option B<-s>
|
|
260
|
|
261 =item B<-v>, B<-version>
|
|
262
|
|
263 Print version number to C<STDERR>
|
|
264
|
|
265 =back
|
|
266
|
|
267 =head1 OUTPUT
|
|
268
|
|
269 =over 20
|
|
270
|
|
271 =item C<STDOUT>
|
|
272
|
|
273 The tab-delimited final stats matrix is printed to C<STDOUT>.
|
|
274 Redirect or pipe into another tool as needed.
|
|
275
|
|
276 =item F<./results_i#_e#>
|
|
277
|
|
278 All output files are stored in a results folder
|
|
279
|
|
280 =item F<./results_i#_e#/[*_specific|*_absent|cutoff_core|underrepresented]_OGs.tsv>
|
|
281
|
|
282 Tab-delimited files with OG annotation from a representative genome
|
|
283 for non-optional categories
|
|
284
|
|
285 =item (F<./results_i#_e#/[*_singletons|strict_core|unspecific]_OGs.tsv>)
|
|
286
|
|
287 Optional category tab-delimited output files with representative
|
|
288 annotation
|
|
289
|
|
290 =item (F<./results_i#_e#/binary_matrix.tsv>)
|
|
291
|
|
292 Tab-delimited binary matrix of group presence/absence results
|
|
293 according to cutoffs (excluding 'unspecific' category)
|
|
294
|
|
295 =item (F<./results_i#_e#/venn_diagram.pdf>)
|
|
296
|
|
297 Venn diagram for non-optional categories (except 'unspecific' and
|
|
298 'underrepresented' categories)
|
|
299
|
|
300 =back
|
|
301
|
|
302 =head1 EXAMPLES
|
|
303
|
|
304 =head2 L<C<cds_extractor.pl>|/cds_extractor>
|
|
305
|
|
306 =over
|
|
307
|
|
308 =item C<for i in *.[gbk|embl]; do perl cds_extractor.pl -i $i [-p|-n]; done>
|
|
309
|
|
310 =back
|
|
311
|
|
312 =head2 L<B<Proteinortho5>|http://www.bioinf.uni-leipzig.de/Software/proteinortho/>
|
|
313
|
|
314 =over
|
|
315
|
|
316 =item C<proteinortho5.pl -graph [-synteny] -cpus=# -selfblast -singles -identity=50 -cov=50 -blastParameters='-use_sw_tback [-seg no|-dust no]' *.[faa|ffn]>
|
|
317
|
|
318 =back
|
|
319
|
|
320 =head2 C<po2group_stats.pl>
|
|
321
|
|
322 =over
|
|
323
|
|
324 =item C<perl po2group_stats.pl -i matrix.[proteinortho|poff] -d genome_fasta_dir/ -g group_file.tsv -r result_dir -cut_i 0.7 -cut_e 0.2 -b -p -co genome4.[faa|ffn] -s -u -a E<gt> overall_stats.tsv>
|
|
325
|
|
326 =back
|
|
327
|
|
328 =head1 DEPENDENCIES
|
|
329
|
|
330 =over
|
|
331
|
|
332 =item B<Statistical computing language L<R|http://www.r-project.org/>>
|
|
333
|
|
334 C<Rscript> is needed to plot the venn diagram with option B<-p>,
|
|
335 tested with version 3.2.2
|
|
336
|
|
337 =item B<L<gplots|https://cran.r-project.org/web/packages/gplots/index.html>>
|
|
338
|
|
339 Package needed for B<R> to plot the venn diagram with function
|
|
340 C<venn>. Tested with B<gplots> version 2.17.0.
|
|
341
|
|
342 =back
|
|
343
|
|
344 =head1 VERSION
|
|
345
|
|
346 0.1.3 update: 06-06-2016
|
|
347 0.1 23-10-2015
|
|
348
|
|
349 =head1 AUTHOR
|
|
350
|
|
351 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
352
|
|
353 =head1 LICENSE
|
|
354
|
|
355 This program is free software: you can redistribute it and/or modify
|
|
356 it under the terms of the GNU General Public License as published by
|
|
357 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
358 License, or (at your option) any later version.
|
|
359
|
|
360 This program is distributed in the hope that it will be useful, but
|
|
361 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
362 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
363 General Public License for more details.
|
|
364
|
|
365 You should have received a copy of the GNU General Public License
|
|
366 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
367
|
|
368 =cut
|
|
369
|
|
370
|
|
371 ########
|
|
372 # MAIN #
|
|
373 ########
|
|
374
|
|
375 use strict;
|
|
376 use warnings;
|
|
377 use autodie;
|
|
378 use Getopt::Long;
|
|
379 use Pod::Usage;
|
|
380
|
|
381 my $Cmdline = "$0 @ARGV"; # used call command
|
|
382
|
|
383 ### Get the options with Getopt::Long
|
|
384 my $PO_Matrix_File; # PO result matrix (*.proteinortho or *.poff)
|
|
385 my $Genome_Dir; # directory with genome multi-FASTA files (PO input)
|
|
386 my $Groups_File; # tab-separated file with group classification for the genomes
|
|
387 my $Result_Dir; # path to results folder; default is set below to '"./results_i".$Inclusion_Cutoff."c".$Exclusion_Cutoff'
|
|
388 my $Inclusion_Cutoff = 0.9; # inclusion percentage cutoff for genome count per OG (floating point number >0 && <=1)
|
|
389 my $Exclusion_Cutoff = 0.1; # exclusion percentage cutoff (floating point number >=0 && <1)
|
|
390 my $Opt_Binary; # print resulting binary matrix according to the group inclusion/exclusion results
|
|
391 my $Opt_Venn; # create venn diagram with R package 'gplots' function 'venn'; requires $Opt_Binary
|
|
392 my $Strict_Core; # report also strict core genome, i.e. OGs present in ALL genomes of PO matrix; either option flag (and use first genome in first group) or give genome file for result annotation print
|
|
393 my $Opt_Singletons; # report also singletons for each genome (outside of the group classifications)
|
|
394 my $Opt_Unspecific; # report also group-unspecific OGs ('exclusion < OG_group_genome_count < inclusion' for any group)
|
|
395 my $Opt_Genomes_Overall; # report overall stats also for genomes without singletons
|
|
396 my $VERSION = '0.1.3';
|
|
397 my ($Opt_Version, $Opt_Help);
|
|
398 GetOptions ('input=s' => \$PO_Matrix_File,
|
|
399 'dir_genome=s' => \$Genome_Dir,
|
|
400 'groups_file=s' => \$Groups_File,
|
|
401 'result_dir=s' => \$Result_Dir,
|
|
402 'cut_inclusion=f' => \$Inclusion_Cutoff,
|
|
403 'cut_exclusion=f' => \$Exclusion_Cutoff,
|
|
404 'binary_matrix' => \$Opt_Binary,
|
|
405 'plot_venn' => \$Opt_Venn,
|
|
406 'core_strict:s' => \$Strict_Core,
|
|
407 'singletons' => \$Opt_Singletons,
|
|
408 'unspecific' => \$Opt_Unspecific,
|
|
409 'all_genomes_overall' => \$Opt_Genomes_Overall,
|
|
410 'version' => \$Opt_Version,
|
|
411 'help|?' => \$Opt_Help)
|
|
412 or pod2usage(-verbose => 1, -exitval => 2);
|
|
413
|
|
414
|
|
415
|
|
416 ### Run perldoc on POD and enforce mandatory options
|
|
417 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
418 die "$0 $VERSION\n" if ($Opt_Version);
|
|
419
|
|
420 if (!$PO_Matrix_File || !$Genome_Dir || !$Groups_File) {
|
|
421 my $warning = "\n### Fatal error: Mandatory options '-i', '-d', or '-g' or their arguments are missing!\n";
|
|
422 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
423 }
|
|
424 $Genome_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Genome_Dir path
|
|
425 die "\n### Fatal error: Genome directory '$Genome_Dir' does not exist: $!\n" if (!-d $Genome_Dir);
|
|
426
|
|
427 if (($Inclusion_Cutoff <= 0 || $Inclusion_Cutoff > 1) || ($Exclusion_Cutoff < 0 || $Exclusion_Cutoff >= 1)) {
|
|
428 my $warning = "\n### Fatal error:\nInclusion (0 < '-cut_i' <= 1) or exclusion (0 <= '-cut_e' < 1) cutoff(s) not chosen correctly!\n";
|
|
429 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
430 }
|
|
431
|
|
432 print STDERR "Script call command: $Cmdline\n"; # print call command after '-h|-v'
|
|
433
|
|
434 if ($Opt_Venn && !$Opt_Binary) {
|
|
435 warn "Option '-p' to draw the venn diagram set, but not it's required option '-b' for the binary matrix. Forcing option '-b'!\n";
|
|
436 $Opt_Binary = 1;
|
|
437 }
|
|
438
|
|
439
|
|
440
|
|
441 ### Parse genomes in groups file
|
|
442 print STDERR "Parsing group file '$Groups_File' "; # run status of script
|
|
443 open (my $Groups_File_Fh, "<", "$Groups_File");
|
|
444
|
|
445 my @Group_Names; # store group name column order
|
|
446 my %Genome_Groups; # hash of hash with group name as key, internal hash with resp. genomes anonymous array (key 'genomes'), or calculated integer inclusion/exclusion cutoffs (key 'inclusion' etc.)
|
|
447 while (<$Groups_File_Fh>) {
|
|
448 chomp;
|
|
449 next if (/^\s*$/); # skip empty lines
|
|
450
|
|
451 # check groups file header and get group names
|
|
452 if ($. == 1) { # header of groups file (first line)
|
|
453 die "\n### Fatal error:\nGroups file '$Groups_File' does not have a correctly formatted header line with at least TWO and maximal FOUR tab-separated group name columns (without other whitespaces)!\n" if (!/^\S+\t\S+\t?\S*\t?\S*$/);
|
|
454 foreach my $group_name (split(/\t/)) {
|
|
455 check_file_system_conformity($group_name, 'group names'); # subroutine to check string only contains alphanumeric '0-9a-zA-Z', underscore '_', dash '-', and period '.' characters
|
|
456 die "\n### Fatal error:\nGroup name '$group_name' is not unique in the group file '$Groups_File'! Please choose unique characters/names for the groups in the group file!\n" if (grep {/$group_name/} @Group_Names);
|
|
457 push (@Group_Names, $group_name); # store groups array
|
|
458 }
|
|
459 next; # skip header line for subsequent code
|
|
460 }
|
|
461
|
|
462 # parse genomes in groups file
|
|
463 die "\n### Fatal error:\nLine '$.' in the groups file '$Groups_File' does not have the correct format with maximal four tab-separated genome file names according to the groups in the header (without other whitespaces):\n$_\n" if (!/^\S*\t?\S*\t?\S*\t?\S*$/);
|
|
464 my $column = -1; # column counter to associate genome filename (column) to correct group in header (@Group_Names array zero-based, thus start with '-1')
|
|
465 foreach my $genome (split(/\t/)) {
|
|
466 $column++;
|
|
467 next if ($genome =~ /^$/); # skip empty columns in group file, in case the groups have uneven genome numbers
|
|
468 die "\n### Fatal error:\nGenome '$genome' is not unique in the group file '$Groups_File'! However, the group file needs to contain all of the genomes from the header in the Proteinortho matrix '$PO_Matrix_File' and vice versa. Make sure the strings correspond exactly (also lower/uppercase) to the header of the matrix and the input files in directory '$Genome_Dir'!\n" if (check_genome_in_groups($genome)); # subroutine to check if a genome is present in %Genome_Groups
|
|
469 push (@{ $Genome_Groups{$Group_Names[$column]}->{'genomes'} }, $genome); # push into anonymous array in hash of hash
|
|
470 }
|
|
471 }
|
|
472 close $Groups_File_Fh;
|
|
473 print STDERR "with ", scalar map(@{ $Genome_Groups{$_}->{'genomes'} }, keys %Genome_Groups), " genomes ...\n"; # run status of script
|
|
474
|
|
475 foreach (@Group_Names) {
|
|
476 die "\n### Fatal error:\nGroup '$_' does not contain a genome, please add a suitable genome filename or remove the group from the group file '$Groups_File'!\n" if (!$Genome_Groups{$_});
|
|
477 }
|
|
478
|
|
479
|
|
480
|
|
481 ### Check $Strict_Core and set genome if not given
|
|
482 if ($Strict_Core) { # if a genome filename is given with option '-co'
|
|
483 die "\n### Fatal error:\nThe given genome '$Strict_Core' with option '-co' is not present in the genomes of the group file '$Groups_File'. The string has to be exactly the same, please check!\n" unless (check_genome_in_groups($Strict_Core)); # subroutine to check if a genome is present in %Genome_Groups
|
|
484 } elsif (defined $Strict_Core && !$Strict_Core) { # if option '-co' is set without argument
|
|
485 $Strict_Core = $Genome_Groups{$Group_Names[0]}->{'genomes'}[0];
|
|
486 print STDERR "Genome for option '-co' was set automatically to the first genome in the first group, '$Strict_Core'!\n";
|
|
487 }
|
|
488
|
|
489
|
|
490
|
|
491 ### Round inclusion and exclusion count cutoffs for each group
|
|
492 foreach (sort keys %Genome_Groups) {
|
|
493 $Genome_Groups{$_}->{'inclusion'} = int((scalar @{ $Genome_Groups{$_}->{'genomes'} } * $Inclusion_Cutoff) + 0.5); # round positive number half up to integer ((s)printf rounds half to even), see e.g. https://stackoverflow.com/questions/178539/how-do-you-round-a-floating-point-number-in-perl
|
|
494 print STDERR "Group '$_' with ", scalar @{ $Genome_Groups{$_}->{'genomes'} }, " genomes has a rounded inclusion cutoff of $Genome_Groups{$_}->{'inclusion'} genome(s) and ";
|
|
495
|
|
496 $Genome_Groups{$_}->{'exclusion'} = int((scalar @{ $Genome_Groups{$_}->{'genomes'} } * $Exclusion_Cutoff) + 0.5);
|
|
497 print STDERR "an exclusion cutoff of $Genome_Groups{$_}->{'exclusion'} genome(s)!\n";
|
|
498 die "\n### Fatal error:\nThe rounded inclusion cutoff has to be greater than the exclusion cutoff for all groups, please choose appropriate cutoffs!\n" if ($Genome_Groups{$_}->{'inclusion'} <= $Genome_Groups{$_}->{'exclusion'});
|
|
499 }
|
|
500
|
|
501
|
|
502
|
|
503 ### Create result folder
|
|
504 if (!$Result_Dir) { # can't give default before 'GetOptions' in case cutoffs are set by the user
|
|
505 $Result_Dir = "./results_i".$Inclusion_Cutoff."_e".$Exclusion_Cutoff;
|
|
506 } else {
|
|
507 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
|
|
508 }
|
|
509 if (-e $Result_Dir) {
|
|
510 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
|
|
511 } else {
|
|
512 mkdir $Result_Dir;
|
|
513 }
|
|
514
|
|
515
|
|
516
|
|
517 ### Parse OGs in input PO matrix
|
|
518 print STDERR "Parsing Proteinortho input matrix '$PO_Matrix_File' ...\n"; # run status of script
|
|
519 open (my $PO_Matrix_Fh, "<", "$PO_Matrix_File");
|
|
520
|
|
521 my @Genome_Files; # store genome filename column order
|
|
522 my %Ortho_Groups; # hash of hash with OG# as key, internal hash with genome filename as key and IDs (e.g. locus tags) in anonymous array
|
|
523 while (<$PO_Matrix_Fh>) {
|
|
524 chomp;
|
|
525
|
|
526 # check PO input file header and get genome file names
|
|
527 if ($. == 1) { # header of PO matrix file (first line)
|
|
528 die "\n### Fatal error:\nProteinortho input matrix '$PO_Matrix_File' does not have the mandatory header line, which starts with the first three tab-separated mandatory columns:\n# Species\tGenes\tAlg.-Conn.\n" if (!/^# Species\tGenes\tAlg\.-Conn\./);
|
|
529
|
|
530 # get input genomes from PO matrix and check if fit to %Genome_Groups from $Groups_File
|
|
531 foreach my $genome (split(/\t/)) {
|
|
532 next if ($genome =~ /# Species|Genes|Alg\.-Conn\./); # non-genome header columns
|
|
533 die "\n### Fatal error:\nGenome '$genome' present in the header of the Proteinortho input matrix '$PO_Matrix_File' but not in the groups file '$Groups_File'. However, the group file needs to contain all of the genomes from the matrix header and vice versa. Make sure the strings correspond exactly (also lower/uppercase) to the matrix header and the input files in directory '$Genome_Dir'!\n" unless (check_genome_in_groups($genome)); # subroutine to check if a genome is present in %Genome_Groups
|
|
534 push (@Genome_Files, $genome);
|
|
535 }
|
|
536 die "\n### Fatal error:\nNot the same amount of genomes in the group file '$Groups_File' and the Proteinortho input matrix header '$PO_Matrix_File', but the group file needs to contain all of the genomes from the header in the Proteinortho matrix and vice versa!\n" if (@Genome_Files != map(@{ $Genome_Groups{$_}->{'genomes'} }, keys %Genome_Groups));
|
|
537 next; # skip header line of PO matrix for subsequent code
|
|
538 }
|
|
539
|
|
540 # parse PO ortholog matrix
|
|
541 my ($organisms, $genes, $alg_conn, @og) = split(/\t/); # $alg_conn not used
|
|
542 my $organism_count = 0; # count number of organisms in OG to compare to '$organisms' (should be ok from PO)
|
|
543 my $gene_count = 0; # count number of genes in OG to compare to '$genes' (should be ok from PO)
|
|
544 my $column = -1; # column counter to associate column to correct genome file in OG (@Genome_Files array zero-based, thus start with '-1')
|
|
545 foreach (@og) {
|
|
546 $column++;
|
|
547 die "\n###Fatal error:\nThere's an empty field in the Proteinortho input matrix '$PO_Matrix_File' in row '$.', column '$column+4'. This shouldn't happen, as all the fields of the matrix should either have a FASTA ID from the input genome files or a '*' in case the respective genome doesn't have an ortholog in this orthologous group!\n" if (/^$/);
|
|
548 next if (/^\*$/); # skip columns without a FASTA ID (ortholog) in PO matrix (indicated by '*'), have to skip after '$column++' otherwise column won't fit
|
|
549 $organism_count++;
|
|
550 my @paralogs = split(',');
|
|
551 foreach (sort {$a cmp $b} @paralogs) {
|
|
552 $gene_count++;
|
|
553 push(@{ $Ortho_Groups{$.-1}->{$Genome_Files[$column]} }, $_); # push into anonymous array in hash of hash ($.-1 = OG number, because of header)
|
|
554 }
|
|
555 }
|
|
556 die "\n### Fatal error:\nThe indicated number of species ($organisms) does not fit to the counted number of organisms ($organism_count) in the orthologous group of line '$.' of the Proteinortho input matrix '$PO_Matrix_File'!\n" if ($organism_count != $organisms); # to check PO matrix (should be ok from PO)
|
|
557 die "\n### Fatal error:\nThe indicated number of genes ($genes) does not fit to the counted number of orthologs/paralogs ($gene_count) in the orthologous group of line '$.' of the Proteinortho input matrix '$PO_Matrix_File'!\n" if ($gene_count != $genes); # to check PO matrix (should be ok from PO)
|
|
558 }
|
|
559 close $PO_Matrix_Fh;
|
|
560
|
|
561
|
|
562
|
|
563 ### Parse annotations in genome multi-FASTAs
|
|
564 print STDERR "Parsing annotation from multi-FASTA CDS genome files in directory '$Genome_Dir' ...\n";
|
|
565 my %Annotation; # hash of hash to store the annotation of the genome files
|
|
566 foreach my $genome (@Genome_Files) { # @Genome_Files filled from PO matrix
|
|
567 my $genome_file_path = "$Genome_Dir/$genome";
|
|
568 check_file_exist($genome_file_path); # subroutine
|
|
569 open (my $genome_fh, "<", "$genome_file_path");
|
|
570
|
|
571 # parse anno in current genome file
|
|
572 while (my $anno_line = <$genome_fh>) {
|
|
573 chomp $anno_line;
|
|
574 die "\n### Fatal error:\n'$genome_file_path' is not a FASTA input file. First line of the file should be a FASTA ID/header line and start with a '>':\n$anno_line\n" if ($anno_line !~ /^>/ && $. == 1);
|
|
575 next if ($anno_line !~ /^>/); # skip non-ID FASTA lines
|
|
576
|
|
577 my ($id, $gene, $product, $length, $organism, $ec); # $length not used
|
|
578 foreach (split(/\s/, $anno_line)) {
|
|
579 if (/^>(.+)$/) {
|
|
580 $id = $1;
|
|
581 } elsif (/g=(.*)$/) {
|
|
582 $gene = $1;
|
|
583 } elsif (/p=(.*)$/) {
|
|
584 warn "\n### Warning:\nNo product annotation (p=) in file '$genome_file_path' on line:\n$anno_line\nProceeding ...\n" if (!$1);
|
|
585 $product = $1;
|
|
586 $product =~ s/\_/ /g;
|
|
587 } elsif (/l=(.*)$/) { # $length not used
|
|
588 my ($start, $stop) = split(/\.\./, $1);
|
|
589 $length = abs($stop - $start) + 1;
|
|
590 } elsif (/o=(.*)$/) {
|
|
591 $organism = $1;
|
|
592 $organism =~ s/\_/ /g;
|
|
593 } elsif (/ec=(.*)$/) {
|
|
594 $ec = $1;
|
|
595 } else {
|
|
596 die "\n### Fatal error:\nAnnotation in file '$genome_file_path' is not recognized on the following line. Please use 'cds_extractor.pl' to create your multi-FASTA protein genome files.\n$anno_line\n";
|
|
597 }
|
|
598 }
|
|
599 die "\n### Fatal error:\nThe following FASTA ID of file '$genome_file_path' is not unique but has to be considering ALL input genome files (actually Proteinortho should have complained already). Please modify all repetitive occurences.\n$id\n" if ($Annotation{$id});
|
|
600 #die "\n### Fatal error:\nThe FASTA ID '$id' is present in the multi-FASTA genome file '$genome_file_path' but not for this genome in the Proteinortho matrix '$PO_Matrix_File'. However, all IDs from the genome files in directory '$Genome_Dir' need to be present in the matrix and vice versa. Please run Proteinortho5 with option '-singles' to include also genes without orthologs, so-called singletons/ORFans (recommended is also option '-selfblast' to enhance paralog detection).\n" unless (check_po_id($id, $genome)); # subroutine to check if ID is present in %Ortho_Groups (from PO matrix); the reversed check (an ID that's present in PO matrix but not in the corresponding genome file) is in sub 'print_og_results' # this code was 1.) way too slow and 2.) somehow broke '$Query_Specific_OGs' and '$Query_Singletons' during implementation in 'po2anno.pl', no idea why ... (thus handle with care here)
|
|
601
|
|
602 # fill annotation hash of hash
|
|
603 $Annotation{$id} = {'genome' => $genome,
|
|
604 'gene' => $gene,
|
|
605 'product' => $product,
|
|
606 'organism' => $organism,
|
|
607 'ec' => $ec};
|
|
608 }
|
|
609 close $genome_fh;
|
|
610 }
|
|
611
|
|
612
|
|
613
|
|
614 ### Check if CDS counts in multi-FASTA genome files and PO matrix are equal
|
|
615 print STDERR "Comparing annotation CDS counts to orthologous group CDS counts (test if everything is present) ...\n";
|
|
616 foreach my $genome (@Genome_Files) {
|
|
617 my $ortho_cds_count = map ($Ortho_Groups{$_}->{$genome} ? @{ $Ortho_Groups{$_}->{$genome} } : (), keys %Ortho_Groups); # de-reference anonymous array in hash of hash
|
|
618 # map evaluates BLOCK or EXPR in list context and returns the LIST value composed of the results of each such evaluation (each element of LIST may produce zero, one, or more elements in the returned value <=> grep returns only one value). In scalar context, returns the total number of elements so generated.
|
|
619 # 'grep($Ortho_Groups{$_}->{$category} , keys %Ortho_Groups)' should also work (see result matrix)?
|
|
620
|
|
621 # the map line above is a fancy way of writing
|
|
622 #foreach (keys %Ortho_Groups) {
|
|
623 #$ortho_cds_count += @{ $Ortho_Groups{$_}->{$genome} } if ($Ortho_Groups{$_}->{$genome});
|
|
624 #}
|
|
625
|
|
626 my $anno_cds_count = grep ($Annotation{$_}->{'genome'} eq $genome, keys %Annotation);
|
|
627 die "\n### Fatal error:\nThere are more CDSs in file '$genome' than for this genome in the Proteinortho matrix '$PO_Matrix_File', but counts have to be equal. Please run Proteinortho5 with option '-singles' to include also genes without orthologs, so-called singletons/ORFans (recommended is also option '-selfblast' to enhance paralog detection).\n" if ($ortho_cds_count < $anno_cds_count); # overlaps with discarded (commented) sub 'check_po_id', but much more efficient (albeit not ID-specific)
|
|
628 die "\n### Fatal error:\nThere are less CDSs in file '$genome' than for this genome in the Proteinortho matrix '$PO_Matrix_File', but counts have to be equal. Please check if the files are correct.\n" if ($ortho_cds_count > $anno_cds_count); # overlaps with ID-specific check in sub 'print_og_results', but here checks all IDs, the sub checks only those used for result file annotations
|
|
629 }
|
|
630
|
|
631
|
|
632
|
|
633 ### Calculate stats from PO matrix according to the group cutoffs, categorize the results and print to result files
|
|
634 print STDERR "Calculating stats from the Proteinortho matrix according to the group cutoffs ...\n"; # run status of script
|
|
635
|
|
636 # open result file for binary matrix and print header
|
|
637 my $Binary_Matrix_File; # filename needed also for venn diagram R script below
|
|
638 my $Binary_Matrix_Fh;
|
|
639 if ($Opt_Binary) {
|
|
640 $Binary_Matrix_File = "$Result_Dir/binary_matrix.tsv";
|
|
641 open ($Binary_Matrix_Fh, ">", "$Binary_Matrix_File");
|
|
642 print $Binary_Matrix_Fh join("\t", @Group_Names), "\n"; # print header for binary matrix; print array-elements tab-separated and after the last element a newline
|
|
643 }
|
|
644
|
|
645 my %Count_Group_OG; # hash of hash to store OG/CDS count and output filehandle for each category ('core', 'group-specific' etc.)
|
|
646 foreach my $og (sort { $a <=> $b } keys %Ortho_Groups) {
|
|
647 my %present_og_genomes; # hash of array to store genomes of a group PRESENT in the current OG for cutoff tests and print outs
|
|
648
|
|
649 # 'strict_core' category OG if present in ALL genomes (with option '-co')
|
|
650 if ($Strict_Core && keys %{$Ortho_Groups{$og}} == @Genome_Files) {
|
|
651 # always count OGs and CDSs for each category for final stats and plausability checks
|
|
652 $Count_Group_OG{'strict_core'}->{'OG_count'}++;
|
|
653 count_category_CDS(\@Genome_Files, 'strict_core', $og); # subroutine to count all CDS (also paralogs) in the current OG for ALL genomes in the indicated category (strict core category includes all genomes, thus @Genome_Files)
|
|
654 open_result_file("$Result_Dir/strict_core_OGs.tsv", 'strict_core') if (!$Count_Group_OG{'strict_core'}->{'fh'}); # subroutine to open result file and store fh (if not opened yet)
|
|
655 print_og_results($og, $Strict_Core, $Count_Group_OG{'strict_core'}->{'fh'}); # subroutine to print representative annotation for specified genome to result file (here genome is '$Strict_Core')
|
|
656 }
|
|
657
|
|
658 # count and save number of genomes in the current $og with their corresponding group affiliation
|
|
659 foreach my $group (@Group_Names) { # keep initial group order in groups file (for representative annotation)
|
|
660 foreach my $genome (@{ $Genome_Groups{$group}->{'genomes'} }) { # keep initial genome order for each group in groups file (for representative annotation)
|
|
661 if ($Ortho_Groups{$og}->{$genome}) { # $genome present in the current $og
|
|
662 push(@{ $present_og_genomes{$group} }, $genome); # push into anonymous array in hash; keeps initial genome order in groups
|
|
663
|
|
664 # 'singleton/ORFan' category OG if only present in ONE genome (with option '-s')
|
|
665 if ($Opt_Singletons && keys %{$Ortho_Groups{$og}} == 1) {
|
|
666 $Count_Group_OG{$genome}->{'OG_count'}++; # here: category = $genome
|
|
667 $Count_Group_OG{$genome}->{'CDS_count'} += @{ $Ortho_Groups{$og}->{$genome} }; # don't need sub 'count_category_CDS' because only one genome
|
|
668 if (!$Count_Group_OG{$genome}->{'fh'}) {
|
|
669 my $file_no_extension = $genome =~ s/\.\w+$//r; # remove file extensions from @Genome_Files to have nicer output files; non-destructive modifier causes the result of the substitution to be returned instead of modifying $_ (see http://perldoc.perl.org/perlrequick.html#Search-and-replace)
|
|
670 open_result_file("$Result_Dir/$file_no_extension\_singletons.tsv", $genome); # subroutine
|
|
671 }
|
|
672 print_og_results($og, $genome, $Count_Group_OG{$genome}->{'fh'}); # subroutine
|
|
673 }
|
|
674 }
|
|
675 }
|
|
676 }
|
|
677
|
|
678 # filter genome counts of the groups in this OG according to inclusion-/exclusion-cutoffs
|
|
679 my @OG_included_groups; # array for groups with number of genomes in this OG >= the inclusion cutoff
|
|
680 my @OG_excluded_groups; # array for groups with genomes <= the exclusion cutoff
|
|
681 my @OG_unspec_groups; # array for groups with 'exclusion < genome_count < inclusion'
|
|
682 foreach my $group (@Group_Names) {
|
|
683 if ($present_og_genomes{$group}) { # only run if group has a genome in this OG (otherwise undefined ARRAY reference)
|
|
684 if (@{ $present_og_genomes{$group} } >= $Genome_Groups{$group}->{'inclusion'}) {
|
|
685 push(@OG_included_groups, $group);
|
|
686 } elsif (@{ $present_og_genomes{$group} } <= $Genome_Groups{$group}->{'exclusion'}) {
|
|
687 push(@OG_excluded_groups, $group);
|
|
688 } elsif (@{ $present_og_genomes{$group} } > $Genome_Groups{$group}->{'exclusion'} && @{ $present_og_genomes{$group} } < $Genome_Groups{$group}->{'inclusion'}) {
|
|
689 push(@OG_unspec_groups, $group);
|
|
690 }
|
|
691 } else { # $present_og_genomes{$group} == 0 for group (undef anonymous array), i.e. always <= exclusion
|
|
692 push(@OG_excluded_groups, $group);
|
|
693 }
|
|
694 }
|
|
695
|
|
696 # 'unspecific' category OG if at least ONE group present in '@OG_unspec_groups'
|
|
697 # no further categorization, i.e. skip rest of logics
|
|
698 if (@OG_unspec_groups) {
|
|
699 $Count_Group_OG{'unspecific'}->{'OG_count'}++;
|
|
700 foreach my $group (@OG_unspec_groups) {
|
|
701 count_category_CDS(\@{ $present_og_genomes{$group} }, 'unspecific', $og); # subroutine to count all CDS in the current OG for ALL genomes in the included groups (here unspec groups); %present_og_genomes contains only genomes present in this OG (see above 'push')
|
|
702 }
|
|
703 if ($Opt_Unspecific) { # print only to output file with option '-u'
|
|
704 open_result_file("$Result_Dir/unspecific_OGs.tsv", 'unspecific') if (!$Count_Group_OG{'unspecific'}->{'fh'}); # subroutine
|
|
705 print_og_results($og, @{ $present_og_genomes{$OG_unspec_groups[0]} }[0], $Count_Group_OG{'unspecific'}->{'fh'}); # subroutine; use first genome in first group which is present in current OG
|
|
706 }
|
|
707 next; # skip to next $og, because now unspec and not suitable for any further categories
|
|
708 }
|
|
709
|
|
710 # print binary matrix (in contrast to OGs in category 'unspecific' above, 'underrepresented' OGs below not skipped because package 'venn' can handle a row with only zeros)
|
|
711 if ($Opt_Binary) {
|
|
712 my $i = 0;
|
|
713 foreach my $group (@Group_Names) {
|
|
714 $i++;
|
|
715 print $Binary_Matrix_Fh "1" if (grep {/$group/} @OG_included_groups);
|
|
716 print $Binary_Matrix_Fh "0" if (grep {/$group/} @OG_excluded_groups);
|
|
717 print $Binary_Matrix_Fh "\t" if ($i < @Group_Names);
|
|
718 }
|
|
719 print $Binary_Matrix_Fh "\n";
|
|
720 }
|
|
721
|
|
722 # 'cutoff_core' category OG if ALL groups >= inclusion (includes 'strict_core' OGs where ALL genomes are present in the OG)
|
|
723 # 2 groups A/B [2 inclus :0 exclus], 3 groups A/B/C [3:0], 4 groups A/B/C/D [4:0]
|
|
724 if (@OG_included_groups == @Group_Names && @OG_excluded_groups == 0) {
|
|
725 $Count_Group_OG{'cutoff_core'}->{'OG_count'}++;
|
|
726 foreach my $group (@OG_included_groups) {
|
|
727 count_category_CDS(\@{ $present_og_genomes{$group} }, 'cutoff_core', $og); # subroutine
|
|
728 }
|
|
729 open_result_file("$Result_Dir/cutoff_core_OGs.tsv", 'cutoff_core') if (!$Count_Group_OG{'cutoff_core'}->{'fh'}); # subroutine
|
|
730 print_og_results($og, @{ $present_og_genomes{$OG_included_groups[0]} }[0], $Count_Group_OG{'cutoff_core'}->{'fh'}); # subroutine; again use first genome in first group which is present in current OG
|
|
731
|
|
732 # 'underrepresented' category OG if ALL groups <= exclusion_cutoff (depending on the cutoffs and number of genomes in the groups will include most singletons)
|
|
733 # 2 groups [0:2], 3 [0:3], 4 [0:4]
|
|
734 } elsif (@OG_included_groups == 0 && @OG_excluded_groups == @Group_Names) {
|
|
735 $Count_Group_OG{'underrepresented'}->{'OG_count'}++;
|
|
736 open_result_file("$Result_Dir/underrepresented_OGs.tsv", 'underrepresented') if (!$Count_Group_OG{'underrepresented'}->{'fh'}); # subroutine
|
|
737
|
|
738 my $genome_found; # indicates if a genome with annotation found
|
|
739 foreach my $group (@OG_excluded_groups) { # have to go through all groups in '@OG_excluded_groups' in case groups have no genomes ('$present_og_genomes{$group} == 0' above) in this OG; at least one group will have a genome (otherwise wouldn't be an OG in the PO matrix)
|
|
740 if ($present_og_genomes{$group}) { # group with genome in this OG
|
|
741 count_category_CDS(\@{ $present_og_genomes{$group} }, 'underrepresented', $og); # subroutine
|
|
742 next if ($genome_found); # print annotation to output only for first genome found
|
|
743 $genome_found = print_og_results($og, @{ $present_og_genomes{$group} }[0], $Count_Group_OG{'underrepresented'}->{'fh'}); # subroutine returns 1
|
|
744 }
|
|
745 }
|
|
746
|
|
747 # 'group-specific' category OG if only ONE group >= inclusion_cutoff
|
|
748 # 2 groups [1:1], 3 [1:2], 4 [1:3]
|
|
749 } elsif (@OG_included_groups == 1 && @OG_excluded_groups == @Group_Names - 1) {
|
|
750 $Count_Group_OG{"$OG_included_groups[0]_specific"}->{'OG_count'}++; # only one element in array thus [0]
|
|
751 foreach my $group (@OG_included_groups) {
|
|
752 count_category_CDS(\@{ $present_og_genomes{$group} }, "$OG_included_groups[0]_specific", $og); # subroutine
|
|
753 }
|
|
754 open_result_file("$Result_Dir/$OG_included_groups[0]_specific_OGs.tsv", "$OG_included_groups[0]_specific") if (!$Count_Group_OG{"$OG_included_groups[0]_specific"}->{'fh'}); # subroutine
|
|
755 print_og_results($og, @{ $present_og_genomes{$OG_included_groups[0]} }[0], $Count_Group_OG{"$OG_included_groups[0]_specific"}->{'fh'}); # subroutine
|
|
756
|
|
757 # 'group-absent' category OG if ONE group <= exclusion_cutoff (only for 3 and 4 groups)
|
|
758 # 3 groups (A+B not C [2:1], A+C not B, and B+C not A) and 4 groups (A+B+C not D [3:1], A+B+D not C, A+C+D not B, B+C+D not A)
|
|
759 } elsif (@Group_Names >= 3 && (@OG_included_groups == @Group_Names - 1 && @OG_excluded_groups == 1)) {
|
|
760 $Count_Group_OG{"$OG_excluded_groups[0]_absent"}->{'OG_count'}++;
|
|
761 foreach my $group (@OG_included_groups) {
|
|
762 count_category_CDS(\@{ $present_og_genomes{$group} }, "$OG_excluded_groups[0]_absent", $og); # subroutine
|
|
763 }
|
|
764 open_result_file("$Result_Dir/$OG_excluded_groups[0]_absent_OGs.tsv", "$OG_excluded_groups[0]_absent") if (!$Count_Group_OG{"$OG_excluded_groups[0]_absent"}->{'fh'}); # subroutine
|
|
765 print_og_results($og, @{ $present_og_genomes{$OG_included_groups[0]} }[0], $Count_Group_OG{"$OG_excluded_groups[0]_absent"}->{'fh'}); # subroutine
|
|
766
|
|
767 # 'two group-specific' category OG if TWO groups >= inclusion_cutoff (only for 4 groups)
|
|
768 # 4 groups (B+D not A or C [2:2], B+C not A or D, A+D not B or C, A+C not B or D, A+B not C or D, C+D not A or B)
|
|
769 } elsif (@Group_Names == 4 && (@OG_included_groups == @OG_excluded_groups)) {
|
|
770 $Count_Group_OG{"$OG_included_groups[0]-$OG_included_groups[1]_specific"}->{'OG_count'}++;
|
|
771 foreach my $group (@OG_included_groups) {
|
|
772 count_category_CDS(\@{ $present_og_genomes{$group} }, "$OG_included_groups[0]-$OG_included_groups[1]_specific", $og); # subroutine
|
|
773 }
|
|
774 open_result_file("$Result_Dir/$OG_included_groups[0]-$OG_included_groups[1]_specific_OGs.tsv", "$OG_included_groups[0]-$OG_included_groups[1]_specific") if (!$Count_Group_OG{"$OG_included_groups[0]-$OG_included_groups[1]_specific"}->{'fh'}); # subroutine
|
|
775 print_og_results($og, @{ $present_og_genomes{$OG_included_groups[0]} }[0], $Count_Group_OG{"$OG_included_groups[0]-$OG_included_groups[1]_specific"}->{'fh'}); # subroutine
|
|
776
|
|
777 } else {
|
|
778 die "\n### Fatal error:\nNo category logic test fits for OG '$og', which shouldn't happen. Have you smuggled only one or five groups into the groups file '$Groups_File' (which also shouldn't be possible)? The script can handle only two, three, or four groups please edit the input!\n"; # shouldn't happen, since regex in group file parse checks for minimally two and maximally four tab-separated group names and all category logics should be covered
|
|
779 }
|
|
780 }
|
|
781
|
|
782
|
|
783
|
|
784 ### Close all output files
|
|
785 close $Binary_Matrix_Fh if ($Opt_Binary);
|
|
786 map { close $Count_Group_OG{$_}->{'fh'} } keys %Count_Group_OG;
|
|
787
|
|
788
|
|
789
|
|
790 ### Plot the venn diagram with R package 'gplots' function 'venn'
|
|
791 if ($Opt_Venn) {
|
|
792 my $tmp_r_script = "$Result_Dir/tmp_venn.R"; # filename of the R script
|
|
793 my $venn_diagram_name = "$Result_Dir/venn_diagram.pdf"; # filename of the output PDF histogram
|
|
794 print STDERR "Drawing venn diagram (option '-p') '$venn_diagram_name' ...\n"; # run status of script
|
|
795 open (my $r_fh, ">", "$tmp_r_script");
|
|
796
|
|
797 select $r_fh; # select fh for standard print/f output
|
|
798 print "#!/usr/bin/Rscript --vanilla --slave\n"; # header of R script
|
|
799 print "suppressMessages(library(gplots))\n"; # load library 'gplots' for function 'venn' and suppress STDOUT message from loading
|
|
800 print "binary_matrix <- read.table(\"$Binary_Matrix_File\", header=TRUE, sep=\"\\t\")\n";
|
|
801 print "pdf(\"$venn_diagram_name\")\n";
|
|
802 print "venn(binary_matrix)\n"; # create the venn diagram
|
|
803 print "out <- dev.off()\n"; # write histogram to PDF and suppress STDOUT output by diverting it
|
|
804 select STDOUT; # change standard print/f output back to STDOUT
|
|
805 close $r_fh;
|
|
806
|
|
807 # execute tmp R script with 'Rscript'
|
|
808 system("Rscript $tmp_r_script") == 0 or die "### Fatal error:\nPlotting the venn diagram didn't work. Either statistical programming language 'R' or the required R package 'gplots' is not installed, not in global \$PATH, or something wrong with the binary matrix '$Binary_Matrix_File' or the temporary R script '$tmp_r_script', which runs the drawing of the venn diagram. Function 'venn' of the 'gplots' package is required and needs to be installed!\n";
|
|
809 unlink $tmp_r_script;
|
|
810 }
|
|
811
|
|
812
|
|
813
|
|
814 ### Count overall category OGs and CDS, and plausibility check for above logics
|
|
815 my ($Total_Category_CDS, $Total_Category_OGs) = (0, 0);
|
|
816 foreach my $category (keys %Count_Group_OG) { # can't use a map in case of strict_core, singletons (categories which overlap non-optional categories)
|
|
817 next if ($category =~ /strict_core/ || grep {$category eq $_} @Genome_Files); # skip optional 'strict_core' (option '-co') or singleton ('-s') categories
|
|
818
|
|
819 $Total_Category_OGs += $Count_Group_OG{$category}->{'OG_count'};
|
|
820 $Total_Category_CDS += $Count_Group_OG{$category}->{'CDS_count'};
|
|
821 }
|
|
822 die "\n### Fatal error:\nResulting categories don't include all present OGs, this shouldn't happen!\n" if (keys %Ortho_Groups != $Total_Category_OGs); # is this really always the case?
|
|
823
|
|
824
|
|
825
|
|
826 ### Print final output stats matrix
|
|
827 print STDERR "Finished printing output files to result directory '$Result_Dir', printing final stats matrix to STDOUT ...\n"; # run status of script
|
|
828 print "# category\tOG_count\tCDS_count (all included genomes)\n"; # header for matrix
|
|
829 print "pangenome\t", scalar keys %Ortho_Groups, "\t", scalar keys %Annotation, "\n";
|
|
830 print "included_in_categories\t$Total_Category_OGs\t$Total_Category_CDS\n";
|
|
831 foreach my $category (sort {lc($a) cmp lc($b)} keys %Count_Group_OG) {
|
|
832 print "$category";
|
|
833 if (grep {$category eq $_} @Genome_Files) { # print overall stats for genomes with singletons (option '-s')
|
|
834 print " genome\t", scalar grep($Ortho_Groups{$_}->{$category} , keys %Ortho_Groups), "\t", scalar grep($Annotation{$_}->{'genome'} eq $category, keys %Annotation), "\n"; # why does it work without the fancy map during CDS count check?
|
|
835 print "$category singletons"; # include category "key word" singleton for category stats below
|
|
836 }
|
|
837 print "\t$Count_Group_OG{$category}->{'OG_count'}\t$Count_Group_OG{$category}->{'CDS_count'}\n";
|
|
838 }
|
|
839
|
|
840 # print out overall stats for all genomes even without singletons (option '-a')
|
|
841 if ($Opt_Genomes_Overall) {
|
|
842 foreach my $genome (sort {lc($a) cmp lc($b)} @Genome_Files) {
|
|
843 next if (grep {$genome eq $_} keys %Count_Group_OG); # skip genomes with singletons (option '-s'), already printed above
|
|
844 print "$genome genome\t", scalar grep($Ortho_Groups{$_}->{$genome} , keys %Ortho_Groups), "\t", scalar grep($Annotation{$_}->{'genome'} eq $genome, keys %Annotation), "\n";
|
|
845 }
|
|
846 }
|
|
847
|
|
848 exit;
|
|
849
|
|
850
|
|
851 #############
|
|
852 #Subroutines#
|
|
853 #############
|
|
854
|
|
855 ### Subroutine to test for multi-FASTA genome file existence
|
|
856 sub check_file_exist {
|
|
857 my $file = shift;
|
|
858 die "\n### Fatal error:\nGenome file '$file' is listed in the Proteinortho matrix, but does not exist in directory '$Genome_Dir': $!\n" if (!-e $file);
|
|
859 return 1;
|
|
860 }
|
|
861
|
|
862
|
|
863
|
|
864 ### Subroutine to check if a string adheres to file system conformity, esp. for filenames
|
|
865 sub check_file_system_conformity {
|
|
866 my ($string, $type) = @_;
|
|
867 die "\n\n### Fatal error:\nOnly alphanumeric '0-9a-zA-Z', underscore '_', dash '-', and period '.' characters allowed for $type to avoid operating/file system problems. Thus, please adapt '$string' in the group file '$Groups_File' accordingly!\n" if ($string !~ /^[\w_\-\.]+$/);
|
|
868 return 1;
|
|
869 }
|
|
870
|
|
871
|
|
872
|
|
873 ### Subroutine to check if a genome filename is present in hash %Genome_Groups (hash filled from the $Groups_File)
|
|
874 sub check_genome_in_groups {
|
|
875 my $genome = shift;
|
|
876 foreach my $group (keys %Genome_Groups) {
|
|
877 return 1 if (grep {$_ eq $genome} @{ $Genome_Groups{$group}->{'genomes'} });
|
|
878 }
|
|
879 return; # return 'undef'
|
|
880 }
|
|
881
|
|
882
|
|
883
|
|
884 ### Subroutine to check if an ID is present in hash %Ortho_Group (hash filled from the PO matrix)
|
|
885 # this code was 1.) way too slow and 2.) somehow broke '$Query_Specific_OGs' and '$Query_Singletons' during implementation in 'po2anno.pl', no idea why ... (thus handle with care here)
|
|
886 #sub check_po_id {
|
|
887 #my ($id, $genome) = @_;
|
|
888 #foreach my $og (keys %Ortho_Groups) {
|
|
889 #return 1 if (grep {$_ eq $id} @{ $Ortho_Groups{$og}->{$genome} });
|
|
890 #}
|
|
891 #return;
|
|
892 #}
|
|
893
|
|
894
|
|
895
|
|
896 ### Subroutine to count and sum all CDS for ALL genomes in a certain category for a specific OG
|
|
897 sub count_category_CDS {
|
|
898 my ($genomes_array_ref, $category, $og) = @_;
|
|
899 foreach my $genome (@{ $genomes_array_ref }) { # de-reference array
|
|
900 $Count_Group_OG{$category}->{'CDS_count'} += @{ $Ortho_Groups{$og}->{$genome} };
|
|
901 }
|
|
902 return 1;
|
|
903 }
|
|
904
|
|
905
|
|
906
|
|
907 ### Subroutine to empty a directory with user interaction
|
|
908 sub empty_dir {
|
|
909 my $dir = shift;
|
|
910 print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
|
|
911 my $user_ask = <STDIN>;
|
|
912 if ($user_ask =~ /y/i) {
|
|
913 unlink glob "$dir/*"; # remove all files in results directory
|
|
914 } else {
|
|
915 die "\nScript abborted!\n";
|
|
916 }
|
|
917 return 1;
|
|
918 }
|
|
919
|
|
920
|
|
921
|
|
922 ### Open a result file and print the header
|
|
923 sub open_result_file {
|
|
924 my ($filename, $category) = @_;
|
|
925 open ($Count_Group_OG{$category}->{'fh'}, ">", "$filename"); # store filehandle in hash of hash
|
|
926 print { $Count_Group_OG{$category}->{'fh'} } "# OG#\tID\tgene\tEC_number\tproduct\torganism\n"; # block around fh needed if not a simple string variable (or glob)
|
|
927 return 1;
|
|
928 }
|
|
929
|
|
930
|
|
931
|
|
932 ### Print annotation of the OGs to the result files
|
|
933 sub print_og_results {
|
|
934 my ($og, $genome, $fh) = @_;
|
|
935
|
|
936 print $fh "$og"; # print OG number only for the first paralog
|
|
937 foreach my $id (@{ $Ortho_Groups{$og}->{$genome} }) { # de-reference anonymous array for putative paralogs
|
|
938 die "\n### Fatal error:\nID '$id' present in the Proteinortho matrix '$PO_Matrix_File' in OG '$og' but not present in the corresponding genome '$genome' multi-FASTA file. However, all IDs in the matrix and the genome files in directory '$Genome_Dir' need to be correspondent to each other. Make sure you chose the correct directory with the input genome files for the current Proteinortho matrix!\n" unless ($Annotation{$id}); # overlaps with CDS count check in multi-FASTA and PO matrix files, but here only IDs checked used for print out # the reverse check (an ID that's present in a genome file but not in the PO matrix) WAS during the parsing of the annotation from the multi-FASTA files and subroutine 'check_po_id($id)' (see sub why not)
|
|
939
|
|
940 print $fh "\t$id\t";
|
|
941 if ($Annotation{$id}->{'gene'}) {
|
|
942 print $fh $Annotation{$id}->{'gene'}, "\t";
|
|
943 } else {
|
|
944 print $fh "\t";
|
|
945 }
|
|
946 if ($Annotation{$id}->{'ec'}) {
|
|
947 print $fh $Annotation{$id}->{'ec'}, "\t";
|
|
948 } else {
|
|
949 print $fh "\t";
|
|
950 }
|
|
951 if ($Annotation{$id}->{'product'}) {
|
|
952 print $fh $Annotation{$id}->{'product'}, "\t";
|
|
953 } else { # very unlikely that a CDS has no product annotation
|
|
954 print $fh "\t";
|
|
955 }
|
|
956 if ($Annotation{$id}->{'organism'}) {
|
|
957 print $fh $Annotation{$id}->{'organism'}, "\n";
|
|
958 } else {
|
|
959 print $fh "\n";
|
|
960 }
|
|
961 }
|
|
962 return 1;
|
|
963 }
|