3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<binary_group_stats.pl> - categorize binary matrix rows according to
|
|
12 column groups
|
|
13
|
|
14 =head1 SYNOPSIS
|
|
15
|
|
16 C<perl binary_group_stats.pl -i binary_matrix.tsv -g group_file.tsv
|
|
17 -p E<gt> overall_stats.tsv>
|
|
18
|
|
19 =head1 DESCRIPTION
|
|
20
|
|
21 Categorize rows of a delimited TEXT input B<binary> matrix (option
|
|
22 B<-i>) according to column group affiliations. All fields of the binary
|
|
23 matrix need to be filled with either a B<0> indicating absence or a
|
|
24 B<1> indicating presence, i.e. all rows need to have the same number of
|
|
25 columns. Use option B<-d> to set the delimiter of the input matrix,
|
|
26 default is set to tab-delimited/separated matrices.
|
|
27
|
|
28 The group affiliations of the columns are intended to get overall
|
|
29 presence/absence statistics for groups of columns and not simply
|
|
30 single columns of the matrix. Percentage inclusion (option
|
|
31 B<-cut_i>) and exclusion (option B<-cut_e>) cutoffs can be set to
|
|
32 define how strict the presence/absence of column groups within a row
|
|
33 are defined. Of course groups can also hold only single column
|
|
34 headers to get single column statistics. Group affiliations are
|
|
35 defined in a mandatory B<tab-delimited> group input file (option
|
|
36 B<-g>), including the column headers from the input binary matrix,
|
|
37 with B<minimal two> and B<maximal four> groups.
|
|
38
|
|
39 The script was designed to handle a presence/absence query protein
|
|
40 binary matrix from a C<prot_finder.pl> protein homolog search in
|
|
41 bacterial genomes with a subsequent C<prot_binary_matrix.pl> run to
|
|
42 create the matrix. But, any binary matrix can be used for
|
|
43 C<binary_group_stats.pl>, optionally beforehand transposed with
|
|
44 C<transpose_matrix.pl>. However, column headers in the first row and
|
|
45 row headers in the first column are B<mandatory> for the input
|
|
46 binary matrix. Only alphanumeric (a-z, A-Z, 0-9), underscore (_),
|
|
47 dash (-), and period (.) characters are allowed for the B<column
|
|
48 headers> and B<group names> in the group file (option B<-g>) to
|
|
49 avoid downstream problems with the operating/file system. As a
|
|
50 consequence, also no whitespaces are allowed in these! Additionally,
|
|
51 B<column headers>, B<row headers>, and B<group names> need to be
|
|
52 B<unique>.
|
|
53
|
|
54 <binary_group_stats.pl> is based upon
|
|
55 L<C<po2group_stats.pl>|/po2group_stats>, which does the same thing
|
|
56 for genomes in an ortholog/paralog output matrix from a
|
|
57 L<B<Proteinortho5>|http://www.bioinf.uni-leipzig.de/Software/proteinortho/>
|
|
58 calculation.
|
|
59
|
|
60 To explain the logic behind the categorization, the following
|
|
61 annotation for example groups will be used. A following '1'
|
|
62 exemplifies a group column presence count in a respective row E<gt>=
|
|
63 the rounded inclusion cutoff, a '0' a group column presence count
|
|
64 E<lt>= the rounded exclusion cutoff. The presence and absence of
|
|
65 rows for the column group affiliations are structured in different
|
|
66 categories depending on the number of groups. For B<two groups>
|
|
67 (e.g. A and B) there are five categories: 'A specific' (A:B = 1:0),
|
|
68 'B specific' (0:1), 'cutoff core' (1:1), 'underrepresented' (0:0),
|
|
69 and 'unspecific'. Unspecific rows have a column presence count for
|
|
70 at least B<one> group outside the cutoffs (exclusion cutoff E<lt>
|
|
71 column presence count E<lt> inclusion cutoff) and thus cannot be
|
|
72 categorized. The respective row headers of these 'unspecific' rows
|
|
73 will only be printed to a final result file with option B<-u>.
|
|
74 Overall stats for all categories are printed to C<STDOUT> in a final
|
|
75 tab-delimited output matrix.
|
|
76
|
|
77 B<Three groups> (A, B, and C) have the following nine categories: 'A
|
|
78 specific' (A:B:C = 1:0:0), 'B specific' (0:1:0), 'C specific'
|
|
79 (0:0:1), 'A absent' (0:1:1), 'B absent' (1:0:1), 'C absent' (1:1:0),
|
|
80 'cutoff core' (1:1:1), 'underrepresented' (0:0:0), and 'unspecific'.
|
|
81
|
|
82 B<Four groups> (A, B, C, and D) are classified in 17 categories: 'A
|
|
83 specific' (A:B:C:D = 1:0:0:0), 'B specific' (0:1:0:0), 'C specific'
|
|
84 (0:0:1:0), 'D specific' (0:0:0:1), 'A-B specific' (1:1:0:0), 'A-C
|
|
85 specific' (1:0:1:0), 'A-D specific' (1:0:0:1), 'B-C specific'
|
|
86 (0:1:1:0), 'B-D specific' (0:1:0:1), 'C-D specific' (0:0:1:1), 'A
|
|
87 absent' (0:1:1:1), 'B absent' (1:0:1:1), 'C absent' (1:1:0:1), 'D
|
|
88 absent' (1:1:1:0), 'cutoff core' (1:1:1:1), 'underrepresented'
|
|
89 (0:0:0:0), and 'unspecific'.
|
|
90
|
|
91 The resulting B<group> presence/absence (according to the cutoffs)
|
|
92 can also be printed to a group binary matrix (option B<-b>) in the
|
|
93 result directory (option B<-r>), excluding the 'unspecific'
|
|
94 category. Since the categories are the logics underlying venn
|
|
95 diagrams, you can also plot the results in a venn diagram using the
|
|
96 group binary matrix (option B<-p>). The 'underrepresented' category
|
|
97 is exempt from the venn diagram, because it is outside of venn
|
|
98 diagram logics.
|
|
99
|
|
100 For an illustration of the logics have a look at the example venn
|
|
101 diagrams in the L<C<po2group_stats.pl>|/po2group_stats> folder
|
|
102 F</po2group_stats/pics/venn_diagram_logics.[svg|png]>.
|
|
103
|
|
104 There are two optional categories (which are only considered for the
|
|
105 final print outs and in the final stats matrix, not for the group
|
|
106 binary matrix and the venn diagram): 'strict core' (option B<-co>)
|
|
107 for rows where B<all> columns have a presence '1'. Of course all the
|
|
108 'strict core' rows are also included in the 'cutoff core' category
|
|
109 ('strict core' is identical to 'cutoff core' with B<-cut_i> 1 and
|
|
110 B<-cut_e> 0). Option B<-s> activates the detection of
|
|
111 'singleton/column-specific' rows with a '1' in only B<one> column.
|
|
112 Depending on the cutoffs and number of columns in the groups,
|
|
113 category 'underrepresented' includes most of these singletons.
|
|
114
|
|
115 Each row header is printed in the respective category output file in
|
|
116 the result directory. The number of row headers in the category
|
|
117 result files are the same as listed in the venn diagram and the
|
|
118 final stats matrix. Groups with only a single column header will
|
|
119 have the same result as the respective 'singleton' category (with
|
|
120 option B<-s>).
|
|
121
|
|
122 =head1 OPTIONS
|
|
123
|
|
124 =head2 Mandatory options
|
|
125
|
|
126 =over 20
|
|
127
|
|
128 =item B<-i>=I<str>, B<-input>=I<str>
|
|
129
|
|
130 Input delimited TEXT binary matrix (e.g. *.tsv, *.csv, or *.txt),
|
|
131 see option B<-d>
|
|
132
|
|
133 =item B<-g>=I<str>, B<-groups_file>=I<str>
|
|
134
|
|
135 Tab-delimited file with group affiliation for the columns from the
|
|
136 input binary matrix with B<minimal two> and B<maximal four> groups
|
|
137 (easiest to create in a spreadsheet software and save in
|
|
138 tab-separated format). B<All> column headers from the input binary
|
|
139 matrix need to be included. Column headers and group names can only
|
|
140 include alphanumeric (a-z, A-Z, 0-9), underscore (_), dash (-), and
|
|
141 period (.) characters (no whitespaces allowed either). Example
|
|
142 format with two column headers in group A, three in group B and D,
|
|
143 and one in group C:
|
|
144
|
|
145 group_A\tgroup_B\tgroup_C\tgroup_D
|
|
146 column_header1\tcolumn_header9\tcolumn_header3\tcolumn_header8
|
|
147 column_header7\tcolumn_header6\t\tcolumn_header5
|
|
148 \tcolumn_header4\t\tcolumn_header2
|
|
149
|
|
150 =back
|
|
151
|
|
152 =head2 Optional options
|
|
153
|
|
154 =over 20
|
|
155
|
|
156 =item B<-h>, B<-help>
|
|
157
|
|
158 Help (perldoc POD)
|
|
159
|
|
160 =item B<-d>=I<str>, B<-delimiter>=I<str>
|
|
161
|
|
162 Set delimiter of input binary matrix (e.g. comma ',', single
|
|
163 space ' ' etc.) [default = tab-delimited/separated]
|
|
164
|
|
165 =item B<-r>=I<str>, B<-result_dir>=I<str>
|
|
166
|
|
167 Path to result folder [default = inclusion and exclusion percentage
|
|
168 cutoff, './results_i#_e#']
|
|
169
|
|
170 =item B<-cut_i>=I<float>, B<-cut_inclusion>=I<float>
|
|
171
|
|
172 Percentage inclusion cutoff for column presence counts in a group
|
|
173 per row, has to be E<gt> 0 and E<lt>= 1. Cutoff will be rounded
|
|
174 according to the column header number in each group and has to be
|
|
175 E<gt> the rounded exclusion cutoff in this group. [default = 0.9]
|
|
176
|
|
177 =item B<-cut_e>=I<float>, B<-cut_exclusion>=I<float>
|
|
178
|
|
179 Percentage exclusion cutoff, has to be E<gt>= 0 and E<lt> 1. Rounded
|
|
180 cutoff has to be E<lt> rounded inclusion cutoff. [default = 0.1]
|
|
181
|
|
182 =item B<-b>, B<-binary_group_matrix>
|
|
183
|
|
184 Print a group binary matrix with the presence/absence column group
|
|
185 results according to the cutoffs (excluding 'unspecific' category
|
|
186 rows)
|
|
187
|
|
188 =item B<-p>, B<-plot_venn>
|
|
189
|
|
190 Plot venn diagram from the group binary matrix (except 'unspecific'
|
|
191 and 'underrepresented' categories) with function C<venn> from B<R>
|
|
192 package B<gplots>, requires option B<-b>
|
|
193
|
|
194 =item B<-co>, B<-core_strict>
|
|
195
|
|
196 Include 'strict core' category in output for rows where B<all>
|
|
197 columns have a '1'
|
|
198
|
|
199 =item B<-s>, B<-singletons>
|
|
200
|
|
201 Include singleton/column-specific rows for each column header in the
|
|
202 output, activates also overall column header presence ('1') counts
|
|
203 in final stats matrix for columns with singletons
|
|
204
|
|
205 =item B<-u>, B<-unspecific>
|
|
206
|
|
207 Include 'unspecific' category in output
|
|
208
|
|
209 =item B<-a>, B<-all_column_presence_overall>
|
|
210
|
|
211 Report overall presence counts for all column headers (appended to
|
|
212 the final stats matrix), also those without singletons; will include
|
|
213 all overall column header presence counts without option B<-s>
|
|
214
|
|
215 =item B<-v>, B<-version>
|
|
216
|
|
217 Print version number to C<STDERR>
|
|
218
|
|
219 =back
|
|
220
|
|
221 =head1 OUTPUT
|
|
222
|
|
223 =over 20
|
|
224
|
|
225 =item C<STDOUT>
|
|
226
|
|
227 The tab-delimited final stats matrix is printed to C<STDOUT>.
|
|
228 Redirect or pipe into another tool as needed.
|
|
229
|
|
230 =item F<./results_i#_e#>
|
|
231
|
|
232 All output files are stored in a results folder
|
|
233
|
|
234 =item F<./results_i#_e#/[*_specific|*_absent|cutoff_core|underrepresented]_rows.txt>
|
|
235
|
|
236 Files including the row headers for rows in non-optional categories
|
|
237
|
|
238 =item (F<./results_i#_e#/[*_singletons|strict_core|unspecific]_rows.txt>)
|
|
239
|
|
240 Optional category output files with the respective row headers
|
|
241
|
|
242 =item (F<./results_i#_e#/binary_matrix.tsv>)
|
|
243
|
|
244 Tab-delimited binary matrix of group presence/absence results
|
|
245 according to cutoffs (excluding 'unspecific' rows)
|
|
246
|
|
247 =item (F<./results_i#_e#/venn_diagram.pdf>)
|
|
248
|
|
249 Venn diagram for non-optional categories (except 'unspecific' and
|
|
250 'underrepresented' categories)
|
|
251
|
|
252 =back
|
|
253
|
|
254 =head1 EXAMPLES
|
|
255
|
|
256 =over
|
|
257
|
|
258 =item C<perl binary_group_stats.pl -i binary_matrix_transposed.csv -g group_file.tsv -d , -r result_dir -cut_i 0.7 -cut_e 0.2 -b -p -co -s -u -a E<gt> overall_stats.tsv>
|
|
259
|
|
260 =back
|
|
261
|
|
262 =head1 DEPENDENCIES
|
|
263
|
|
264 =over
|
|
265
|
|
266 =item B<Statistical computing language L<R|http://www.r-project.org/>>
|
|
267
|
|
268 C<Rscript> is needed to plot the venn diagram with option B<-p>,
|
|
269 tested with version 3.2.2
|
|
270
|
|
271 =item B<L<gplots|https://cran.r-project.org/web/packages/gplots/index.html>>
|
|
272
|
|
273 Package needed for B<R> to plot the venn diagram with function
|
|
274 C<venn>. Tested with B<gplots> version 2.17.0.
|
|
275
|
|
276 =back
|
|
277
|
|
278 =head1 VERSION
|
|
279
|
|
280 0.1 06-06-2016
|
|
281
|
|
282 =head1 AUTHOR
|
|
283
|
|
284 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
285
|
|
286 =head1 LICENSE
|
|
287
|
|
288 This program is free software: you can redistribute it and/or modify
|
|
289 it under the terms of the GNU General Public License as published by
|
|
290 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
291 License, or (at your option) any later version.
|
|
292
|
|
293 This program is distributed in the hope that it will be useful, but
|
|
294 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
295 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
296 General Public License for more details.
|
|
297
|
|
298 You should have received a copy of the GNU General Public License
|
|
299 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
300
|
|
301 =cut
|
|
302
|
|
303
|
|
304 ########
|
|
305 # MAIN #
|
|
306 ########
|
|
307
|
|
308 use strict;
|
|
309 use warnings;
|
|
310 use autodie;
|
|
311 use Getopt::Long;
|
|
312 use Pod::Usage;
|
|
313
|
|
314 my $Cmdline = "$0 @ARGV"; # used call command
|
|
315
|
|
316 ### Get the options with Getopt::Long
|
|
317 my $Binary_Matrix_File; # binary input matrix
|
|
318 my $Groups_File; # tab-separated file with group affiliation for the input binary matrix column headers
|
|
319 my $Delimiter = "\t"; # set delimiter/separator of input matrix
|
|
320 my $Result_Dir; # path to results folder; default is set below to '"./results_i".$Inclusion_Cutoff."c".$Exclusion_Cutoff'
|
|
321 my $Inclusion_Cutoff = 0.9; # inclusion percentage cutoff for column presence ('1') count per row (floating point number >0 && <=1)
|
|
322 my $Exclusion_Cutoff = 0.1; # exclusion percentage cutoff (floating point number >=0 && <1)
|
|
323 my $Opt_Group_Binary; # print resulting group binary matrix according to the group inclusion/exclusion results
|
|
324 my $Opt_Venn; # create venn diagram with R package 'gplots' function 'venn'; requires $Opt_Group_Binary
|
|
325 my $Opt_Strict_Core; # report also strict core, i.e. rows with presence in ALL columns of input binary matrix (identical to 'cutoff_core' with inclusion and exclusion cutoffs of 1 and 0 resp.)
|
|
326 my $Opt_Singletons; # report also singletons for each column header (outside of the group classifications)
|
|
327 my $Opt_Unspecific; # report also group-unspecific rows ('exclusion < row_group_column-presence_count < inclusion' for any group)
|
|
328 my $Opt_Columns_Overall; # report overall row presence counts also for column headers without singletons
|
|
329 my $VERSION = 0.1;
|
|
330 my ($Opt_Version, $Opt_Help);
|
|
331 GetOptions ('input=s' => \$Binary_Matrix_File,
|
|
332 'groups_file=s' => \$Groups_File,
|
|
333 'delimiter=s' => \$Delimiter,
|
|
334 'result_dir=s' => \$Result_Dir,
|
|
335 'cut_inclusion=f' => \$Inclusion_Cutoff,
|
|
336 'cut_exclusion=f' => \$Exclusion_Cutoff,
|
|
337 'binary_group_matrix' => \$Opt_Group_Binary,
|
|
338 'plot_venn' => \$Opt_Venn,
|
|
339 'core_strict' => \$Opt_Strict_Core,
|
|
340 'singletons' => \$Opt_Singletons,
|
|
341 'unspecific' => \$Opt_Unspecific,
|
|
342 'all_column_presence_overall' => \$Opt_Columns_Overall,
|
|
343 'version' => \$Opt_Version,
|
|
344 'help|?' => \$Opt_Help)
|
|
345 or pod2usage(-verbose => 1, -exitval => 2);
|
|
346
|
|
347
|
|
348
|
|
349 ### Run perldoc on POD and enforce mandatory options
|
|
350 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
351 die "$0 $VERSION\n" if ($Opt_Version);
|
|
352
|
|
353 if (!$Binary_Matrix_File || !$Groups_File) {
|
|
354 my $warning = "\n### Fatal error: Mandatory options '-i' or '-g' or their arguments are missing!\n";
|
|
355 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
356 }
|
|
357
|
|
358 if (($Inclusion_Cutoff <= 0 || $Inclusion_Cutoff > 1) || ($Exclusion_Cutoff < 0 || $Exclusion_Cutoff >= 1)) {
|
|
359 my $warning = "\n### Fatal error:\nInclusion (0 < '-cut_i' <= 1) or exclusion (0 <= '-cut_e' < 1) cutoff(s) not chosen correctly!\n";
|
|
360 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
361 }
|
|
362
|
|
363 print STDERR "Script call command: $Cmdline\n"; # print call command after '-h|-v'
|
|
364
|
|
365 if ($Opt_Venn && !$Opt_Group_Binary) {
|
|
366 warn "Option '-p' to draw the venn diagram set, but not it's required option '-b' for the output group binary matrix. Forcing option '-b'!\n";
|
|
367 $Opt_Group_Binary = 1;
|
|
368 }
|
|
369
|
|
370
|
|
371
|
|
372 ### Parse column headers (from the input binary matrix) in groups file
|
|
373 print STDERR "Parsing group file '$Groups_File' "; # run status of script
|
|
374 open (my $Groups_File_Fh, "<", "$Groups_File");
|
|
375
|
|
376 my @Group_Names; # store group name column order
|
|
377 my %Column_Header_Groups; # hash of hash with group name as key, internal hash with resp. input binary matrix column header anonymous array (key 'column_headers'), or calculated integer inclusion/exclusion cutoffs (key 'inclusion' etc.)
|
|
378 while (<$Groups_File_Fh>) {
|
|
379 chomp;
|
|
380 next if (/^\s*$/); # skip empty lines
|
|
381
|
|
382 # check groups file header and get group names
|
|
383 if ($. == 1) { # header of groups file (first line)
|
|
384 die "\n### Fatal error:\nGroups file '$Groups_File' does not have a correctly formatted header line with at least TWO and maximal FOUR tab-separated group name columns (without other whitespaces)!\n" if (!/^\S+\t\S+\t?\S*\t?\S*$/);
|
|
385 foreach my $group_name (split(/\t/)) {
|
|
386 check_file_system_conformity($group_name, 'group names'); # subroutine to check string only contains alphanumeric '0-9a-zA-Z', underscore '_', dash '-', and period '.' characters
|
|
387 die "\n### Fatal error:\nGroup name '$group_name' is not unique in the group file '$Groups_File'! Please choose unique names for the groups in the group file!\n" if (grep {/$group_name/} @Group_Names);
|
|
388 push (@Group_Names, $group_name); # store groups array
|
|
389 }
|
|
390 next; # skip header line for subsequent code
|
|
391 }
|
|
392
|
|
393 # parse input binary matrix column headers in groups file
|
|
394 die "\n### Fatal error:\nLine '$.' in the groups file '$Groups_File' does not have the correct format with maximal four tab-separated column header names from the input binary matrix '$Binary_Matrix_File' according to the groups (without other whitespaces):\n$_\n" if (!/^\S*\t?\S*\t?\S*\t?\S*$/);
|
|
395 my $column = -1; # group file column counter to associate input binary matrix column header to correct group (@Group_Names array zero-based, thus start with '-1')
|
|
396 foreach my $column_header (split(/\t/)) {
|
|
397 $column++;
|
|
398 next if ($column_header =~ /^$/); # skip empty columns in group file, in case the groups have uneven column header numbers
|
|
399 warn "\n### Warning:\nInput binary matrix column header '$column_header' in the group file has a binary ('0' or '1') instead of an alphanumeric string, sure this is a column header? The matrix needs to have mandatory row and column headers, see help with '-h'!\n" if ($column_header =~ /^(0|1)$/);
|
|
400 check_file_system_conformity($column_header, 'column headers'); # subroutine
|
|
401 die "\n### Fatal error:\nInput binary matrix column header '$column_header' is not unique in the group file '$Groups_File', but needs to be in the group file and the input binary matrix '$Binary_Matrix_File'! The group file needs to contain all of the column headers from the input binary matrix and vice versa. Make sure the strings correspond exactly (also lower/uppercase) to the header of the matrix!\n\n" if (check_column_header_in_groups($column_header)); # subroutine to check if a column header is present in %Column_Header_Groups
|
|
402 push (@{ $Column_Header_Groups{$Group_Names[$column]}->{'column_headers'} }, $column_header); # push into anonymous array in hash of hash
|
|
403 }
|
|
404 }
|
|
405 close $Groups_File_Fh;
|
|
406 print STDERR "with ", scalar map(@{ $Column_Header_Groups{$_}->{'column_headers'} }, keys %Column_Header_Groups), " input binary matrix column headers ...\n"; # run status of script
|
|
407
|
|
408 foreach (@Group_Names) {
|
|
409 die "\n### Fatal error:\nGroup '$_' does not contain an element, please add a suitable input binary matrix column header or remove the group from the group file '$Groups_File'!\n" if (!$Column_Header_Groups{$_});
|
|
410 }
|
|
411
|
|
412
|
|
413
|
|
414 ### Round inclusion and exclusion count cutoffs for each group
|
|
415 foreach (sort keys %Column_Header_Groups) {
|
|
416 $Column_Header_Groups{$_}->{'inclusion'} = int((scalar @{ $Column_Header_Groups{$_}->{'column_headers'} } * $Inclusion_Cutoff) + 0.5); # round positive number half up to integer ((s)printf rounds half to even), see e.g. https://stackoverflow.com/questions/178539/how-do-you-round-a-floating-point-number-in-perl
|
|
417 print STDERR "Group '$_' with ", scalar @{ $Column_Header_Groups{$_}->{'column_headers'} }, " input binary matrix column headers has a rounded inclusion cutoff of $Column_Header_Groups{$_}->{'inclusion'} column header(s) and ";
|
|
418
|
|
419 $Column_Header_Groups{$_}->{'exclusion'} = int((scalar @{ $Column_Header_Groups{$_}->{'column_headers'} } * $Exclusion_Cutoff) + 0.5);
|
|
420 print STDERR "an exclusion cutoff of $Column_Header_Groups{$_}->{'exclusion'} column header(s)!\n";
|
|
421 die "\n### Fatal error:\nThe rounded inclusion cutoff has to be greater than the exclusion cutoff for all groups, please choose appropriate cutoffs!\n" if ($Column_Header_Groups{$_}->{'inclusion'} <= $Column_Header_Groups{$_}->{'exclusion'});
|
|
422 }
|
|
423
|
|
424
|
|
425
|
|
426 ### Create result folder
|
|
427 if (!$Result_Dir) { # can't give default before 'GetOptions' in case cutoffs are set by the user
|
|
428 $Result_Dir = "./results_i".$Inclusion_Cutoff."_e".$Exclusion_Cutoff;
|
|
429 } else {
|
|
430 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
|
|
431 }
|
|
432 if (-e $Result_Dir) {
|
|
433 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
|
|
434 } else {
|
|
435 mkdir $Result_Dir;
|
|
436 }
|
|
437
|
|
438
|
|
439
|
|
440 ### Parse rows in input binary matrix
|
|
441 print STDERR "Parsing input binary matrix '$Binary_Matrix_File' ...\n"; # run status of script
|
|
442 open (my $Binary_Matrix_Fh, "<", "$Binary_Matrix_File");
|
|
443
|
|
444 my @Column_Headers; # store input binary matrix column header order
|
|
445 my %Row_Header_Groups; # hash of hash with row header as key (first column of each row) and internal hash with column header as key and '1' for presence ('0' absence is not stored)
|
|
446 while (<$Binary_Matrix_Fh>) {
|
|
447 chomp;
|
|
448 next if (/^\s*$/); # skip empty lines
|
|
449 die "\n### Fatal error:\nSet delimiter/separator '$Delimiter' (option '-d') not found in line '$.' of the input binary matrix '$Binary_Matrix_File', sure the correct one is set?\n" if ($_ !~ /$Delimiter/);
|
|
450
|
|
451 # check input binary matrix file header and get column header names
|
|
452 if ($. == 1) { # header of input binary matrix file (first line)
|
|
453 my (undef, @headers) = split(/$Delimiter/); # get rid of first line row header
|
|
454 foreach my $column_header (@headers) {
|
|
455 die "\n### Fatal error:\nColumn header '$column_header' is present in the input binary matrix '$Binary_Matrix_File' but not in the groups file '$Groups_File'. However, the group file needs to contain all of the column headers from the matrix and vice versa. Make sure the strings correspond exactly (also lower/uppercase) to the matrix header!\n" unless (check_column_header_in_groups($column_header)); # subroutine to check if a column header is present in %Column_Header_Groups
|
|
456 push (@Column_Headers, $column_header);
|
|
457 }
|
|
458 die "\n### Fatal error:\nNot the same amount of input binary matrix column headers in the group file '$Groups_File' and matrix '$Binary_Matrix_File', but the group file needs to contain all of the column headers in the matrix and vice versa!\n" if (@Column_Headers != map(@{ $Column_Header_Groups{$_}->{'column_headers'} }, keys %Column_Header_Groups));
|
|
459 next; # skip header line of input binary matrix for subsequent code
|
|
460 }
|
|
461
|
|
462 # parse input binary matrix
|
|
463 my ($row_header, @presence_absence) = split(/$Delimiter/);
|
|
464 die "\n### Fatal error:\nInput binary matrix '$Binary_Matrix_File' row '$row_header' doesn't have the same amount of columns as overall column headers, however the matrix needs to be filled/consistent in every row!\n" unless (@Column_Headers == @presence_absence);
|
|
465 die "\n### Fatal error:\nInput binary matrix row header '$row_header' is not unique in the input binary matrix '$Binary_Matrix_File', however all row headers need to be unique!\n" if ($Row_Header_Groups{$row_header});
|
|
466 warn "\n### Warning:\nLine number '$.' of the binary input matrix '$Binary_Matrix_File' has a binary ('0' or '1') row header instead of an alphanumeric string, sure this is a row header? The matrix needs to have mandatory row and column headers, see help with '-h'!\n\n" if ($row_header =~ /^(0|1)$/);
|
|
467 my $column = -1; # column counter to associate presence/absence field to correct column header (@Column_Headers array zero-based, thus start with '-1')
|
|
468 foreach (@presence_absence) {
|
|
469 $column++;
|
|
470 die "\n###Fatal error:\nThere's an empty field in the input binary matrix '$Binary_Matrix_File' in row '$row_header', column '$column+2'. This shouldn't happen, as all the fields of the BINARY matrix should either have a '0' indicating absence or a '1' indicating presence!\n" if (/^$/);
|
|
471 die "\n### Fatal error:\nThe input binary matrix '$Binary_Matrix_File' contains a field with '$_' in row/line '$.', column '$column+2', instead of the mandatory binary '0' indicating absence or '1' indicating presence!\n" if (!/^(0|1)$/);
|
|
472 next if (!$_); # skip absence columns in binary input matrix (indicated by '0'), have to skip after '$column++' otherwise column won't fit
|
|
473 $Row_Header_Groups{$row_header}{$Column_Headers[$column]} = $_; # store presence ('1') in hash of hash
|
|
474 }
|
|
475 }
|
|
476 close $Binary_Matrix_Fh;
|
|
477
|
|
478
|
|
479
|
|
480 ### Calculate column stats from input binary matrix according to the group cutoffs, categorize the results and print to result files
|
|
481 print STDERR "Calculating column stats from the input binary matrix according to the group cutoffs ...\n"; # run status of script
|
|
482
|
|
483 # open result file for group binary matrix and print header
|
|
484 my $Group_Binary_Matrix_File; # filename needed also for venn diagram R script below
|
|
485 my $Group_Binary_Matrix_Fh;
|
|
486 if ($Opt_Group_Binary) {
|
|
487 $Group_Binary_Matrix_File = "$Result_Dir/group_binary_matrix.tsv";
|
|
488 open ($Group_Binary_Matrix_Fh, ">", "$Group_Binary_Matrix_File");
|
|
489 print $Group_Binary_Matrix_Fh join("\t", @Group_Names), "\n"; # print header for group binary matrix
|
|
490 }
|
|
491
|
|
492 my %Count_Group_Pres_Abs; # hash of hash to store presence count and output filehandle for each category ('core', 'group-specific' etc.)
|
|
493 foreach my $row_header (sort {lc($a) cmp lc($b)} keys %Row_Header_Groups) {
|
|
494 my %present_row_columns; # hash of array to store input binary matrix columns of a group with PRESENCE ('1') in the current row for cutoff tests and print outs
|
|
495
|
|
496 # 'strict_core' category row, if ALL columns present in row (with option '-co')
|
|
497 if ($Opt_Strict_Core && keys %{$Row_Header_Groups{$row_header}} == @Column_Headers) {
|
|
498 $Count_Group_Pres_Abs{'strict_core'}->{'presence_count'}++;
|
|
499 open_result_file("$Result_Dir/strict_core_rows.txt", 'strict_core') if (!$Count_Group_Pres_Abs{'strict_core'}->{'fh'}); # subroutine to open result file and store fh (if not opened yet)
|
|
500 print { $Count_Group_Pres_Abs{'strict_core'}->{'fh'} } "$row_header\n"; # block around fh needed if not a simple string variable (or glob)
|
|
501 }
|
|
502
|
|
503 # count and save number of columns with presence ('1') for the current $row_header with their corresponding group affiliation
|
|
504 foreach my $group (@Group_Names) {
|
|
505 foreach my $column_header (@{ $Column_Header_Groups{$group}->{'column_headers'} }) {
|
|
506 if ($Row_Header_Groups{$row_header}->{$column_header}) { # $column_header with presence in current $row_header
|
|
507 push(@{ $present_row_columns{$group} }, $column_header); # push into anonymous array in hash
|
|
508
|
|
509 # 'singleton' category row if only ONE column with presence (with option '-s')
|
|
510 if ($Opt_Singletons && keys %{$Row_Header_Groups{$row_header}} == 1) {
|
|
511 $Count_Group_Pres_Abs{$column_header}->{'presence_count'}++; # here: category = $column_header
|
|
512 open_result_file("$Result_Dir/$column_header\_singletons.txt", $column_header) if (!$Count_Group_Pres_Abs{$column_header}->{'fh'}); # subroutine
|
|
513 print { $Count_Group_Pres_Abs{$column_header}->{'fh'} } "$row_header\n";
|
|
514 }
|
|
515 }
|
|
516 }
|
|
517 }
|
|
518
|
|
519 # filter column presence counts of the groups in this row according to inclusion-/exclusion-cutoffs
|
|
520 my @row_included_groups; # array for groups with number of present column in this row >= the inclusion cutoff
|
|
521 my @row_excluded_groups; # array for groups with column presence <= the exclusion cutoff
|
|
522 my @row_unspec_groups; # array for unspecific groups with 'exclusion < column_presence_count < inclusion'
|
|
523 foreach my $group (@Group_Names) {
|
|
524 if ($present_row_columns{$group}) { # only run if group has a column presence in this row (otherwise undefined ARRAY reference)
|
|
525 if (@{ $present_row_columns{$group} } >= $Column_Header_Groups{$group}->{'inclusion'}) {
|
|
526 push(@row_included_groups, $group);
|
|
527 } elsif (@{ $present_row_columns{$group} } <= $Column_Header_Groups{$group}->{'exclusion'}) {
|
|
528 push(@row_excluded_groups, $group);
|
|
529 } elsif (@{ $present_row_columns{$group} } > $Column_Header_Groups{$group}->{'exclusion'} && @{ $present_row_columns{$group} } < $Column_Header_Groups{$group}->{'inclusion'}) {
|
|
530 push(@row_unspec_groups, $group);
|
|
531 }
|
|
532 } else { # $present_row_columns{$group} == 0 for group (undef anonymous array), i.e. always <= exclusion
|
|
533 push(@row_excluded_groups, $group);
|
|
534 }
|
|
535 }
|
|
536
|
|
537 # 'unspecific' category row if at least ONE group present in '@row_unspec_groups'
|
|
538 # no further categorization, i.e. skip rest of logics
|
|
539 if (@row_unspec_groups) {
|
|
540 $Count_Group_Pres_Abs{'unspecific'}->{'presence_count'}++;
|
|
541 if ($Opt_Unspecific) { # print only to output file with option '-u'
|
|
542 open_result_file("$Result_Dir/unspecific_rows.txt", 'unspecific') if (!$Count_Group_Pres_Abs{'unspecific'}->{'fh'}); # subroutine
|
|
543 print { $Count_Group_Pres_Abs{'unspecific'}->{'fh'} } "$row_header\n";
|
|
544 }
|
|
545 next; # skip to next $row_header, because now unspec and not suitable for any further categories
|
|
546 }
|
|
547
|
|
548 # print group binary matrix (in contrast to rows in category 'unspecific' above, 'underrepresented' rows below not skipped because package 'venn' can handle a row with only zeros in the group binary matrix)
|
|
549 if ($Opt_Group_Binary) {
|
|
550 my $i = 0;
|
|
551 foreach my $group (@Group_Names) {
|
|
552 $i++;
|
|
553 print $Group_Binary_Matrix_Fh "1" if (grep {/$group/} @row_included_groups);
|
|
554 print $Group_Binary_Matrix_Fh "0" if (grep {/$group/} @row_excluded_groups);
|
|
555 print $Group_Binary_Matrix_Fh "\t" if ($i < @Group_Names);
|
|
556 }
|
|
557 print $Group_Binary_Matrix_Fh "\n";
|
|
558 }
|
|
559
|
|
560 # 'cutoff_core' category row if ALL groups >= inclusion (includes 'strict_core' rows where ALL columns are present in the row)
|
|
561 # 2 groups A/B [2 inclus :0 exclus], 3 groups A/B/C [3:0], 4 groups A/B/C/D [4:0]
|
|
562 if (@row_included_groups == @Group_Names && @row_excluded_groups == 0) {
|
|
563 $Count_Group_Pres_Abs{'cutoff_core'}->{'presence_count'}++;
|
|
564 open_result_file("$Result_Dir/cutoff_core_rows.txt", 'cutoff_core') if (!$Count_Group_Pres_Abs{'cutoff_core'}->{'fh'}); # subroutine
|
|
565 print { $Count_Group_Pres_Abs{'cutoff_core'}->{'fh'} } "$row_header\n";
|
|
566
|
|
567 # 'underrepresented' category row if ALL groups <= exclusion_cutoff (depending on the cutoffs and number of columns in the groups will include most singletons)
|
|
568 # 2 groups [0:2], 3 [0:3], 4 [0:4]
|
|
569 } elsif (@row_included_groups == 0 && @row_excluded_groups == @Group_Names) {
|
|
570 $Count_Group_Pres_Abs{'underrepresented'}->{'presence_count'}++;
|
|
571 open_result_file("$Result_Dir/underrepresented_rows.txt", 'underrepresented') if (!$Count_Group_Pres_Abs{'underrepresented'}->{'fh'}); # subroutine
|
|
572 print { $Count_Group_Pres_Abs{'underrepresented'}->{'fh'} } "$row_header\n";
|
|
573
|
|
574 # 'group-specific' category row if only ONE group >= inclusion_cutoff
|
|
575 # 2 groups [1:1], 3 [1:2], 4 [1:3]
|
|
576 } elsif (@row_included_groups == 1 && @row_excluded_groups == @Group_Names - 1) {
|
|
577 $Count_Group_Pres_Abs{"$row_included_groups[0]_specific"}->{'presence_count'}++; # only one element in array thus [0]
|
|
578 open_result_file("$Result_Dir/$row_included_groups[0]_specific_rows.txt", "$row_included_groups[0]_specific") if (!$Count_Group_Pres_Abs{"$row_included_groups[0]_specific"}->{'fh'}); # subroutine
|
|
579 print { $Count_Group_Pres_Abs{"$row_included_groups[0]_specific"}->{'fh'} } "$row_header\n";
|
|
580
|
|
581 # 'group-absent' category row if ONE group <= exclusion_cutoff (only for 3 and 4 groups)
|
|
582 # 3 groups (A+B not C [2:1], A+C not B, and B+C not A) and 4 groups (A+B+C not D [3:1], A+B+D not C, A+C+D not B, B+C+D not A)
|
|
583 } elsif (@Group_Names >= 3 && (@row_included_groups == @Group_Names - 1 && @row_excluded_groups == 1)) {
|
|
584 $Count_Group_Pres_Abs{"$row_excluded_groups[0]_absent"}->{'presence_count'}++;
|
|
585 open_result_file("$Result_Dir/$row_excluded_groups[0]_absent_rows.txt", "$row_excluded_groups[0]_absent") if (!$Count_Group_Pres_Abs{"$row_excluded_groups[0]_absent"}->{'fh'}); # subroutine
|
|
586 print { $Count_Group_Pres_Abs{"$row_excluded_groups[0]_absent"}->{'fh'} } "$row_header\n";
|
|
587
|
|
588 # 'two group-specific' category row if TWO groups >= inclusion_cutoff (only for 4 groups)
|
|
589 # 4 groups (B+D not A or C [2:2], B+C not A or D, A+D not B or C, A+C not B or D, A+B not C or D, C+D not A or B)
|
|
590 } elsif (@Group_Names == 4 && (@row_included_groups == @row_excluded_groups)) {
|
|
591 $Count_Group_Pres_Abs{"$row_included_groups[0]-$row_included_groups[1]_specific"}->{'presence_count'}++;
|
|
592 open_result_file("$Result_Dir/$row_included_groups[0]-$row_included_groups[1]_specific_rows.txt", "$row_included_groups[0]-$row_included_groups[1]_specific") if (!$Count_Group_Pres_Abs{"$row_included_groups[0]-$row_included_groups[1]_specific"}->{'fh'}); # subroutine
|
|
593 print { $Count_Group_Pres_Abs{"$row_included_groups[0]-$row_included_groups[1]_specific"}->{'fh'} } "$row_header\n";
|
|
594
|
|
595 } else {
|
|
596 die "\n### Fatal error:\nNo category logic test fits for row '$row_header', which shouldn't happen. Have you smuggled only one or five groups into the groups file '$Groups_File' (which also shouldn't be possible)? The script can handle only two, three, or four groups please edit the input!\n"; # shouldn't happen, since regex in group file parse checks for minimally two and maximally four tab-separated group names and all category logics should be covered
|
|
597 }
|
|
598 }
|
|
599
|
|
600
|
|
601
|
|
602 ### Close all output files
|
|
603 close $Group_Binary_Matrix_Fh if ($Opt_Group_Binary);
|
|
604 map { close $Count_Group_Pres_Abs{$_}->{'fh'} } keys %Count_Group_Pres_Abs;
|
|
605
|
|
606
|
|
607
|
|
608 ### Plot the venn diagram with R package 'gplots' function 'venn'
|
|
609 if ($Opt_Venn) {
|
|
610 my $tmp_r_script = "$Result_Dir/tmp_venn.R"; # filename of the R script
|
|
611 my $venn_diagram_name = "$Result_Dir/venn_diagram.pdf"; # filename of the output PDF histogram
|
|
612 print STDERR "Drawing venn diagram (option '-p') '$venn_diagram_name' ...\n"; # run status of script
|
|
613 open (my $r_fh, ">", "$tmp_r_script");
|
|
614
|
|
615 select $r_fh; # select fh for standard print/f output
|
|
616 print "#!/usr/bin/Rscript --vanilla --slave\n"; # header of R script
|
|
617 print "suppressMessages(library(gplots))\n"; # load library 'gplots' for function 'venn' and suppress STDOUT message from loading
|
|
618 print "group_binary_matrix <- read.table(\"$Group_Binary_Matrix_File\", header=TRUE, sep=\"\\t\")\n";
|
|
619 print "pdf(\"$venn_diagram_name\")\n";
|
|
620 print "venn(group_binary_matrix)\n"; # create the venn diagram
|
|
621 print "out <- dev.off()\n"; # write histogram to PDF and suppress STDOUT output by diverting it
|
|
622 select STDOUT; # change standard print/f output back to STDOUT
|
|
623 close $r_fh;
|
|
624
|
|
625 # execute tmp R script with 'Rscript'
|
|
626 system("Rscript $tmp_r_script") == 0 or die "### Fatal error:\nPlotting the venn diagram didn't work. Either statistical programming language 'R' or the required R package 'gplots' is not installed, not in global \$PATH, or something wrong with the group binary matrix '$Group_Binary_Matrix_File' or the temporary R script '$tmp_r_script', which runs the drawing of the venn diagram. Function 'venn' of the 'gplots' package is required and needs to be installed!\n";
|
|
627 unlink $tmp_r_script;
|
|
628 }
|
|
629
|
|
630
|
|
631
|
|
632 ### Count overall category rows, and plausibility check for above logics
|
|
633 my $Total_Category_Rows = 0;
|
|
634 foreach my $category (keys %Count_Group_Pres_Abs) { # can't use a map in case of optionally strict_core, singletons categories (which overlap non-optional categories)
|
|
635 next if ($category =~ /strict_core/ || grep {$category eq $_} @Column_Headers); # skip optional 'strict_core' (option '-co') or singleton ('-s') categories
|
|
636
|
|
637 $Total_Category_Rows += $Count_Group_Pres_Abs{$category}->{'presence_count'};
|
|
638 }
|
|
639 die "\n### Fatal error:\nResulting categories don't include all present rows, this shouldn't happen!\n" if (keys %Row_Header_Groups != $Total_Category_Rows); # is this really always the case?
|
|
640
|
|
641
|
|
642
|
|
643 ### Print final output stats matrix
|
|
644 print STDERR "Finished printing output files to result directory '$Result_Dir', printing final stats matrix to STDOUT ...\n"; # run status of script
|
|
645 print "# category\tpresence_count\n"; # header for matrix
|
|
646 print "total_rows\t", scalar keys %Row_Header_Groups, "\n";
|
|
647 foreach my $category (sort {lc($a) cmp lc($b)} keys %Count_Group_Pres_Abs) {
|
|
648 print "$category";
|
|
649 if (grep {$category eq $_} @Column_Headers) { # print overall presence counts for column headers with singletons (option '-s')
|
|
650 print " overall_presence\t", scalar grep($Row_Header_Groups{$_}->{$category} , keys %Row_Header_Groups), "\n";
|
|
651 print "$category singletons"; # include category "key word" singleton for category stats below
|
|
652 }
|
|
653 print "\t$Count_Group_Pres_Abs{$category}->{'presence_count'}\n";
|
|
654 }
|
|
655
|
|
656 # print out overall presence counts for all columns even without singletons (option '-a')
|
|
657 if ($Opt_Columns_Overall) {
|
|
658 foreach my $column_header (sort {lc($a) cmp lc($b)} @Column_Headers) {
|
|
659 next if (grep {$column_header eq $_} keys %Count_Group_Pres_Abs); # skip column headers with singletons (option '-s'), already printed above
|
|
660 print "$column_header overall_presence\t", scalar grep($Row_Header_Groups{$_}->{$column_header} , keys %Row_Header_Groups), "\n";
|
|
661 }
|
|
662 }
|
|
663
|
|
664 exit;
|
|
665
|
|
666
|
|
667 #############
|
|
668 #Subroutines#
|
|
669 #############
|
|
670
|
|
671 ### Subroutine to check if a column header is present in hash %Column_Header_Groups (hash filled from the $Groups_File)
|
|
672 sub check_column_header_in_groups {
|
|
673 my $column_header = shift;
|
|
674 foreach my $group (keys %Column_Header_Groups) {
|
|
675 return 1 if (grep {$_ eq $column_header} @{ $Column_Header_Groups{$group}->{'column_headers'} });
|
|
676 }
|
|
677 return; # return 'undef'
|
|
678 }
|
|
679
|
|
680
|
|
681
|
|
682 ### Subroutine to check if a string adheres to file system conformity, esp. for filenames
|
|
683 sub check_file_system_conformity {
|
|
684 my ($string, $type) = @_;
|
|
685 die "\n\n### Fatal error:\nOnly alphanumeric '0-9a-zA-Z', underscore '_', dash '-', and period '.' characters allowed for $type to avoid operating/file system problems. Thus, please adapt '$string' in the group file '$Groups_File' and if necessary also in the input binary matrix '$Binary_Matrix_File' accordingly!\n" if ($string !~ /^[\w_\-\.]+$/);
|
|
686 return 1;
|
|
687 }
|
|
688
|
|
689
|
|
690
|
|
691 ### Subroutine to empty a directory with user interaction
|
|
692 sub empty_dir {
|
|
693 my $dir = shift;
|
|
694 print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
|
|
695 my $user_ask = <STDIN>;
|
|
696 if ($user_ask =~ /y/i) {
|
|
697 unlink glob "$dir/*"; # remove all files in results directory
|
|
698 } else {
|
|
699 die "\nScript abborted!\n";
|
|
700 }
|
|
701 return 1;
|
|
702 }
|
|
703
|
|
704
|
|
705
|
|
706 ### Open a result file and print the header
|
|
707 sub open_result_file {
|
|
708 my ($filename, $category) = @_;
|
|
709 open ($Count_Group_Pres_Abs{$category}->{'fh'}, ">", "$filename"); # store filehandle in hash of hash
|
|
710 print { $Count_Group_Pres_Abs{$category}->{'fh'} } "# row_header\n"; # block around fh needed if not a simple string variable (or glob)
|
|
711 return 1;
|
|
712 }
|