annotate COG/bac-genomics-scripts/prot_finder/transpose_matrix.pl @ 13:152d7c43478b draft default tip

Uploaded
author dereeper
date Thu, 30 May 2024 20:07:55 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 # POD #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 C<transpose_matrix.pl> - transpose a delimited TEXT matrix
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 C<perl transpose_matrix.pl input_matrix.tsv E<gt>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 input_matrix_transposed.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 C<perl prot_binary_matrix.pl blast_hits.tsv | perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 transpose_matrix.pl E<gt> binary_matrix_transposed.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 This script transposes a delimited TEXT input matrix, i.e. rows will
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 become columns and columns rows. Use option B<-d> to set the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 delimiter of the input and output matrix, default is set to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 tab-delimited/separated matrices. Input matrices can be given
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 directly via C<STDIN> or as a file. The script is intended for the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 resulting presence/absence binary matrices of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 C<prot_binary_matrix.pl>, but can be used for any TEXT matrix.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 The binary matrix of C<prot_binary_matrix.pl> has the query protein
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 IDs as column headers and the subject genomes as row headers. Thus,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 C<transpose_matrix.pl> is very useful to transpose the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 C<prot_binary_matrix.pl> matrix for use with
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 C<binary_group_stats.pl> to calculate presence/absence statistics
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 for groups of columns/genomes (and not simply single columns of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 matrix). C<binary_group_stats.pl> also has a comprehensive manual
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 with its option B<-h>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 Additionally, option B<-e> can be used to fill empty cells of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 input matrix with a value in the transposed matrix (e.g. 'NA', '0'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 etc.).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 =head1 OPTIONS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 =item B<-h>, B<-help>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 Help (perldoc POD)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 =item B<-d>=I<str>, B<-delimiter>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 Set delimiter of input and output matrix (e.g. comma ',', single
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 space ' ' etc.) [default = tab-delimited/separated]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 =item B<-e>=I<str>, B<-empty>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 Fill empty cells of the input matrix with a value in the transposed
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 matrix (e.g. 'NA', '0' etc.)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 =item B<-v>, B<-version>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 Print version number to C<STDERR>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 =head1 OUTPUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 =item C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 The transposed matrix is printed to C<STDOUT>. Redirect or pipe into
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 another tool as needed.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 =head1 EXAMPLES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 =item C<perl transpose_matrix.pl -d ' ' -e NA input_matrix_space-delimit.txt E<gt> input_matrix_space-delimit_transposed.txt>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 =item C<for matrix in *.tsv; do perl transpose_matrix.pl "$matrix" E<gt> "${matrix%.*}_transposed.tsv"; done>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 =item C<perl prot_finder.pl -r report.blastp -s subject.faa | perl prot_binary_matrix.pl -l -c | perl transpose_matrix.pl -d , E<gt> binary_matrix_transposed.csv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 =item C<mkdir result_dir && ./prot_finder_pipe.sh -q query.faa -s subject.faa -d result_dir -m | tee result_dir/blast_hits.tsv | perl prot_binary_matrix.pl | tee result_dir/binary_matrix.tsv | perl transpose_matrix.pl E<gt> result_dir/binary_matrix_transposed.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 =head1 VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 0.1 12-04-2016
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 Andreas Leimbach aleimba[at]gmx[dot]de
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 =head1 ACKNOWLEDGEMENT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 The Perl implementation for transposing a matrix on Stack Overflow
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 was very useful:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 L<https://stackoverflow.com/questions/1729824/transpose-a-file-in-bash>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 =head1 LICENSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 This program is free software: you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 the Free Software Foundation; either version 3 (GPLv3) of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 License, or (at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 This program is distributed in the hope that it will be useful, but
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 along with this program. If not, see L<http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 use autodie;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Command-line options (Getopt::Long)
my $Delimiter = "\t"; # field separator of both the input and the output matrix (option '-d')
my $Empty;            # optional fill value for empty cells (option '-e'); stays undef if not given
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);

my $options_ok = GetOptions(
    'delimiter=s' => \$Delimiter,
    'empty=s'     => \$Empty,
    'version'     => \$Opt_Version,
    'help|?'      => \$Opt_Help,
);
# Options could not be parsed: print the usage message and exit with status 2
pod2usage(-verbose => 1, -exitval => 2) unless $options_ok;


### Informational options are handled before any input is looked at
if ($Opt_Help) { # show the complete POD manual
    pod2usage(-verbose => 2);
}
if ($Opt_Version) { # 'die' sends the version string to STDERR
    die "$0 $VERSION\n";
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Check input
# Exactly one input source is accepted: either data on STDIN or a single
# file argument. Both "neither" and "both" end in a usage error.
my $stdin_is_terminal = -t STDIN;     # true if STDIN is attached to a terminal (tty)
my $argument_count    = scalar @ARGV; # arguments left in @ARGV at this point

my $usage_error;
if ($stdin_is_terminal && $argument_count == 0) {
    $usage_error = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
} elsif (!$stdin_is_terminal && $argument_count > 0) {
    $usage_error = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
}
pod2usage(-verbose => 0, -message => $usage_error, -exitval => 2) if (defined $usage_error);

if ($argument_count > 1) {
    die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n\n";
}
# '-' is not checked on disk, every other argument has to be an existing file
if ($argument_count && $ARGV[0] ne '-' && !-e $ARGV[0]) {
    die "\n### Fatal error: File '$ARGV[0]' does not exist!\n\n";
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Parse input matrix
my %Input_Matrix; # hash of hash to store the input matrix: {row index}{column index} => cell value
my $Max_Columns = 0; # maximum number of columns, needed in case not every row of input matrix has the same number of columns
my $Row_Num = 0; # count input matrix number of rows

# The delimiter has to be matched literally. Without \Q...\E a delimiter that
# is a regex metacharacter (e.g. '|', '.', '+', '*') would be interpreted as a
# pattern and split the rows at the wrong positions. Compiled once, as it is
# the same for every line.
my $Delimiter_Regex = qr/\Q$Delimiter\E/;

while (my $line = <>) {
    chomp $line;
    $line =~ s/\r\z//; # remove the carriage return of Windows line endings, otherwise it sticks to the last cell of each row

    # only the first line/header is checked for the presence of the delimiter
    if ($. == 1 && $line !~ $Delimiter_Regex) {
        warn "### Warning: Set separator/delimiter '$Delimiter' (option '-d') not found in the following first line/header of input matrix, sure the correct one is set?\n$line\n\n";
    }

    my $col_num = 0; # count number of columns for each row
    foreach my $cell (split($Delimiter_Regex, $line)) { # split each row for the cells
        # empty cells in between cells with values get the fill value here
        # (undef if option '-e' is not set); 'split' drops trailing empty
        # cells, so those have no entry in the hash at all
        $cell = $Empty if ($cell eq '');
        $Input_Matrix{$Row_Num}{$col_num++} = $cell;
    }

    $Max_Columns = $col_num if ($col_num > $Max_Columns);
    $Row_Num++;
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Print out transposed matrix
# Every column of the input becomes one output line; the cells of that line
# are taken from all input rows in their original order.
my $Max_Rows = $Row_Num;
for my $col_num (0 .. $Max_Columns - 1) {
    my @transposed_row;
    for my $row_index (0 .. $Max_Rows - 1) {
        # Defined-or instead of a truth test, so that a cell (or a fill value)
        # of '0' is kept. A missing cell is replaced by $Empty if option '-e'
        # is set, otherwise it stays empty.
        push @transposed_row, $Input_Matrix{$row_index}{$col_num} // $Empty // '';
    }
    print join($Delimiter, @transposed_row), "\n";
}

exit;