Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/prot_finder/transpose_matrix.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 C<transpose_matrix.pl> - transpose a delimited TEXT matrix | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl transpose_matrix.pl input_matrix.tsv E<gt> | |
16 input_matrix_transposed.tsv> | |
17 | |
18 B<or> | |
19 | |
20 C<perl prot_binary_matrix.pl blast_hits.tsv | perl | |
21 transpose_matrix.pl E<gt> binary_matrix_transposed.tsv> | |
22 | |
23 =head1 DESCRIPTION | |
24 | |
25 This script transposes a delimited TEXT input matrix, i.e. rows will | |
26 become columns and columns rows. Use option B<-d> to set the | |
27 delimiter of the input and output matrix, default is set to | |
28 tab-delimited/separated matrices. Input matrices can be given | |
29 directly via C<STDIN> or as a file. The script is intended for the | |
30 resulting presence/absence binary matrices of | |
31 C<prot_binary_matrix.pl>, but can be used for any TEXT matrix. | |
32 | |
33 The binary matrix of C<prot_binary_matrix.pl> has the query protein | |
34 IDs as column headers and the subject genomes as row headers. Thus, | |
35 C<transpose_matrix.pl> is very useful to transpose the | |
36 C<prot_binary_matrix.pl> matrix for the usage with | |
37 C<binary_group_stats.pl> to calculate presence/absence statistics | |
38 for groups of columns/genomes (and not simply single columns of the | |
39 matrix). C<binary_group_stats.pl> also has a comprehensive manual | |
40 with its option B<-h>. | |
41 | |
42 Additionally, option B<-e> can be used to fill empty cells of the | |
43 input matrix with a value in the transposed matrix (e.g. 'NA', '0' | |
44 etc.). | |
45 | |
46 =head1 OPTIONS | |
47 | |
48 =over 20 | |
49 | |
50 =item B<-h>, B<-help> | |
51 | |
52 Help (perldoc POD) | |
53 | |
54 =item B<-d>=I<str>, B<-delimiter>=I<str> | |
55 | |
56 Set delimiter of input and output matrix (e.g. comma ',', single | |
57 space ' ' etc.) [default = tab-delimited/separated] | |
58 | |
59 =item B<-e>=I<str>, B<-empty>=I<str> | |
60 | |
61 Fill empty cells of the input matrix with a value in the transposed | |
62 matrix (e.g. 'NA', '0' etc.) | |
63 | |
64 =item B<-v>, B<-version> | |
65 | |
66 Print version number to C<STDERR> | |
67 | |
68 =back | |
69 | |
70 =head1 OUTPUT | |
71 | |
72 =over 20 | |
73 | |
74 =item C<STDOUT> | |
75 | |
76 The transposed matrix is printed to C<STDOUT>. Redirect or pipe into | |
77 another tool as needed. | |
78 | |
79 =back | |
80 | |
81 =head1 EXAMPLES | |
82 | |
83 =over | |
84 | |
85 =item C<perl transpose_matrix.pl -d ' ' -e NA input_matrix_space-delimit.txt E<gt> input_matrix_space-delimit_transposed.txt> | |
86 | |
87 =back | |
88 | |
89 B<or> | |
90 | |
91 =over | |
92 | |
93 =item C<for matrix in *.tsv; do perl transpose_matrix.pl "$matrix" E<gt> "${matrix%.*}_transposed.tsv"; done> | |
94 | |
95 =back | |
96 | |
97 B<or> | |
98 | |
99 =over | |
100 | |
101 =item C<perl prot_finder.pl -r report.blastp -s subject.faa | perl prot_binary_matrix.pl -l -c | perl transpose_matrix.pl -d , E<gt> binary_matrix_transposed.csv> | |
102 | |
103 =back | |
104 | |
105 B<or> | |
106 | |
107 =over | |
108 | |
109 =item C<mkdir result_dir && ./prot_finder_pipe.sh -q query.faa -s subject.faa -d result_dir -m | tee result_dir/blast_hits.tsv | perl prot_binary_matrix.pl | tee result_dir/binary_matrix.tsv | perl transpose_matrix.pl E<gt> result_dir/binary_matrix_transposed.tsv> | |
110 | |
111 =back | |
112 | |
113 =head1 VERSION | |
114 | |
115 0.1 12-04-2016 | |
116 | |
117 =head1 AUTHOR | |
118 | |
119 Andreas Leimbach aleimba[at]gmx[dot]de | |
120 | |
121 =head1 ACKNOWLEDGEMENT | |
122 | |
123 The Perl implementation for transposing a matrix on Stack Overflow | |
124 was very useful: | |
125 L<https://stackoverflow.com/questions/1729824/transpose-a-file-in-bash> | |
126 | |
127 =head1 LICENSE | |
128 | |
129 This program is free software: you can redistribute it and/or modify | |
130 it under the terms of the GNU General Public License as published by | |
131 the Free Software Foundation; either version 3 (GPLv3) of the | |
132 License, or (at your option) any later version. | |
133 | |
134 This program is distributed in the hope that it will be useful, but | |
135 WITHOUT ANY WARRANTY; without even the implied warranty of | |
136 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
137 General Public License for more details. | |
138 | |
139 You should have received a copy of the GNU General Public License | |
140 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
141 | |
142 =cut | |
143 | |
144 | |
145 ######## | |
146 # MAIN # | |
147 ######## | |
148 | |
149 use strict; | |
150 use warnings; | |
151 use autodie; | |
152 use Getopt::Long; | |
153 use Pod::Usage; | |
154 | |
### Get the options with Getopt::Long
# $Delimiter and $Empty are file-level settings that the parsing and
# print out sections below read.
my $Delimiter = "\t"; # separator/delimiter of input/output matrix, tab unless option '-d' is given
my $Empty; # optional fill value for empty cells, stays undefined unless option '-e' is given
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);
GetOptions ('delimiter=s' => \$Delimiter,
            'empty=s' => \$Empty,
            'version' => \$Opt_Version,
            'help|?' => \$Opt_Help)
    or pod2usage(-verbose => 1, -exitval => 2); # unknown or malformed option: usage message and exit status 2


### Show the manual (perldoc on POD) or the version if requested
# Both are successful informational requests, hence they end with exit
# status 0 instead of a failure status (previously 'die' for the version
# and the pod2usage default of 1 for the help).
pod2usage(-verbose => 2, -exitval => 0) if ($Opt_Help);
if ($Opt_Version) {
    print STDERR "$0 $VERSION\n"; # version goes to STDERR as documented in the POD
    exit 0;
}
170 | |
171 | |
### Check input
# A single '-' argument explicitly names STDIN for the '<>' operator, so it
# must not count as "an input file in addition to STDIN".
my $Stdin_As_Arg = (@ARGV == 1 && $ARGV[0] eq '-');

# STDIN only counts as a second input if it can actually carry data, i.e. it
# is a pipe or a non-empty redirected file. A STDIN that is merely "not a
# terminal" (cron, CI, workflow job runners, '< /dev/null') is ignored, so
# that a file argument still works in non-interactive environments.
my $Stdin_Has_Data = (-p STDIN || (-f STDIN && -s _));

if (-t STDIN && ! @ARGV) {
    my $warning = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
    pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
} elsif ($Stdin_Has_Data && @ARGV && !$Stdin_As_Arg) {
    my $warning = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
    pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
}
die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n\n" if (@ARGV > 1);
die "\n### Fatal error: File '$ARGV[0]' does not exist!\n\n" if (@ARGV && $ARGV[0] ne '-' && !-e $ARGV[0]);
# a directory passes the '-e' test above, but cannot be read as a matrix
die "\n### Fatal error: '$ARGV[0]' is a directory, not a file!\n\n" if (@ARGV && $ARGV[0] ne '-' && -d $ARGV[0]);
182 | |
183 | |
### Parse input matrix
my %Input_Matrix; # hash of hash to store the input matrix: {row index}{column index} => cell value
my $Max_Columns = 0; # maximum number of columns, needed in case not every row of input matrix has the same number of columns
my $Row_Num = 0; # count input matrix number of rows

# The delimiter is a literal string given by the user and not a pattern.
# Quote regex metacharacters, otherwise e.g. '-d "|"' splits between every
# character and '-d .' yields no cells at all. Compiled once, outside the loop.
my $Delimiter_Regex = qr/\Q$Delimiter\E/;

while (<>) {
    chomp;
    warn "### Warning: Set separator/delimiter '$Delimiter' (option '-d') not found in the following first line/header of input matrix, sure the correct one is set?\n$_\n\n" if ($. == 1 && $_ !~ $Delimiter_Regex);

    my $col_num = 0; # count number of columns for each row
    # 'split' without a limit drops trailing empty fields, so only empty cells
    # in front of or in between filled cells show up in this loop
    foreach my $cell (split($Delimiter_Regex, $_)) { # split each row for the cells
        $cell = $Empty if ($cell eq ''); # empty cell: store the fill value (undef if option '-e' is not set)
        $Input_Matrix{$Row_Num}{$col_num++} = $cell;
    }

    $Max_Columns = $col_num if ($col_num > $Max_Columns);
    $Row_Num++;
}
201 | |
202 | |
### Print out transposed matrix
# Every column of the input matrix becomes one line of output; the cells of
# that line are collected first and then joined with the delimiter.
my $Max_Rows = $Row_Num;
foreach my $col_idx (0 .. $Max_Columns - 1) {
    my @fields; # cells of the current output line, one per input row
    foreach my $row_idx (0 .. $Max_Rows - 1) {
        my $cell = $Input_Matrix{$row_idx}{$col_idx};
        if (!defined $cell) { # input row shorter than $Max_Columns, or empty cell without fill value
            $cell = defined $Empty ? $Empty : ''; # 'defined' needed, in case $Empty is set to '0'
        }
        push @fields, $cell; # a cell with value '0' is kept, only undefined cells are replaced
    }
    print join($Delimiter, @fields), "\n";
}

exit;