annotate COG/bac-genomics-scripts/prot_finder/prot_binary_matrix.pl @ 15:dbde253606c5 draft

Uploaded
author dereeper
date Wed, 11 Dec 2024 08:25:06 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 # POD #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 C<prot_binary_matrix.pl> - create a presence/absence matrix from
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 C<prot_finder.pl> output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 C<perl prot_binary_matrix.pl blast_hits.tsv E<gt> binary_matrix.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 C<perl prot_finder.pl -r report.blastp -s subject.faa | perl prot_binary_matrix.pl E<gt> binary_matrix.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 This script is intended to create a presence/absence matrix from the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 significant C<prot_finder.pl>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 L<B<BLASTP>|http://blast.ncbi.nlm.nih.gov/Blast.cgi>) hits (or the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 companion bash pipe C<prot_finder_pipe.sh>). The tab-separated
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 C<prot_finder.pl> output can be given directly via C<STDIN> or as a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 file. By default a tab-delimited binary presence/absence matrix for
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 query hits per subject organism will be printed to C<STDOUT>. Use
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 option B<-t> to count all query hits per subject organism, not just
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 the binary presence/absence. You can transpose the presence/absence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 binary matrix with the script C<transpose_matrix.pl> (see its help
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 with B<-h>).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 The resulting matrix can be used to associate the presence/absence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 data with a phylogenetic tree, e.g. use the Interactive Tree Of Life
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 website (L<B<iTOL>|http://itol.embl.de/>). B<iTOL> likes individual
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 comma-separated input files, thus use options B<-s -c> for this
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 purpose.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 For B<iTOL> the organism names have to have identical names to the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 leaves of the phylogenetic tree, thus manual adaptation, e.g. in a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 spreadsheet software, might be needed. B<Careful>, subject organisms
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 without a significant B<BLASTP> hit won't be included in the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 tab-separated C<prot_finder.pl> result table and hence can't be
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 included by C<prot_binary_matrix.pl>. If needed add them manually to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 the result matri(x|ces).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 Additionally, you can give the presence/absence binary matrix to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 C<binary_group_stats.pl> to calculate presence/absence statistics
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 for groups of columns and not simply single columns of the matrix.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 C<binary_group_stats.pl> also has a comprehensive manual with its
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 option B<-h>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 =head1 OPTIONS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 =item B<-h>, B<-help>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 Help (perldoc POD)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 =item B<-s>, B<-separate>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 Separate presence/absence files for each query protein printed to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 the result directory [default without B<-s> = C<STDOUT> matrix for
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 all query proteins combined]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 =item B<-d>=I<str>, B<-dir_result>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 Path to result folder, requires option B<-s> [default =
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 './binary_matrix_results']
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 =item B<-t>, B<-total>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 Count total occurrences of query proteins, not just binary
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 presence/absence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 =item B<-c>, B<-csv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 Output matri(x|ces) in comma-separated format (*.csv) instead of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 tab-delimited format (*.tsv)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 =item B<-l>, B<-locus_tag>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 Use the locus_tag B<prefixes> in the subject_ID column of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 C<prot_finder.pl> output (instead of the subject_organism columns) as
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 organism IDs to associate query hits to organisms. The subject_ID
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 column will include locus_tags if they're annotated for a genome
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 (see the L<C<cds_extractor.pl>|/cds_extractor> format description).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 Useful if the L<C<cds_extractor.pl>|/cds_extractor> output doesn't
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 include strain names for 'o=' in the FASTA IDs, because the prefix
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 of a locus_tag should be unique for a genome (see
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 L<http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation>).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 =item B<-v>, B<-version>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 Print version number to C<STDERR>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 =head1 OUTPUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 =over 17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 =item C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 The resulting presence/absence matrix is printed to C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 without option B<-s>. Redirect or pipe into another tool as needed.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 =item (F<./binary_matrix_results>)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 Separate query presence/absence files are stored in a result folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 with option B<-s>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 =item (F<./binary_matrix_results/query-ID_binary_matrix.(tsv|csv)>)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 Separate query presence/absence files with option B<-s>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 =head1 EXAMPLES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 =item C<perl prot_binary_matrix.pl -s -d result_dir -t blast_hits.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 =item C<perl prot_finder.pl -r report.blastp -s subject.faa | perl prot_binary_matrix.pl -l -c E<gt> binary_matrix.csv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 =item C<mkdir result_dir && ./prot_finder_pipe.sh -q query.faa -s subject.faa -d result_dir -m | tee result_dir/blast_hits.tsv | perl prot_binary_matrix.pl E<gt> binary_matrix.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 =head1 VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 0.6 update: 23-11-2015
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 0.1 25-10-2012
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 Andreas Leimbach aleimba[at]gmx[dot]de
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 =head1 LICENSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 This program is free software: you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 the Free Software Foundation; either version 3 (GPLv3) of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 License, or (at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 This program is distributed in the hope that it will be useful, but
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 along with this program. If not, see L<http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 use autodie;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 ### Get options with Getopt::Long
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 my $Opt_Separate; # separate presence/absence files for each query printed to result_dir (default: single presence/absence file for all queries printed to STDOUT)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 my $Result_Dir; # path to result folder, requires option '-s'; default set below to 'binary_matrix_results'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 my $Opt_Total; # count total occurrences of query proteins not just presence/absence binary
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 my $Opt_Csv; # output in csv format (default: tsv)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 my $Opt_Locus_Tag; # use locus_tag prefixes (from subject_ID column, see cds_exractor) instead of subject_organism as ID to count query hits
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 my $VERSION = 0.6;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191 my ($Opt_Version, $Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192 GetOptions ('separate' => \$Opt_Separate,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193 'dir_result=s' => \$Result_Dir,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194 'total' => \$Opt_Total,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 'csv' => \$Opt_Csv,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 'locus_tag' => \$Opt_Locus_Tag,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 'version' => \$Opt_Version,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 'help|?' => \$Opt_Help)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199 or pod2usage(-verbose => 1, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 ### Run perldoc on POD and set option defaults
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 pod2usage(-verbose => 2) if ($Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 die "$0 $VERSION\n" if ($Opt_Version);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 if ($Result_Dir && !$Opt_Separate) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 warn "### Warning: Option '-d' given but not its required option '-s', forcing option '-s'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208 $Opt_Separate = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 my $Separator = "\t";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 $Separator = "," if ($Opt_Csv); # optional csv output format
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216 ### Check input
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 if (-t STDIN && ! @ARGV) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 my $warning = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 } elsif (!-t STDIN && @ARGV) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 my $warning = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n" if (@ARGV > 1);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 die "\n### Fatal error: File '$ARGV[0]' does not exist!\n" if (@ARGV && $ARGV[0] ne '-' && !-e $ARGV[0]);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 ### Create result folder, only for option '-s'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 if ($Opt_Separate) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 $Result_Dir = 'binary_matrix_results' if (!$Result_Dir);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233 if (-e $Result_Dir) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236 mkdir $Result_Dir;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242 ### Parse the input from 'prot_finder.pl' to associate organism with query hit
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243 my @Queries; # store all query proteins
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244 my %Hits; # hash of hash to associate subject_organism/subject_ID with query hit
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 while (<>) { # read STDIN or file input
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 chomp;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 if ($. == 1) { # $. = check only first line of input (works with STDIN and file input)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249 die "\n### Fatal error: Input doesn't have the correct format, it has to be the output of 'prot_finder.pl' with the following header:\n# subject_organism\tsubject_ID\tsubject_gene\tsubject_protein_desc\tquery_ID\tquery_desc\tquery_coverage [%]\tquery_identities [%]\tsubject/hit_coverage [%]\te-value of best HSP\tbit-score of best HSP\n" if (!/# subject_organism\tsubject_ID\tsubject_gene\tsubject_protein_desc\tquery_ID\tquery_desc/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250 next; # skip header line
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 my @line = split (/\t/, $_); # $line[0] = subject_organism; $line[1] = subject_ID (mostly locus_tag, see cds_extractor); $line[4] = query_ID
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 my $query = $line[4];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255 my $id;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256 if ($Opt_Locus_Tag) { # use subject_ID
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257 die "\n### Fatal error: The subject_ID of the following line doesn't look like an NCBI locus tag (see: http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation). Column subject_ID needs to include only locus_tags for option '-l'!\n$_\n" if ($line[1] !~ /^([a-zA-Z][0-9a-zA-Z]{2,11})_[0-9a-zA-Z]+$/); # check if subject_ID is locus_tag ('\w' not used because allows alphanumeric and '_')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258 # excerpt: The locus_tag prefix must be 3-12 alphanumeric characters and the first character may not be a digit. All chromosomes and plasmids of an individual genome must use the exactly same locus_tag prefix followed by an underscore and then an alphanumeric identification number that is unique within the given genome. Other than the single underscore used to separate the prefix from the identification number no other special characters can be used in the locus_tag.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259 $id = $1; # locus_tag prefix, unique for each genome
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 } else { # use subject_organism as ID
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261 $id = $line[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264 if ($Opt_Total) { # count total occurrences of query proteins
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 $Hits{$id}{$query}++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 } else { # only binary output (0 or 1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268 $Hits{$id}{$query} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271 push (@Queries, $query) if (!grep($_ eq $query, @Queries)); # push each query only once in @Queries
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 ### Print binary data to a joined or separate (for each query; as needed by iTOL) file(s)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277 if (!$Opt_Separate) { # joined output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278 # print header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279 if ($Opt_Locus_Tag) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280 print "locus_tag";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282 print "organism";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284 print "$Separator";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285 print join("$Separator", sort @Queries), "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287 # print data to STDOUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 foreach my $id (sort keys %Hits) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289 print "$id";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290 foreach my $query (sort @Queries) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291 if ($Hits{$id}->{$query}) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292 print "$Separator", "$Hits{$id}->{$query}";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294 print "$Separator", "0";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297 print "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
298 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
299
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
300 } elsif ($Opt_Separate) { # separated output for each query
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
301 foreach my $query (sort @Queries) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
302 my $file = "$Result_Dir/$query\_binary\_matrix.";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
303 if ($Opt_Csv) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
304 $file .= "csv";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
305 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
306 $file .= "tsv";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
307 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
308 open (my $binary_matrix_fh, ">", "$file");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
309 foreach my $id (sort keys %Hits) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310 print $binary_matrix_fh "$id";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311 if ($Hits{$id}->{$query}) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312 print $binary_matrix_fh "$Separator", "$Hits{$id}->{$query}\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314 print $binary_matrix_fh "$Separator", "0\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317 close $binary_matrix_fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325 # Subroutines #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
328 ### Subroutine to empty a directory with user interaction
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
329 sub empty_dir {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
330 my $dir = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
331 print STDERR "\nDirectory '$dir' already exists! You can use either option '-d' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
332 my $user_ask = <STDIN>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
333 if ($user_ask =~ /y/i) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
334 unlink glob "$dir/*"; # remove all files in results directory
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
335 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
336 die "\nScript abborted!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
337 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
338 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
339 }