3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<cdd2cog.pl> - assign COG categories to protein sequences
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl cdd2cog.pl -r rps-blast.out -c cddid.tbl -f fun.txt -w whog>
|
|
16
|
|
17 =head1 DESCRIPTION
|
|
18
|
|
19 The script assigns COG (L<cluster of orthologous
|
|
20 groups|http://www.ncbi.nlm.nih.gov/COG/>) categories to proteins.
|
|
21 For this purpose, the query proteins need to be blasted with
|
|
22 RPS-BLAST+ (L<Reverse Position-Specific BLAST|http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download>)
|
|
23 against NCBI's Conserved Domain Database
|
|
24 (L<CDD|http://www.ncbi.nlm.nih.gov/cdd>). Use
|
|
25 L<C<cds_extractor.pl>|/cds_extractor> beforehand to extract multi-fasta
|
|
26 protein files from GENBANK or EMBL files.
|
|
27
|
|
28 Both tab-delimited RPS-BLAST+ outformats, B<-outfmt 6> and B<-outfmt
|
|
29 7>, can be processed by C<cdd2cog.pl>. By default, RPS-BLAST+ hits
|
|
30 for each query protein are filtered for the best hit (lowest
|
|
31 e-value). Use option B<-a|all_hits> to assign COGs to all BLAST hits
|
|
32 and e.g. do a downstream filtering in a spreadsheet application.
|
|
33 Results are written to tab-delimited files in the F<./results>
|
|
34 folder, overall assignment statistics are printed to C<STDOUT>.
|
|
35
|
|
36 Several files are needed from NCBI's FTP server to run the RPS-BLAST+
|
|
37 and C<cdd2cog.pl>:
|
|
38
|
|
39 =over
|
|
40
|
|
41 =item 1.) L<CDD|ftp://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/>
|
|
42
|
|
43 More information about the files in the CDD FTP archive can be found
|
|
44 in the respective F<README> file.
|
|
45
|
|
46 =item 1.1.) F<cddid.tbl.gz>
|
|
47
|
|
48 The file needs to be unpacked:
|
|
49
|
|
50 C<gunzip cddid.tbl.gz>
|
|
51
|
|
52 Contains summary information about the CD models in a tab-delimited
|
|
53 format. The columns are: PSSM-Id, CD accession (e.g. COG#), CD short
|
|
54 name, CD description, and PSSM (position-specific scoring matrices)
|
|
55 length.
|
|
56
|
|
57 =item 1.2.) F<./little_endian/Cog_LE.tar.gz>
|
|
58
|
|
59 Unpack and untar via:
|
|
60
|
|
61 C<tar xvfz Cog_LE.tar.gz>
|
|
62
|
|
63 Preformatted RPS-BLAST+ database of the CDD COG distribution for
|
|
64 Intel CPUs and Unix/Windows architectures.
|
|
65
|
|
66 =item 2.) L<COG|ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG/>
|
|
67
|
|
68 Read F<readme> for more information about the respective files in
|
|
69 the COG FTP archive.
|
|
70
|
|
71 =item 2.1.) F<fun.txt>
|
|
72
|
|
73 One-letter functional classification used in the COG database.
|
|
74
|
|
75 =item 2.2.) F<whog>
|
|
76
|
|
77 Name, description, and corresponding functional classification of
|
|
78 each COG.
|
|
79
|
|
80 =back
|
|
81
|
|
82 =head1 OPTIONS
|
|
83
|
|
84 =head2 Mandatory options
|
|
85
|
|
86 =over 20
|
|
87
|
|
88 =item B<-r>=I<str>, B<-rps_report>=I<str>
|
|
89
|
|
90 Path to RPS-BLAST+ report/output, outfmt 6 or 7
|
|
91
|
|
92 =item B<-c>=I<str>, B<-cddid>=I<str>
|
|
93
|
|
94 Path to CDD's F<cddid.tbl> file
|
|
95
|
|
96 =item B<-f>=I<str>, B<-fun>=I<str>
|
|
97
|
|
98 Path to COG's F<fun.txt> file
|
|
99
|
|
100 =item B<-w>=I<str>, B<-whog>=I<str>
|
|
101
|
|
102 Path to COG's F<whog> file
|
|
103
|
|
104 =back
|
|
105
|
|
106 =head2 Optional options
|
|
107
|
|
108 =over 20
|
|
109
|
|
110 =item B<-h>, B<-help>
|
|
111
|
|
112 Help (perldoc POD)
|
|
113
|
|
114 =item B<-a>, B<-all_hits>
|
|
115
|
|
116 Don't filter RPS-BLAST+ output for the best hit, rather assign COGs
|
|
117 to all hits
|
|
118
|
|
119 =item B<-v>, B<-version>
|
|
120
|
|
121 Print version number to C<STDERR>
|
|
122
|
|
123 =back
|
|
124
|
|
125 =head1 OUTPUT
|
|
126
|
|
127 =over 20
|
|
128
|
|
129 =item C<STDOUT>
|
|
130
|
|
131 Overall assignment statistics
|
|
132
|
|
133 =item F<./results>
|
|
134
|
|
135 All tab-delimited output files are stored in this result folder
|
|
136
|
|
137 =item F<rps-blast_cog.txt>
|
|
138
|
|
139 COG assignments concatenated to the RPS-BLAST+ results for filtering
|
|
140
|
|
141 =item F<protein-id_cog.txt>
|
|
142
|
|
143 Slimmed down F<rps-blast_cog.txt> only including query id (first
|
|
144 BLAST report column), COGs, and functional categories
|
|
145
|
|
146 =item F<cog_stats.txt>
|
|
147
|
|
148 Assignment counts for each used COG
|
|
149
|
|
150 =item F<func_stats.txt>
|
|
151
|
|
152 Assignment counts for single-letter functional categories
|
|
153
|
|
154 =back
|
|
155
|
|
156 =head1 EXAMPLES
|
|
157
|
|
158 =head2 RPS-BLAST+
|
|
159
|
|
160 =over
|
|
161
|
|
162 =item C<rpsblast -query protein.fasta -db Cog -out rps-blast.out
|
|
163 -evalue 1e-2 -outfmt 6>
|
|
164
|
|
165 =item C<rpsblast -query protein.fasta -db Cog -out rps-blast.out
|
|
166 -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen
|
|
167 qstart qend sstart send evalue bitscore qcovs'>
|
|
168
|
|
169 =back
|
|
170
|
|
171 =head2 C<cdd2cog.pl>
|
|
172
|
|
173 =over
|
|
174
|
|
175 =item C<perl cdd2cog.pl -r rps-blast.out -c cddid.tbl -f fun.txt
|
|
176 -w whog -a>
|
|
177
|
|
178 =back
|
|
179
|
|
180 =head1 VERSION
|
|
181
|
|
182 0.2 update: 2017-02-16
|
|
183 0.1 2013-08-01
|
|
184
|
|
185 =head1 AUTHOR
|
|
186
|
|
187 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
188
|
|
189 =head1 ACKNOWLEDGEMENTS
|
|
190
|
|
191 I got the idea for using NCBI's CDD PSSMs for COG assignment from JGI's L<IMG/ER annotation
|
|
192 system|http://img.jgi.doe.gov/>, which employes the same technique.
|
|
193
|
|
194
|
|
195 =head1 LICENSE
|
|
196
|
|
197 This program is free software: you can redistribute it and/or modify
|
|
198 it under the terms of the GNU General Public License as published by
|
|
199 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
200 License, or (at your option) any later version.
|
|
201
|
|
202 This program is distributed in the hope that it will be useful, but
|
|
203 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
204 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
205 General Public License for more details.
|
|
206
|
|
207 You should have received a copy of the GNU General Public License
|
|
208 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
209
|
|
210 =cut
|
|
211
|
|
212
|
|
213 ########
|
|
214 # MAIN #
|
|
215 ########
|
|
216
|
|
217 use strict;
|
|
218 use warnings;
|
|
219 use autodie;
|
|
220 use Getopt::Long;
|
|
221 use Pod::Usage;
|
|
222
|
|
223
|
|
224 ### Get the options with Getopt::Long
|
|
225 my $Rps_Report; # path to the rps-blast report/output
|
|
226 my $CDDid_File; # path to the CDD 'cddid.tbl' file
|
|
227 my $Fun_File; # path to the COG 'fun' file
|
|
228 my $Whog_File; # path to the COG 'whog' file
|
|
229 my $Opt_All_Hits; # give all blast hits for a query, not just the best (lowest evalue)
|
|
230 my $VERSION = 0.1;
|
|
231 my ($Opt_Version, $Opt_Help);
|
|
232 GetOptions ('rps_report=s' => \$Rps_Report,
|
|
233 'cddid=s' => \$CDDid_File,
|
|
234 'fun=s' => \$Fun_File,
|
|
235 'whog=s' => \$Whog_File,
|
|
236 'all_hits' => \$Opt_All_Hits,
|
|
237 'version' => \$Opt_Version,
|
|
238 'help|?' => \$Opt_Help);
|
|
239
|
|
240
|
|
241
|
|
242 ### Run perldoc on POD
|
|
243 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
244 die "$0 $VERSION\n" if ($Opt_Version);
|
|
245 if (!$Rps_Report || !$CDDid_File || !$Fun_File || !$Whog_File) {
|
|
246 my $warning = "\n### Fatal error: Option(s) or arguments for '-r', '-c', '-f', or '-w' are missing!\n";
|
|
247 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
248 }
|
|
249
|
|
250
|
|
251
|
|
252 ### Parse the 'cddid.tbl', 'fun.txt' and 'whog' file contents and store info in hash structures
|
|
253 my (%CDDid, %Fun, %Whog); # global hashes
|
|
254 parse_cdd_cog(); # subroutine
|
|
255
|
|
256
|
|
257
|
|
258 ### Create results directory for output files
|
|
259 my $Out_Dir = './results/';
|
|
260 if (-e $Out_Dir) {
|
|
261 print "###Directory '$Out_Dir' already exists! Replace the directory and all its contents [y|n]? ";
|
|
262 my $user_ask = <STDIN>;
|
|
263 if ($user_ask =~ /y/i) {
|
|
264 unlink glob "$Out_Dir*"; # remove all files in results directory
|
|
265 rmdir $Out_Dir; # remove the empty directory
|
|
266 } else {
|
|
267 die "Script abborted!\n";
|
|
268 }
|
|
269 }
|
|
270 mkdir $Out_Dir or die "Can't create directory \"$Out_Dir\": $!\n";
|
|
271
|
|
272
|
|
273
|
|
274 ### Parse the rps-blast report/output file and assign COGs
|
|
275 my %Cog_Stats; # store the total number of query protein hits for each COG, written to '$Cogstats_Out' below
|
|
276
|
|
277 my $Blast_Out = 'rps-blast_cog.txt'; # output file for COG assignments appended to RPS-BLAST results
|
|
278 open (my $Blast_Out_Fh, ">", "$Out_Dir"."$Blast_Out");
|
|
279 print $Blast_Out_Fh "query id\tsubject id\t% identity\talignment length\tmismatches\tgap opens\tq. start\tq. end\ts. start\ts. end\tevalue\tbit score\tCOG#\tfunctional categories\t\t\t\t\tCOG protein description\n"; # header for $Blast_Out
|
|
280
|
|
281 my $Locus_Cog = "protein-id_cog.txt"; # slimmed down $Blast_Out only including locus_tags, COGs, and functional categories
|
|
282 open (my $Locus_Cog_Fh, ">", "$Out_Dir"."$Locus_Cog");
|
|
283
|
|
284 print "Parsing RPS-BLAST report ...\n"; # status message
|
|
285 my $Skip = ''; # only keep best blast hit per query (lowest e-value), except option 'all_hits' is given
|
|
286 open (my $Rps_Report_Fh, "<", "$Rps_Report");
|
|
287 while (<$Rps_Report_Fh>) {
|
|
288 if (/^#/) { # skip comment lines in blast report for BLAST+ "outfmt 7"
|
|
289 next;
|
|
290 }
|
|
291 chomp;
|
|
292
|
|
293 my @line = split(/\t/, $_); # split tab-separated RPS-BLAST report
|
|
294
|
|
295 if ($Skip eq $line[0] && !$Opt_All_Hits) {
|
|
296 # only keep best blast hit per query, only if option 'all_hits' is NOT set
|
|
297 # $line[0] is query id, and should be locus_tag or specific ID from multi-fasta protein query file
|
|
298 next;
|
|
299 }
|
|
300 $Skip = $line[0];
|
|
301
|
|
302 my $pssm_id = $1 if $line[1] =~ /CDD\|(\d+)/; # get PSSM-Id from the subject hit
|
|
303 my $cog = $CDDid{$pssm_id}; # get the COG# according to the PSSM-Id as listed in 'cddid.tbl'
|
|
304 print "COG: $cog \n";
|
|
305 $Cog_Stats{$cog}++; # increment hit-number for specific COG
|
|
306
|
|
307 ### Collect functional categories stats
|
|
308 my @functions = split('', $Whog{$cog}->{'function'}); # split the single-letter functional categories to count them and join them as tab-separated below
|
|
309 foreach (@functions) {
|
|
310 $Fun{$_}->{'count'}++; # increment hit-number for specific functional category
|
|
311 }
|
|
312
|
|
313 ### Print to result files
|
|
314 my $functions = join("\t", @functions); # join functional categories tab-separated
|
|
315 print $Locus_Cog_Fh "$line[0]\t$cog\t$functions\n"; # locus_tag\tCOG\tfunctional categories
|
|
316 $functions .= "\t" x (5 - @functions); # add additional tabs for COGs with fewer than five functions (which is the maximum number)
|
|
317 print $Blast_Out_Fh "$_\t$cog\t$functions\t$Whog{$cog}->{'desc'}\n"; # $_ = RPS-BLAST line
|
|
318 }
|
|
319
|
|
320 close $Rps_Report_Fh;
|
|
321 close $Blast_Out_Fh;
|
|
322 close $Locus_Cog_Fh;
|
|
323
|
|
324
|
|
325
|
|
326 ### Total COG and functional categories stats
|
|
327 print "Writing assignment statistic files in '$Out_Dir' folder ...\n"; # status message
|
|
328
|
|
329 my $Cogstats_Out = 'cog_stats.txt'; # output file for assignment numbers for each COG
|
|
330 open (my $Cog_Stats_Fh, ">", "$Out_Dir"."$Cogstats_Out");
|
|
331 my $prot_stats = 0; # store total number of query proteins, which have a COG assignment
|
|
332 foreach my $cog (sort keys %Cog_Stats) {
|
|
333 print $Cog_Stats_Fh "$cog\t$Whog{$cog}->{'desc'}\t$Cog_Stats{$cog}\n"; # COG protein descriptions stored in %Whog
|
|
334 $prot_stats += $Cog_Stats{$cog}; # sum up total COG assignments
|
|
335 }
|
|
336 close $Cog_Stats_Fh;
|
|
337
|
|
338 my $Funcstats_Out = 'func_stats.txt'; # output file for assignment numbers for each functional category
|
|
339 open (my $Func_Stats_Fh, ">", "$Out_Dir"."$Funcstats_Out");
|
|
340 my $func_cats = 0; # store total number of assigned functional categories
|
|
341 foreach my $func (sort keys %Fun) {
|
|
342 print $Func_Stats_Fh "$func\t$Fun{$func}->{'desc'}\t$Fun{$func}->{'count'}\n";
|
|
343 $func_cats += $Fun{$func}->{'count'}; # sum up total functional category assignments
|
|
344 }
|
|
345 close $Func_Stats_Fh;
|
|
346
|
|
347
|
|
348
|
|
349 ### State which files were created and print overall statistics
|
|
350 print "\n############################################################################\n";
|
|
351 print "The following tab-delimited files were created in the '$Out_Dir' directory:\n";
|
|
352 print "- $Blast_Out: COG assignments concatenated to the RPS-BLAST results for filtering\n";
|
|
353 print "- $Locus_Cog: Slimmed down '$Blast_Out' only including query id (first BLAST report column), COGs, and functional categories\n";
|
|
354 print "- $Cogstats_Out: COG assignment counts\n";
|
|
355 print "- $Funcstats_Out: Functional category assignment counts\n";
|
|
356 print "##############################################################################\n";
|
|
357 print "Overall assignment statistics:\n";
|
|
358 print "~ Total query proteins categorized into COGs: $prot_stats\n";
|
|
359 print "~ Total COGs used for the query proteins [of ", scalar keys %CDDid, " overall]: ", scalar keys %Cog_Stats, "\n";
|
|
360 print "~ Total number of assigned functional categories: $func_cats\n";
|
|
361 print "~ Total functional categories used for the query proteins [of ", scalar keys %Fun, " overall]: ", scalar grep ($Fun{$_}->{'count'} > 0, keys %Fun), "\n\n"; # grep for functional categories with a count > 0 to get the ones with assigned query proteins
|
|
362
|
|
363 exit;
|
|
364
|
|
365
|
|
366
|
|
367 ###############
|
|
368 # Subroutines #
|
|
369 ###############
|
|
370
|
|
371 ### Subroutine to parse the 'cddid.tbl', 'fun' and 'whog' file contents and store in hash structures
|
|
372 sub parse_cdd_cog {
|
|
373
|
|
374 ### 'cddid.tbl'
|
|
375 open (my $cddid_fh, "<", "$CDDid_File");
|
|
376 print "\nParsing CDDs '$CDDid_File' file ...\n"; # status message
|
|
377 while (<$cddid_fh>) {
|
|
378 chomp;
|
|
379 my @line = split(/\t/, $_); # split line at the tabs
|
|
380 if ($line[1] =~ /^COG\d{4}$/) { # search for COG CD accessions in cddid
|
|
381 $CDDid{$line[0]} = $line[1]; # hash to store info; $line[0] = PSSM-Id
|
|
382 }
|
|
383 }
|
|
384 close $cddid_fh;
|
|
385
|
|
386 ### 'fun.txt'
|
|
387 open (my $fun_fh, "<", "$Fun_File");
|
|
388 print "Parsing COGs '$Fun_File' file ...\n"; # status message
|
|
389 while (<$fun_fh>) {
|
|
390 chomp;
|
|
391 $_ =~ s/^\s*|\s+$//g; # get rid of all leading and trailing whitespaces
|
|
392 if (/^\[(\w)\]\s*(.+)$/) {
|
|
393 $Fun{$1} = {'desc' => $2, 'count' => 0}; # anonymous hash in hash
|
|
394 # $1 = single-letter functional category, $2 = description of functional category
|
|
395 # count used to find functional categories not present in the query proteins for final overall assignment statistics
|
|
396 }
|
|
397 }
|
|
398 close $fun_fh;
|
|
399
|
|
400 ### 'whog'
|
|
401 open (my $whog_fh, "<", "$Whog_File");
|
|
402 print "Parsing COGs '$Whog_File' file ...\n"; # status message
|
|
403 while (<$whog_fh>) {
|
|
404 chomp;
|
|
405 $_ =~ s/^\s*|\s+$//g; # get rid of all leading and trailing whitespaces
|
|
406 if (/^\[(\w+)\]\s*(COG\d{4})\s+(.+)$/) {
|
|
407 $Whog{$2} = {'function' => $1, 'desc' => $3}; # anonymous hash in hash
|
|
408 # $1 = single-letter functional categories, maximal five per COG (only COG5032 with five)
|
|
409 # $2 = COG#, $3 = COG protein description
|
|
410 }
|
|
411 }
|
|
412 close $whog_fh;
|
|
413
|
|
414 return 1;
|
|
415 }
|