Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/cdd2cog/cdd2cog.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 C<cdd2cog.pl> - assign COG categories to protein sequences | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl cdd2cog.pl -r rps-blast.out -c cddid.tbl -f fun.txt -w whog> | |
16 | |
17 =head1 DESCRIPTION | |
18 | |
19 The script assigns COG (L<cluster of orthologous | |
20 groups|http://www.ncbi.nlm.nih.gov/COG/>) categories to proteins. | |
21 For this purpose, the query proteins need to be blasted with | |
22 RPS-BLAST+ (L<Reverse Position-Specific BLAST|http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download>) | |
23 against NCBI's Conserved Domain Database | |
24 (L<CDD|http://www.ncbi.nlm.nih.gov/cdd>). Use | |
25 L<C<cds_extractor.pl>|/cds_extractor> beforehand to extract multi-fasta | |
26 protein files from GENBANK or EMBL files. | |
27 | |
28 Both tab-delimited RPS-BLAST+ outformats, B<-outfmt 6> and B<-outfmt | |
29 7>, can be processed by C<cdd2cog.pl>. By default, RPS-BLAST+ hits | |
30 for each query protein are filtered for the best hit (lowest | |
31 e-value). Use option B<-a|all_hits> to assign COGs to all BLAST hits | |
32 and e.g. do a downstream filtering in a spreadsheet application. | |
33 Results are written to tab-delimited files in the F<./results> | |
34 folder, overall assignment statistics are printed to C<STDOUT>. | |
35 | |
36 Several files are needed from NCBI's FTP server to run the RPS-BLAST+ | |
37 and C<cdd2cog.pl>: | |
38 | |
39 =over | |
40 | |
41 =item 1.) L<CDD|ftp://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/> | |
42 | |
43 More information about the files in the CDD FTP archive can be found | |
44 in the respective F<README> file. | |
45 | |
46 =item 1.1.) F<cddid.tbl.gz> | |
47 | |
48 The file needs to be unpacked: | |
49 | |
50 C<gunzip cddid.tbl.gz> | |
51 | |
52 Contains summary information about the CD models in a tab-delimited | |
53 format. The columns are: PSSM-Id, CD accession (e.g. COG#), CD short | |
54 name, CD description, and PSSM (position-specific scoring matrices) | |
55 length. | |
56 | |
57 =item 1.2.) F<./little_endian/Cog_LE.tar.gz> | |
58 | |
59 Unpack and untar via: | |
60 | |
61 C<tar xvfz Cog_LE.tar.gz> | |
62 | |
63 Preformatted RPS-BLAST+ database of the CDD COG distribution for | |
64 Intel CPUs and Unix/Windows architectures. | |
65 | |
66 =item 2.) L<COG|ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG/> | |
67 | |
68 Read F<readme> for more information about the respective files in | |
69 the COG FTP archive. | |
70 | |
71 =item 2.1.) F<fun.txt> | |
72 | |
73 One-letter functional classification used in the COG database. | |
74 | |
75 =item 2.2.) F<whog> | |
76 | |
77 Name, description, and corresponding functional classification of | |
78 each COG. | |
79 | |
80 =back | |
81 | |
82 =head1 OPTIONS | |
83 | |
84 =head2 Mandatory options | |
85 | |
86 =over 20 | |
87 | |
88 =item B<-r>=I<str>, B<-rps_report>=I<str> | |
89 | |
90 Path to RPS-BLAST+ report/output, outfmt 6 or 7 | |
91 | |
92 =item B<-c>=I<str>, B<-cddid>=I<str> | |
93 | |
94 Path to CDD's F<cddid.tbl> file | |
95 | |
96 =item B<-f>=I<str>, B<-fun>=I<str> | |
97 | |
98 Path to COG's F<fun.txt> file | |
99 | |
100 =item B<-w>=I<str>, B<-whog>=I<str> | |
101 | |
102 Path to COG's F<whog> file | |
103 | |
104 =back | |
105 | |
106 =head2 Optional options | |
107 | |
108 =over 20 | |
109 | |
110 =item B<-h>, B<-help> | |
111 | |
112 Help (perldoc POD) | |
113 | |
114 =item B<-a>, B<-all_hits> | |
115 | |
116 Don't filter RPS-BLAST+ output for the best hit, rather assign COGs | |
117 to all hits | |
118 | |
119 =item B<-v>, B<-version> | |
120 | |
121 Print version number to C<STDERR> | |
122 | |
123 =back | |
124 | |
125 =head1 OUTPUT | |
126 | |
127 =over 20 | |
128 | |
129 =item C<STDOUT> | |
130 | |
131 Overall assignment statistics | |
132 | |
133 =item F<./results> | |
134 | |
135 All tab-delimited output files are stored in this result folder | |
136 | |
137 =item F<rps-blast_cog.txt> | |
138 | |
139 COG assignments concatenated to the RPS-BLAST+ results for filtering | |
140 | |
141 =item F<protein-id_cog.txt> | |
142 | |
143 Slimmed down F<rps-blast_cog.txt> only including query id (first | |
144 BLAST report column), COGs, and functional categories | |
145 | |
146 =item F<cog_stats.txt> | |
147 | |
148 Assignment counts for each used COG | |
149 | |
150 =item F<func_stats.txt> | |
151 | |
152 Assignment counts for single-letter functional categories | |
153 | |
154 =back | |
155 | |
156 =head1 EXAMPLES | |
157 | |
158 =head2 RPS-BLAST+ | |
159 | |
160 =over | |
161 | |
162 =item C<rpsblast -query protein.fasta -db Cog -out rps-blast.out | |
163 -evalue 1e-2 -outfmt 6> | |
164 | |
165 =item C<rpsblast -query protein.fasta -db Cog -out rps-blast.out | |
166 -evalue 1e-2 -outfmt '7 qseqid sseqid pident length mismatch gapopen | |
167 qstart qend sstart send evalue bitscore qcovs'> | |
168 | |
169 =back | |
170 | |
171 =head2 C<cdd2cog.pl> | |
172 | |
173 =over | |
174 | |
175 =item C<perl cdd2cog.pl -r rps-blast.out -c cddid.tbl -f fun.txt | |
176 -w whog -a> | |
177 | |
178 =back | |
179 | |
180 =head1 VERSION | |
181 | |
182 0.2 update: 2017-02-16 | |
183 0.1 2013-08-01 | |
184 | |
185 =head1 AUTHOR | |
186 | |
187 Andreas Leimbach aleimba[at]gmx[dot]de | |
188 | |
189 =head1 ACKNOWLEDGEMENTS | |
190 | |
191 I got the idea for using NCBI's CDD PSSMs for COG assignment from JGI's L<IMG/ER annotation | |
192 system|http://img.jgi.doe.gov/>, which employes the same technique. | |
193 | |
194 | |
195 =head1 LICENSE | |
196 | |
197 This program is free software: you can redistribute it and/or modify | |
198 it under the terms of the GNU General Public License as published by | |
199 the Free Software Foundation; either version 3 (GPLv3) of the | |
200 License, or (at your option) any later version. | |
201 | |
202 This program is distributed in the hope that it will be useful, but | |
203 WITHOUT ANY WARRANTY; without even the implied warranty of | |
204 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
205 General Public License for more details. | |
206 | |
207 You should have received a copy of the GNU General Public License | |
208 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
209 | |
210 =cut | |
211 | |
212 | |
213 ######## | |
214 # MAIN # | |
215 ######## | |
216 | |
217 use strict; | |
218 use warnings; | |
219 use autodie; | |
220 use Getopt::Long; | |
221 use Pod::Usage; | |
222 | |
223 | |
224 ### Get the options with Getopt::Long | |
225 my $Rps_Report; # path to the rps-blast report/output | |
226 my $CDDid_File; # path to the CDD 'cddid.tbl' file | |
227 my $Fun_File; # path to the COG 'fun' file | |
228 my $Whog_File; # path to the COG 'whog' file | |
229 my $Opt_All_Hits; # give all blast hits for a query, not just the best (lowest evalue) | |
230 my $VERSION = 0.1; | |
231 my ($Opt_Version, $Opt_Help); | |
232 GetOptions ('rps_report=s' => \$Rps_Report, | |
233 'cddid=s' => \$CDDid_File, | |
234 'fun=s' => \$Fun_File, | |
235 'whog=s' => \$Whog_File, | |
236 'all_hits' => \$Opt_All_Hits, | |
237 'version' => \$Opt_Version, | |
238 'help|?' => \$Opt_Help); | |
239 | |
240 | |
241 | |
242 ### Run perldoc on POD | |
243 pod2usage(-verbose => 2) if ($Opt_Help); | |
244 die "$0 $VERSION\n" if ($Opt_Version); | |
245 if (!$Rps_Report || !$CDDid_File || !$Fun_File || !$Whog_File) { | |
246 my $warning = "\n### Fatal error: Option(s) or arguments for '-r', '-c', '-f', or '-w' are missing!\n"; | |
247 pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | |
248 } | |
249 | |
250 | |
251 | |
252 ### Parse the 'cddid.tbl', 'fun.txt' and 'whog' file contents and store info in hash structures | |
253 my (%CDDid, %Fun, %Whog); # global hashes | |
254 parse_cdd_cog(); # subroutine | |
255 | |
256 | |
257 | |
258 ### Create results directory for output files | |
259 my $Out_Dir = './results/'; | |
260 if (-e $Out_Dir) { | |
261 print "###Directory '$Out_Dir' already exists! Replace the directory and all its contents [y|n]? "; | |
262 my $user_ask = <STDIN>; | |
263 if ($user_ask =~ /y/i) { | |
264 unlink glob "$Out_Dir*"; # remove all files in results directory | |
265 rmdir $Out_Dir; # remove the empty directory | |
266 } else { | |
267 die "Script abborted!\n"; | |
268 } | |
269 } | |
270 mkdir $Out_Dir or die "Can't create directory \"$Out_Dir\": $!\n"; | |
271 | |
272 | |
273 | |
274 ### Parse the rps-blast report/output file and assign COGs | |
275 my %Cog_Stats; # store the total number of query protein hits for each COG, written to '$Cogstats_Out' below | |
276 | |
277 my $Blast_Out = 'rps-blast_cog.txt'; # output file for COG assignments appended to RPS-BLAST results | |
278 open (my $Blast_Out_Fh, ">", "$Out_Dir"."$Blast_Out"); | |
279 print $Blast_Out_Fh "query id\tsubject id\t% identity\talignment length\tmismatches\tgap opens\tq. start\tq. end\ts. start\ts. end\tevalue\tbit score\tCOG#\tfunctional categories\t\t\t\t\tCOG protein description\n"; # header for $Blast_Out | |
280 | |
281 my $Locus_Cog = "protein-id_cog.txt"; # slimmed down $Blast_Out only including locus_tags, COGs, and functional categories | |
282 open (my $Locus_Cog_Fh, ">", "$Out_Dir"."$Locus_Cog"); | |
283 | |
284 print "Parsing RPS-BLAST report ...\n"; # status message | |
285 my $Skip = ''; # only keep best blast hit per query (lowest e-value), except option 'all_hits' is given | |
286 open (my $Rps_Report_Fh, "<", "$Rps_Report"); | |
287 while (<$Rps_Report_Fh>) { | |
288 if (/^#/) { # skip comment lines in blast report for BLAST+ "outfmt 7" | |
289 next; | |
290 } | |
291 chomp; | |
292 | |
293 my @line = split(/\t/, $_); # split tab-separated RPS-BLAST report | |
294 | |
295 if ($Skip eq $line[0] && !$Opt_All_Hits) { | |
296 # only keep best blast hit per query, only if option 'all_hits' is NOT set | |
297 # $line[0] is query id, and should be locus_tag or specific ID from multi-fasta protein query file | |
298 next; | |
299 } | |
300 $Skip = $line[0]; | |
301 | |
302 my $pssm_id = $1 if $line[1] =~ /CDD\|(\d+)/; # get PSSM-Id from the subject hit | |
303 my $cog = $CDDid{$pssm_id}; # get the COG# according to the PSSM-Id as listed in 'cddid.tbl' | |
304 print "COG: $cog \n"; | |
305 $Cog_Stats{$cog}++; # increment hit-number for specific COG | |
306 | |
307 ### Collect functional categories stats | |
308 my @functions = split('', $Whog{$cog}->{'function'}); # split the single-letter functional categories to count them and join them as tab-separated below | |
309 foreach (@functions) { | |
310 $Fun{$_}->{'count'}++; # increment hit-number for specific functional category | |
311 } | |
312 | |
313 ### Print to result files | |
314 my $functions = join("\t", @functions); # join functional categories tab-separated | |
315 print $Locus_Cog_Fh "$line[0]\t$cog\t$functions\n"; # locus_tag\tCOG\tfunctional categories | |
316 $functions .= "\t" x (5 - @functions); # add additional tabs for COGs with fewer than five functions (which is the maximum number) | |
317 print $Blast_Out_Fh "$_\t$cog\t$functions\t$Whog{$cog}->{'desc'}\n"; # $_ = RPS-BLAST line | |
318 } | |
319 | |
320 close $Rps_Report_Fh; | |
321 close $Blast_Out_Fh; | |
322 close $Locus_Cog_Fh; | |
323 | |
324 | |
325 | |
326 ### Total COG and functional categories stats | |
327 print "Writing assignment statistic files in '$Out_Dir' folder ...\n"; # status message | |
328 | |
329 my $Cogstats_Out = 'cog_stats.txt'; # output file for assignment numbers for each COG | |
330 open (my $Cog_Stats_Fh, ">", "$Out_Dir"."$Cogstats_Out"); | |
331 my $prot_stats = 0; # store total number of query proteins, which have a COG assignment | |
332 foreach my $cog (sort keys %Cog_Stats) { | |
333 print $Cog_Stats_Fh "$cog\t$Whog{$cog}->{'desc'}\t$Cog_Stats{$cog}\n"; # COG protein descriptions stored in %Whog | |
334 $prot_stats += $Cog_Stats{$cog}; # sum up total COG assignments | |
335 } | |
336 close $Cog_Stats_Fh; | |
337 | |
338 my $Funcstats_Out = 'func_stats.txt'; # output file for assignment numbers for each functional category | |
339 open (my $Func_Stats_Fh, ">", "$Out_Dir"."$Funcstats_Out"); | |
340 my $func_cats = 0; # store total number of assigned functional categories | |
341 foreach my $func (sort keys %Fun) { | |
342 print $Func_Stats_Fh "$func\t$Fun{$func}->{'desc'}\t$Fun{$func}->{'count'}\n"; | |
343 $func_cats += $Fun{$func}->{'count'}; # sum up total functional category assignments | |
344 } | |
345 close $Func_Stats_Fh; | |
346 | |
347 | |
348 | |
349 ### State which files were created and print overall statistics | |
350 print "\n############################################################################\n"; | |
351 print "The following tab-delimited files were created in the '$Out_Dir' directory:\n"; | |
352 print "- $Blast_Out: COG assignments concatenated to the RPS-BLAST results for filtering\n"; | |
353 print "- $Locus_Cog: Slimmed down '$Blast_Out' only including query id (first BLAST report column), COGs, and functional categories\n"; | |
354 print "- $Cogstats_Out: COG assignment counts\n"; | |
355 print "- $Funcstats_Out: Functional category assignment counts\n"; | |
356 print "##############################################################################\n"; | |
357 print "Overall assignment statistics:\n"; | |
358 print "~ Total query proteins categorized into COGs: $prot_stats\n"; | |
359 print "~ Total COGs used for the query proteins [of ", scalar keys %CDDid, " overall]: ", scalar keys %Cog_Stats, "\n"; | |
360 print "~ Total number of assigned functional categories: $func_cats\n"; | |
361 print "~ Total functional categories used for the query proteins [of ", scalar keys %Fun, " overall]: ", scalar grep ($Fun{$_}->{'count'} > 0, keys %Fun), "\n\n"; # grep for functional categories with a count > 0 to get the ones with assigned query proteins | |
362 | |
363 exit; | |
364 | |
365 | |
366 | |
367 ############### | |
368 # Subroutines # | |
369 ############### | |
370 | |
371 ### Subroutine to parse the 'cddid.tbl', 'fun' and 'whog' file contents and store in hash structures | |
372 sub parse_cdd_cog { | |
373 | |
374 ### 'cddid.tbl' | |
375 open (my $cddid_fh, "<", "$CDDid_File"); | |
376 print "\nParsing CDDs '$CDDid_File' file ...\n"; # status message | |
377 while (<$cddid_fh>) { | |
378 chomp; | |
379 my @line = split(/\t/, $_); # split line at the tabs | |
380 if ($line[1] =~ /^COG\d{4}$/) { # search for COG CD accessions in cddid | |
381 $CDDid{$line[0]} = $line[1]; # hash to store info; $line[0] = PSSM-Id | |
382 } | |
383 } | |
384 close $cddid_fh; | |
385 | |
386 ### 'fun.txt' | |
387 open (my $fun_fh, "<", "$Fun_File"); | |
388 print "Parsing COGs '$Fun_File' file ...\n"; # status message | |
389 while (<$fun_fh>) { | |
390 chomp; | |
391 $_ =~ s/^\s*|\s+$//g; # get rid of all leading and trailing whitespaces | |
392 if (/^\[(\w)\]\s*(.+)$/) { | |
393 $Fun{$1} = {'desc' => $2, 'count' => 0}; # anonymous hash in hash | |
394 # $1 = single-letter functional category, $2 = description of functional category | |
395 # count used to find functional categories not present in the query proteins for final overall assignment statistics | |
396 } | |
397 } | |
398 close $fun_fh; | |
399 | |
400 ### 'whog' | |
401 open (my $whog_fh, "<", "$Whog_File"); | |
402 print "Parsing COGs '$Whog_File' file ...\n"; # status message | |
403 while (<$whog_fh>) { | |
404 chomp; | |
405 $_ =~ s/^\s*|\s+$//g; # get rid of all leading and trailing whitespaces | |
406 if (/^\[(\w+)\]\s*(COG\d{4})\s+(.+)$/) { | |
407 $Whog{$2} = {'function' => $1, 'desc' => $3}; # anonymous hash in hash | |
408 # $1 = single-letter functional categories, maximal five per COG (only COG5032 with five) | |
409 # $2 = COG#, $3 = COG protein description | |
410 } | |
411 } | |
412 close $whog_fh; | |
413 | |
414 return 1; | |
415 } |