3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<genomes_feature_table.pl> - create a feature table for genomes in EMBL and GENBANK format
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl genomes_feature_table.pl path/to/genome_dir E<gt> feature_table.tsv>
|
|
16
|
|
17 =head1 DESCRIPTION
|
|
18
|
|
19 A genome feature table lists basic stats/info (e.g. genome size, GC
|
|
20 content, coding percentage, accession number(s)) and the numbers of
|
|
21 annotated primary features (e.g. CDS, genes, RNAs) of genomes. It
|
|
22 can be used to have an overview of these features in different
|
|
23 genomes, e.g. in comparative genomics publications.
|
|
24
|
|
25 C<genomes_feature_table.pl> is designed to extract (or calculate)
|
|
26 these basic stats and B<all> annotated primary features from RichSeq
|
|
27 files (B<EMBL> or B<GENBANK> format) in a specified directory (with
|
|
28 the correct file extension, see option B<-e>). The B<default> directory
|
|
29 is the current working directory. The primary features are counted
|
|
30 and the results for each genome printed in tab-separated format. It
|
|
31 is a requirement that each file contains B<only one> genome
|
|
32 (complete or draft, with or without plasmids).
|
|
33
|
|
34 The most important features will be listed first, like genome
|
|
35 description, genome size, GC content, coding percentage (calculated
|
|
36 based on non-pseudo CDS annotation), CDS and gene numbers, accession
|
|
37 number(s) (first..last in the sequence file), RNAs (rRNA, tRNA,
|
|
38 tmRNA, ncRNA), and unresolved bases (IUPAC code 'N'). If plasmids are
|
|
39 annotated in a sequence file, the number of plasmids are counted
|
|
40 and listed as well (needs a I</plasmid="plasmid_name"> tag in the
|
|
41 I<source> primary tag, see e.g. Genbank accession number
|
|
42 L<CP009167|http://www.ncbi.nlm.nih.gov/nuccore/CP009167>). Use
|
|
43 option B<-p> to list plasmids as separate entries (lines) in the
|
|
44 feature table.
|
|
45
|
|
46 For draft genomes the number of contigs/scaffolds are counted. All
|
|
47 contigs/scaffolds of draft genomes should be marked with the I<WGS>
|
|
48 keyword (see e.g. draft NCBI Genbank entry
|
|
49 L<JSAY00000000|http://www.ncbi.nlm.nih.gov/nuccore/JSAY00000000>).
|
|
50 If this is not the case for your file(s) you can add those keywords
|
|
51 to each sequence entry with the following Perl one-liners (will edit
|
|
52 files in place). For files in B<GENBANK> format if 'KEYWORDS .' is
|
|
53 present
|
|
54
|
|
55 C<perl -i -pe 's/^KEYWORDS(\s+)\./KEYWORDS$1WGS\./' file>
|
|
56
|
|
57 or if 'KEYWORDS' isn't present at all
|
|
58
|
|
59 C<perl -i -ne 'if(/^ACCESSION/){ print; print "KEYWORDS WGS.\n";} else{ print;}' file>
|
|
60
|
|
61 For files in B<EMBL> format if 'KW .' is present
|
|
62
|
|
63 C<perl -i -pe 's/^KW(\s+)\./KW$1WGS\./' file>
|
|
64
|
|
65 or if 'KW' isn't present at all
|
|
66
|
|
67 C<perl -i -ne 'if(/^DE/){ $dw=1; print;} elsif(/^XX/ && $dw){ print; $dw=0; print "KW WGS.\n";} else{ print;}' file>
|
|
68
|
|
69 =head1 OPTIONS
|
|
70
|
|
71 =over 20
|
|
72
|
|
73 =item B<-h>, B<-help>
|
|
74
|
|
75 Help (perldoc POD)
|
|
76
|
|
77 =item B<-e>=I<str>, B<-extensions>=I<str>
|
|
78
|
|
79 File extensions to include in the analysis (EMBL or GENBANK format),
|
|
80 either comma-separated list or multiple occurences of the option
|
|
81 [default = ebl,emb,embl,gb,gbf,gbff,gbank,gbk,genbank]
|
|
82
|
|
83 =item B<-p>, B<-plasmids>
|
|
84
|
|
85 Optionally list plasmids as extra entries in the feature table, if
|
|
86 they are annotated with a I</plasmid="plasmid_name"> tag in the
|
|
87 I<source> primary tag
|
|
88
|
|
89 =item B<-v>, B<-version>
|
|
90
|
|
91 Print version number to C<STDERR>
|
|
92
|
|
93 =back
|
|
94
|
|
95 =head1 OUTPUT
|
|
96
|
|
97 =over 20
|
|
98
|
|
99 =item C<STDOUT>
|
|
100
|
|
101 The resulting feature table is printed to C<STDOUT>. Redirect or
|
|
102 pipe into another tool as needed (e.g. C<cut>, C<grep>, or C<head>).
|
|
103
|
|
104 =back
|
|
105
|
|
106 =head1 EXAMPLES
|
|
107
|
|
108 =over
|
|
109
|
|
110 =item C<perl genomes_feature_table.pl -p -e gb,gbk E<gt> feature_table_plasmids.tsv>
|
|
111
|
|
112 =item C<perl genomes_feature_table.pl path/to/genome_dir/ -e gbf -e embl E<gt> feature_table.tsv>
|
|
113
|
|
114 =back
|
|
115
|
|
116 =head1 DEPENDENCIES
|
|
117
|
|
118 =over
|
|
119
|
|
120 =item B<L<BioPerl|http://www.bioperl.org>)>
|
|
121
|
|
122 Tested with BioPerl version 1.006923
|
|
123
|
|
124 =back
|
|
125
|
|
126 =head1 VERSION
|
|
127
|
|
128 0.5 update: 14-09-2015
|
|
129 0.1 25-11-2011
|
|
130
|
|
131 =head1 AUTHOR
|
|
132
|
|
133 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
134
|
|
135 =head1 LICENSE
|
|
136
|
|
137 This program is free software: you can redistribute it and/or modify
|
|
138 it under the terms of the GNU General Public License as published by
|
|
139 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
140 License, or (at your option) any later version.
|
|
141
|
|
142 This program is distributed in the hope that it will be useful, but
|
|
143 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
144 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
145 General Public License for more details.
|
|
146
|
|
147 You should have received a copy of the GNU General Public License
|
|
148 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
149
|
|
150 =cut
|
|
151
|
|
152
|
|
153 ########
|
|
154 # MAIN #
|
|
155 ########
|
|
156
|
|
157 use warnings;
|
|
158 use strict;
|
|
159 use autodie;
|
|
160 use Getopt::Long;
|
|
161 use Pod::Usage;
|
|
162 use Bio::SeqIO; # BioPerl module to handle sequence input/output
|
|
163 use Bio::SeqFeatureI; # BioPerl module to handle features in a sequence
|
|
164
|
|
165 ### Get the options with Getopt::Long
|
|
166 my @Extensions; # optionally give list of file-extensions to search in (otherwise defaults will be set below); comma-separated or several occurences of the option
|
|
167 my $Opt_Plasmids; # optionally list plasmids in multi-seq files as extra entries in final feature table
|
|
168 my $VERSION = 0.5;
|
|
169 my ($Opt_Version, $Opt_Help);
|
|
170 GetOptions ('extensions=s' => \@Extensions,
|
|
171 'plasmids' => \$Opt_Plasmids,
|
|
172 'version' => \$Opt_Version,
|
|
173 'help|?' => \$Opt_Help);
|
|
174
|
|
175
|
|
176
|
|
177 ### Run perldoc on POD
|
|
178 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
179 die "$0 $VERSION\n" if ($Opt_Version);
|
|
180 warn "\n### Warning:\nToo many arguments given, only the path to the directory with the RichSeq files (EMBL or GENBANK format) is needed!\n" if (@ARGV > 1);
|
|
181 my $Genome_Dir = '.'; # directory with RichSeq genome files in EMBL or GENBANK format; default current directory
|
|
182 $Genome_Dir = shift if (@ARGV);
|
|
183 if (!-d $Genome_Dir) {
|
|
184 my $warning = "\n### Fatal error: Directory '$Genome_Dir' does not exist: $!\n";
|
|
185 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
186 }
|
|
187
|
|
188
|
|
189
|
|
190 ### Get optional file-extensions or set defaults
|
|
191 if (@Extensions) {
|
|
192 @Extensions = split(/,/,join(',', @Extensions)); # split (potential comma-separated lists) and join (potential several occurences) of file-extensions
|
|
193 } else {
|
|
194 @Extensions = qw(ebl emb embl gb gbf gbff gbank gbk genbank); # default file extensions
|
|
195 }
|
|
196 my $Extensions = join('|', @Extensions); # join for regex below
|
|
197
|
|
198
|
|
199
|
|
200 ### Save all primary features from all seq-files (to include all possibilities) and count them for each genome/replicon individually
|
|
201 my $File_Count = 0; # Count number of files
|
|
202 my %Strain_Features; # hash of hash; store all primary features of each strain, counted; additional genome/plasmid description (DEFINITION line), sequence length, gc_content, coding percentage, accession numbers, contigs/scaffolds (for drafts), unresolved bases ('n/N's), and plasmids (without option '-p')
|
|
203
|
|
204 opendir (my $Dir_Fh, $Genome_Dir);
|
|
205 while (my $file = readdir($Dir_Fh)) {
|
|
206 if ($file =~ /.+\.($Extensions)$/) { # only include lines with given files extensions
|
|
207 $File_Count++;
|
|
208 my $Id; # filename used as unique ID (optionally with appended plasmid name) for hashes; need to declare here for checks after 'while next_seq' loop
|
|
209
|
|
210 my $seqio_obj = Bio::SeqIO->new(-file => "<$Genome_Dir/$file");
|
|
211 die "\n### Fatal error: File '$file' is not a RichSeq file in EMBL or GENBANK format!\n" if (ref($seqio_obj) !~ /Bio\:\:SeqIO\:\:[genbank|embl]/); # check if correct file format; Bio::SeqIO::genbank or Bio::SeqIO::embl object
|
|
212
|
|
213 my %concat_seq; # store the concatenated sequences for all replicons or contigs/scaffolds of a genome with key '$Id'; only GC-content and unresolved Ns saved in '%Strain_Features', thus extra hash declared here to remove content and save memory for each file
|
|
214 my %coding_bases; # base count to subsequently calculate coding percentage with key '$Id'; only coding percentage saved in '%Strain_Features', thus extra hash
|
|
215 my %plasmid_count; # count the number of plasmids for current genome; hash with key as plasmid name needed for draft plasmids with SEVERAL contigs for ONE plasmid (with the same plasmid name of course) in one multi seq-file
|
|
216 my $desc; # '$seq_obj->desc' in case of only-plasmid seq-file but not option '-p' use the (last) plasmid description for feature table
|
|
217
|
|
218 my $seq_count = 0;
|
|
219 while (my $seq_obj = $seqio_obj->next_seq) { # for multi-seq files with several seq entries (e.g. several replicons or contigs/scaffolds)
|
|
220 $seq_count++;
|
|
221 $Id = $file; # set unique $Id, based on filename; has to be set for every seq entry in case of plasmids and option '-p' (see below)
|
|
222 warn "\n### Warning:\nMore than 10 sequence entries in file '$file', but no 'WGS' keyword found. Sure this is not a draft genome? Otherwise please see the help with option '-h'!\n\n" if ($seq_count == 10 && !$Strain_Features{$Id}{'contigs_scaffolds'});
|
|
223
|
|
224 my ($plasmid) = grep { $_->primary_tag eq 'source' && $_->has_tag('plasmid') } $seq_obj->get_SeqFeatures; # check if current seq entry is a plasmid, needs '/plasmid="plasmid_name"' tag in primary tag 'source'; $plasmid contains now a primary tag 'source' feat_obj
|
|
225 ($plasmid) = $plasmid->get_tag_values('plasmid') if ($plasmid); # get plasmid name; replace the 'feat_obj' in $plasmid with the tag value of '/plasmid'
|
|
226 $plasmid_count{$plasmid} = 1 if ($plasmid); # count number of plasmids
|
|
227
|
|
228 $Id .= $plasmid if ($Opt_Plasmids && $plasmid); # for plasmids with option '-p' concatenate $Id with plasmid name
|
|
229
|
|
230 $desc = $seq_obj->desc;
|
|
231 $desc =~ s/(\s*(DNA|chromosome)*, complete genome\.*|\s*(chromosome)*, complete sequence\.*|\s*complete genome, strain.*|\s*, whole genome shotgun sequence\.*|\.)$// if ($desc); # shorten strain description
|
|
232 $Strain_Features{$Id}{'desc'} = $desc if ((!$Opt_Plasmids && !$plasmid) || $Opt_Plasmids); # don't use a plasmid description without '-p'; with '-p' take all descs; overwrite until last seq entry
|
|
233
|
|
234 $Strain_Features{$Id}{'contigs_scaffolds'}++ if ($seq_obj->keywords =~ /WGS/); # count contigs/scaffolds and indicate that draft
|
|
235
|
|
236 if (!$concat_seq{$Id}) { # first seq entry for $Id in potential multi-seq file ('!$Strain_Features{$Id}{'length'}' or '!$Strain_Features{$Id}{'first_acc'}' can also be used)
|
|
237 $Strain_Features{$Id}{'first_acc'} = $seq_obj->accession_number;
|
|
238 print STDERR "\nProcessing ";
|
|
239 if ($Strain_Features{$Id}{'contigs_scaffolds'}) {
|
|
240 print STDERR "draft contigs/scaffolds ";
|
|
241 } else {
|
|
242 print STDERR "complete replicon(s) ";
|
|
243 }
|
|
244 print STDERR "in '$file': $desc; $Strain_Features{$Id}{'first_acc'}\n";
|
|
245 $Strain_Features{$Id}{'length'} = $seq_obj->length;
|
|
246 $concat_seq{$Id} = $seq_obj->seq; # save the sequence to concatenate to potential following seq entries of same $Id in multi-seq files
|
|
247 $coding_bases{$Id} = 0; # need to declare for addition below
|
|
248 } else { # further contigs/scaffolds of drafts or multi-seq complete genome without option '-p'
|
|
249 $Strain_Features{$Id}{'last_acc'} = $seq_obj->accession_number; # give the range of acc#s for multi-seq files; overwrite until last sequence
|
|
250 $Strain_Features{$Id}{'length'} += $seq_obj->length; # add all previous lengths to the current for '$Id'
|
|
251 $concat_seq{$Id} .= $seq_obj->seq; # concatenate all sequences for '$Id'
|
|
252 }
|
|
253
|
|
254
|
|
255 ($Strain_Features{$Id}{'gc'}, $Strain_Features{$Id}{'unresolved_n'}) = gc_content($concat_seq{$Id}); # subroutine to calculate the GC-content and count 'N's in seq; unfortunately has to be calculated every time (don't know what is last seq entry for each '$Id' or plasmid/chromosome etc. in file)
|
|
256
|
|
257
|
|
258 foreach my $feat_obj ($seq_obj->get_SeqFeatures) {
|
|
259 if ($feat_obj->primary_tag eq 'source') { # exclude source primary tag, has no feature annotation
|
|
260 next;
|
|
261 } elsif ($feat_obj->primary_tag eq 'gene') { # count 'gene' primary tags
|
|
262 $Strain_Features{$Id}{'gene'}++;
|
|
263 $Strain_Features{$Id}{'pseudo_gene'}++ if ($feat_obj->has_tag('pseudo'));
|
|
264 } elsif ($feat_obj->primary_tag eq 'CDS') {
|
|
265 $Strain_Features{$Id}{'CDS'}++;
|
|
266 if ($feat_obj->has_tag('pseudo')) {
|
|
267 $Strain_Features{$Id}{'pseudo_CDS'}++;
|
|
268 } elsif (!$feat_obj->has_tag('pseudo')) {
|
|
269 $coding_bases{$Id} = ($feat_obj->location->end - $feat_obj->location->start) + 1 + $coding_bases{$Id}; # coding_perc only calculated based on non-pseudo CDS annotation
|
|
270 }
|
|
271 } else { # get ALL the other primary tags, shouldn't have 'pseudo' tags
|
|
272 $Strain_Features{$Id}{$feat_obj->primary_tag}++;
|
|
273 }
|
|
274 }
|
|
275
|
|
276 $Strain_Features{$Id}{'coding_perc'} = ($coding_bases{$Id}/$Strain_Features{$Id}{'length'})*100;
|
|
277 $Strain_Features{$Id}{'coding_perc'} = sprintf("%.2f", $Strain_Features{$Id}{'coding_perc'}); # round percentage to two decimals
|
|
278 $Strain_Features{$Id}{'coding_perc'} = 100 if ($Strain_Features{$Id}{'coding_perc'} > 100); # > 100 can happen with very small replicons/contigs having overlapping CDSs
|
|
279 }
|
|
280
|
|
281 $Strain_Features{$Id}{'plasmids'} = keys %plasmid_count if (!$Opt_Plasmids && keys %plasmid_count > 0); # number of plasmids in the current genome, only needed without option '-p'
|
|
282 $Strain_Features{$Id}{'desc'} = $desc if (!$Strain_Features{$Id}{'desc'}); # in case of plasmid-only seq-files and not option '-p'
|
|
283
|
|
284 #die "\n### Fatal error:\nFile '$file' contains a complete genome with several replicons. However, more than one replicon (which should only be the chromosome) doesn't have a '/plasmid=\"plasmid_name\"' tag in primary tag 'source'.\nIf it is a complete genome include these tags, or if it is a draft genome include the 'WGS' keyword in ALL contigs/scaffolds as described in the help (option '-h')!\n" if ($seq_count - keys %plasmid_count > 1 && !$Strain_Features{$Id}{'contigs_scaffolds'}); # too strict? Also, overlaps with warn 'Warning:\nMore than 10 sequence' above!
|
|
285 #die "\n### Fatal error:\nFile '$file' contains a draft genome with only one contig.\nIf it is a complete genome remove the 'WGS' keyword from the only present replicon (see '-h')!\n" if ($seq_count == 1 && $Strain_Features{$Id}{'contigs_scaffolds'}); # too strict? e.g. uncircularized single-contig draft seq files
|
|
286 }
|
|
287 }
|
|
288 closedir $Dir_Fh;
|
|
289 die "\n### Fatal error: No EMBL or GENBANK format files found!\n" if (!$File_Count);
|
|
290
|
|
291
|
|
292
|
|
293 ### Get all annotated primary features present in all files
|
|
294 my %Primary_Features;
|
|
295 foreach my $id (keys %Strain_Features) {
|
|
296 foreach (keys %{$Strain_Features{$id}}) { # de-references hash of hash
|
|
297 $Primary_Features{$_} = 1;
|
|
298 }
|
|
299 }
|
|
300
|
|
301
|
|
302
|
|
303 ### Print header of output with all existent primary features
|
|
304 print "# name (last contig/scaffold for drafts)\tsize [bp]\tGC-content [%]\tcoding percentage [%]\tCDS (pseudo)\tgenes (pseudo)\trRNA\ttRNA\ttmRNA\tncRNA\taccession (first..last)\tcontigs/scaffolds\tunresolved bases (Ns)\t"; # specific print order for the most common primary features, '#' indicates comment line
|
|
305 my $Common_Primary_Feats = 'desc|length|gc|coding_perc|CDS|pseudo_CDS|gene|pseudo_gene|rRNA|tRNA|tmRNA|ncRNA|first_acc|last_acc|contigs_scaffolds|unresolved_n'; # the common primaries for the regexs and 'split' for print out below
|
|
306
|
|
307 my $Common_Primary_Count = grep(/^($Common_Primary_Feats)$/, keys %Primary_Features); # count occurence of common primary features (not all present in all seq entries) for tab or newline print below
|
|
308 my $i = 0;
|
|
309 foreach (sort keys %Primary_Features) { # print the residual primary features
|
|
310 if ($_ =~ /^($Common_Primary_Feats)$/) { # exclude the print order ones
|
|
311 next;
|
|
312 } else {
|
|
313 $i++;
|
|
314 print "$_";
|
|
315 print "\t" if ($i < (keys %Primary_Features) - $Common_Primary_Count); # don't print tab for last element (but newline below)
|
|
316 }
|
|
317 }
|
|
318 print "\n";
|
|
319
|
|
320 print "# NA = tag, key, or qualifier not existent\n"; # comment line
|
|
321
|
|
322
|
|
323
|
|
324 ### Print the primary features for each strain
|
|
325 foreach my $id (sort {$Strain_Features{$a}{'desc'} cmp $Strain_Features{$b}{'desc'}} keys %Strain_Features) {
|
|
326 foreach (split(/\|/, $Common_Primary_Feats)) { # split '$Common_Primary_Feats' string to print according to print order above
|
|
327 print " (" if (/^pseudo/); # 'pseudo_CDS' and 'pseudo_gene' are printed in parantheses
|
|
328 if ($Strain_Features{$id}{$_}) { # print value only if defined
|
|
329 print ".." if (/last_acc/); # for 'first_acc..last_acc'
|
|
330 if ($Strain_Features{$id}{$_} =~ /unknown/) { # BioPerl returns 'unknown' for non-existent acc-nr
|
|
331 print "NA";
|
|
332 } else {
|
|
333 print "$Strain_Features{$id}{$_}";
|
|
334 }
|
|
335 } else { # feature not defined for this '$Id'
|
|
336 next if ($_ =~ /last_acc/); # skip 'last_acc' if not defined (otherwise will print 'NA' for it)
|
|
337 print "NA";
|
|
338 }
|
|
339 print ")" if (/^pseudo/); # 'pseudo_CDS' and 'pseudo_gene' are printed in parantheses
|
|
340 print "\t" unless (($_ =~ /^(CDS|gene)$/) || ($_ =~ /first_acc/ && $Strain_Features{$id}{'last_acc'})); # CDS and gene are followed by parantheses for possible pseudos, 'first_acc' might be followed by '..last_acc'; thus no tab for these
|
|
341 }
|
|
342
|
|
343 $i = 0;
|
|
344 foreach (sort keys %Primary_Features) { # print the residual primary features
|
|
345 if ($_ =~ /^($Common_Primary_Feats)$/) { # exclude the above ones
|
|
346 next;
|
|
347 } elsif ($Strain_Features{$id}{$_}) {
|
|
348 $i++;
|
|
349 print "$Strain_Features{$id}{$_}";
|
|
350 } else {
|
|
351 $i++;
|
|
352 print "NA";
|
|
353 }
|
|
354 print "\t" if ($i < (keys %Primary_Features) - $Common_Primary_Count); # don't print tab for last element
|
|
355 }
|
|
356
|
|
357 print "\n"; # newline for each line in the feature table
|
|
358 }
|
|
359
|
|
360 print STDERR "\n$File_Count RichSeq file(s) was/were processed!\n";
|
|
361
|
|
362 exit;
|
|
363
|
|
364
|
|
365
|
|
366 ###############
|
|
367 # Subroutines #
|
|
368 ###############
|
|
369
|
|
370 ### Subroutine to calculate GC-content
|
|
371 sub gc_content {
|
|
372 my $seq = shift;
|
|
373 my $a = ($seq =~ tr/[aA]//); # transliterations don't accept modifiers like case-insensitive 'i'
|
|
374 my $c = ($seq =~ tr/[cC]//);
|
|
375 my $g = ($seq =~ tr/[gG]//);
|
|
376 my $t = ($seq =~ tr/[tT]//);
|
|
377 my $n = ($seq =~ tr/[nN]//);
|
|
378 my $gc_content = (($c + $g)/($a + $c + $g + $t))*100;
|
|
379 $gc_content = sprintf("%.2f", $gc_content); # round percentage to two decimals
|
|
380 return ($gc_content, $n);
|
|
381 }
|