3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<tbl2tab.pl> - convert tbl to tab-separated format and back
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl tbl2tab.pl -m tbl2tab -i feature_table.tbl -s -l locus_prefix>
|
|
16
|
|
17 B<or>
|
|
18
|
|
19 C<perl tbl2tab.pl -m tab2tbl -i feature_table.tab -g -l locus_prefix
|
|
20 -p "gnl|dbname|">
|
|
21
|
|
22 =head1 DESCRIPTION
|
|
23
|
|
24 NCBI's feature table (B<tbl>) format is needed for the submission of
|
|
25 genomic data to GenBank with the NCBI tools
|
|
26 L<Sequin|http://www.ncbi.nlm.nih.gov/Sequin/> or
|
|
27 L<tbl2asn|http://www.ncbi.nlm.nih.gov/genbank/tbl2asn2>. tbl files
|
|
28 can be created with automatic annotation systems like
|
|
29 L<Prokka|http://www.vicbioinformatics.com/software.prokka.shtml>.
|
|
30 C<tbl2tab.pl> can convert a tbl file to a tab-separated format (tab)
|
|
31 and back to the tbl format. The tab-delimited format is useful to
|
|
32 manipulate the data more comfortably in a spreadsheet software (e.g.
|
|
33 LibreOffice or MS Excel). For a conversion back to tbl format save
|
|
34 the file in the spreadsheet software as a tab-delimited text file.
|
|
35 The script is intended for microbial genomes, but might also be
|
|
36 useful for eukaryotes.
|
|
37
|
|
38 Regular expressions are applied in mode B<tbl2tab> to correct gene
|
|
39 names and words in '/product' values to lowercase initials (with
|
|
40 the exception of 'Rossman' and 'Willebrand'). The resulting tab file
|
|
41 can then be used to check for possible errors.
|
|
42
|
|
43 The first four header columns of the B<tab> format are mandatory,
|
|
44 'seq_id' for the SeqID, and for each primary tag/feature (e.g. CDS,
|
|
45 RNAs, repeat_region etc.), 'start', 'stop', and 'primary_tag'. These
|
|
46 mandatory columns have to be filled in every row in the tab file.
|
|
47 All the following columns will be included as tags/qualifiers (e.g.
|
|
48 '/locus_tag', '/product', '/EC_number', '/note' etc.) in the
|
|
49 conversion to the tbl file if a value is present.
|
|
50
|
|
51 There are three special cases:
|
|
52
|
|
53 B<First>, '/pseudo' will be included as a tag if I<any> value (the
|
|
54 script uses 'T' for true) is present in the B<tab> format. If a
|
|
55 primary tag is indicated as pseudo both the primary tag and the
|
|
56 accessory 'gene' primary tag (for CDS/RNA features with option
|
|
57 B<-g>) will include a '/pseudo' qualifier in the resulting B<tbl>
|
|
58 file. B<Pseudo-genes> are indicated by 'pseudo' in the 'primary_tag'
|
|
59 column, thus the 'pseudo' column is ignored in these cases.
|
|
60
|
|
61 B<Second>, tag '/gene_desc' is reserved for the 'product' values of
|
|
62 pseudo-genes, thus a 'gene_desc' column in a tab file will be
|
|
63 ignored in the conversion to tbl.
|
|
64
|
|
65 B<Third>, column 'protein_id' in a tab file will also be ignored in
|
|
66 the conversion. '/protein_id' values are created from option B<-p>
|
|
67 and the locus_tag for each CDS primary feature.
|
|
68
|
|
69 Furthermore, with option B<-s> G2L-style spreadsheet formulas
|
|
70 (L<Goettingen Genomics
|
|
71 Laboratory|http://appmibio.uni-goettingen.de/>) can be included with
|
|
72 additional columns, 'spreadsheet_locus_tag', 'position', 'distance',
|
|
73 'gene_number', and 'contig_order'. These columns will not be
|
|
74 included in a conversion to the tbl format. Thus, if you want to
|
|
75 include e.g. the locus_tags from the formula in column
|
|
76 'spreadsheet_locus_tag' in the resulting tbl file copy the B<values>
|
|
77 to the column 'locus_tag'!
|
|
78
|
|
79 To illustrate the process two example files are included in the
|
|
80 repository, F<example.tbl> and F<example2.tab>, which are
|
|
81 interconvertible (see L</"EXAMPLES"> below).
|
|
82
|
|
83 B<Warning>, be aware of possible errors introduced by automatic
|
|
84 format conversions using a spreadsheet software like MS Excel, see
|
|
85 e.g. Zeeberg et al. 2004
|
|
86 (L<http://www.ncbi.nlm.nih.gov/pubmed/15214961>).
|
|
87
|
|
88 For more information regarding the feature table and the submission
|
|
89 process see NCBI's L<prokaryotic annotation
|
|
90 guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit> and the
|
|
91 L<bacterial genome submission
|
|
92 guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation>.
|
|
93
|
|
94 =head1 OPTIONS
|
|
95
|
|
96 =head2 Mandatory options
|
|
97
|
|
98 =over 20
|
|
99
|
|
100 =item B<-m>=I<tbl2tab|tab2tbl>, B<-mode>=I<tbl2tab|tab2tbl>
|
|
101
|
|
102 Conversion mode, either 'tbl2tab' or 'tab2tbl' [default = 'tbl2tab']
|
|
103
|
|
104 =item B<-i>=I<str>, B<-input>=I<str>
|
|
105
|
|
106 Input tbl or tab file to be converted to the other format
|
|
107
|
|
108 =back
|
|
109
|
|
110 =head2 Optional options
|
|
111
|
|
112 =over 20
|
|
113
|
|
114 =item B<-h>, B<-help>
|
|
115
|
|
116 Help (perldoc POD)
|
|
117
|
|
118 =item B<-v>, B<-version>
|
|
119
|
|
120 Print version number to C<STDERR>
|
|
121
|
|
122 =back
|
|
123
|
|
124 =head3 Mode B<tbl2tab>
|
|
125
|
|
126 =over 20
|
|
127
|
|
128 =item B<-l>=I<str>, B<-locus_prefix>=I<str>
|
|
129
|
|
130 Only in combination with option B<-s> and there mandatory to include
|
|
131 the locus_tag prefix in the formula for column 'spreadsheet_locus_tag'
|
|
132
|
|
133 =item B<-c>, B<-concat>
|
|
134
|
|
135 Concatenate values of identical tags within one primary tag with '~'
|
|
136 (e.g. several '/EC_number' or '/inference' tags)
|
|
137
|
|
138 =item B<-e>=I<str>, B<-empty>=I<str>
|
|
139
|
|
140 String used for primary features without value for a tag [default = '']
|
|
141
|
|
142 =item B<-s>, B<-spreadsheet>
|
|
143
|
|
144 Include formulas for spreadsheet editing
|
|
145
|
|
146 =item B<-f>=I<e|g>, B<-formula_lang>=I<e|g>
|
|
147
|
|
148 Syntax language of the spreadsheet formulas, either 'English' or
|
|
149 'German'. If you're still encountering problems with the formulas
|
|
150 set the decimal and thousands separator manually in the options of
|
|
151 the spreadsheet software (instead of using the operating system
|
|
152 separators). [default = 'e']
|
|
153
|
|
154 =back
|
|
155
|
|
156 =head3 Mode B<tab2tbl>
|
|
157
|
|
158 =over 20
|
|
159
|
|
160 =item B<-l>=I<str>, B<-locus_prefix>=I<str>
|
|
161
|
|
162 Prefix to the SeqID if not present already in the SeqID
|
|
163
|
|
164 =item B<-g>, B<-gene>
|
|
165
|
|
166 Include accessory 'gene' primary tags (with '/gene', '/locus_tag'
|
|
167 and possibly '/pseudo' tags) for 'CDS/RNA' primary tags; NCBI standard
|
|
168
|
|
169 =item B<-t>, B<-tags_full>
|
|
170
|
|
171 Only in combination with option B<-g>, include '/gene' and
|
|
172 '/locus_tag' tags additionally in primary tag, not only in accessory
|
|
173 'gene' primary tag
|
|
174
|
|
175 =item B<-p>=I<str>, B<-protein_id_prefix>=I<str>
|
|
176
|
|
177 Prefix for '/protein_id' tags; don't forget the double quotes for
|
|
178 the string, otherwise the shell will intepret as pipe [default =
|
|
179 'gnl|goetting|']
|
|
180
|
|
181 =back
|
|
182
|
|
183 =head1 OUTPUT
|
|
184
|
|
185 =over 20
|
|
186
|
|
187 =item F<*.tab|tbl>
|
|
188
|
|
189 Result file in the opposite format
|
|
190
|
|
191 =item (F<hypo_putative_genes.txt>)
|
|
192
|
|
193 Created in mode 'tab2tbl', indicates if CDSs are annotated as
|
|
194 'hypothetical/putative/predicted protein' but still have a gene name
|
|
195
|
|
196 =back
|
|
197
|
|
198 =head1 EXAMPLES
|
|
199
|
|
200 =over
|
|
201
|
|
202 =item C<perl tbl2tab.pl -m tbl2tab -i example.tbl -s -l EPE>
|
|
203
|
|
204 =item C<perl tbl2tab.pl -m tab2tbl -i example2.tab -g -l EPE>
|
|
205
|
|
206 =back
|
|
207
|
|
208 =head1 VERSION
|
|
209
|
|
210 0.2 update: 29-10-2014
|
|
211 0.1 24-06-2014
|
|
212
|
|
213 =head1 AUTHOR
|
|
214
|
|
215 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
216
|
|
217 =head1 LICENSE
|
|
218
|
|
219 This program is free software: you can redistribute it and/or modify
|
|
220 it under the terms of the GNU General Public License as published by
|
|
221 the Free Software Foundation; either version 3 (GPLv3) of the License,
|
|
222 or (at your option) any later version.
|
|
223
|
|
224 This program is distributed in the hope that it will be useful, but
|
|
225 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
226 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
227 General Public License for more details.
|
|
228
|
|
229 You should have received a copy of the GNU General Public License
|
|
230 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
231
|
|
232 =cut
|
|
233
|
|
234
|
|
235 ########
|
|
236 # MAIN #
|
|
237 ########
|
|
238
|
|
239 use strict;
|
|
240 use warnings;
|
|
241 use autodie;
|
|
242 use Getopt::Long;
|
|
243 use Pod::Usage;
|
|
244
|
|
245
|
|
246
|
|
247 ### Get options with Getopt::Long, works also abbreviated and with two "--": -i, --i, -input ...
|
|
248 my $Input_File; # input file
|
|
249 my $Mode = 'tbl2tab'; # mode of script, i.e. either convert from tbl2tab or from tab2tbl; default 'tbl2tab'
|
|
250 my $Locus_Prefix = ''; # required for option 'spreadsheet' in mode 'tbl2tab', in mode 'tab2tbl' optional
|
|
251 my $Opt_Concat; # optionally, concatenate values of the same tag within one primary tag in a tbl file in one column in the resulting tab file with '~' (e.g. several 'EC_number' tags etc.)
|
|
252 my $Empty = ''; # optionally, set what should be used for tags without a value in resulting tab file; default is nothing
|
|
253 my $Opt_Spreadsheet; # optionally, include formulas for spreadsheet editing (e.g. Libre Office, MS Excel)
|
|
254 my $Formula_Lang_Spreadsheet = 'e'; # optionally, either German or English formulas in Spreadsheet option; default 'e' for English
|
|
255 my $Opt_Gene; # optionally, include accessory gene primary tags (with '/gene' and '/locus_tag' [and '/pseudo'] tags) for CDS|RNA primary tags
|
|
256 my $Opt_Tags_Full; # optionally, include '/gene' and '/locus_tag' additionally in primary tag not only accessory 'gene' primary tag
|
|
257 my $Protein_Id_Prefix = 'gnl|goetting|'; # optionally give a different string to prefix the '/protein_id' tags
|
|
258 my $VERSION = 0.2;
|
|
259 my ($Opt_Version, $Opt_Help);
|
|
260 GetOptions ('input=s' => \$Input_File,
|
|
261 'mode=s' => \$Mode,
|
|
262 'locus_prefix:s' => \$Locus_Prefix,
|
|
263 'concat' => \$Opt_Concat,
|
|
264 'empty:s' => \$Empty,
|
|
265 'spreadsheet' => \$Opt_Spreadsheet,
|
|
266 'formula_lang:s' => \$Formula_Lang_Spreadsheet,
|
|
267 'gene' => \$Opt_Gene,
|
|
268 'tags_full' => \$Opt_Tags_Full,
|
|
269 'protein_id_prefix:s' => \$Protein_Id_Prefix,
|
|
270 'version' => \$Opt_Version,
|
|
271 'help|?' => \$Opt_Help);
|
|
272
|
|
273
|
|
274
|
|
275 ### Run perldoc on POD
|
|
276 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
277 die "$0 $VERSION\n" if ($Opt_Version);
|
|
278 if (!$Input_File) {
|
|
279 my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n";
|
|
280 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
281 } elsif (!($Mode =~ /tbl2tab/i || $Mode =~ /tab2tbl/i)) { # case-insensitive
|
|
282 my $warning = "\n### Fatal error: Incorrect run mode with option '-m' given! Please choose either 'tbl2tab' or 'tab2tbl' for '-m'!\n";
|
|
283 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
284 }
|
|
285
|
|
286
|
|
287
|
|
288 ### Enforce mandatory or optional options
|
|
289 if ($Mode =~ /tbl2tab/i && ($Opt_Gene || $Opt_Tags_Full || $Protein_Id_Prefix ne 'gnl|goetting|')) {
|
|
290 warn "\nIncompatible option(s) '-g', '-p', or '-t' set with mode 'tbl2tab'. Ignoring the option(s)!\n";
|
|
291 } elsif ($Mode =~ /tab2tbl/i && ($Opt_Concat || $Empty || $Opt_Spreadsheet || $Formula_Lang_Spreadsheet ne 'e')) {
|
|
292 warn "\nIncompatible option(s) '-c', '-e', '-f', or '-s' set with mode 'tab2tbl'. Ignoring the option(s)!\n";
|
|
293 $Formula_Lang_Spreadsheet = 'e' if ($Formula_Lang_Spreadsheet ne 'e'); # avoid die with error below if option not set correctly
|
|
294 }
|
|
295
|
|
296 if ($Mode =~ /tbl2tab/i && ($Locus_Prefix || $Formula_Lang_Spreadsheet ne 'e') && !$Opt_Spreadsheet) {
|
|
297 warn "\nOption(s) '-l' or '-f' set, but not '-s'. Forcing option '-s'!\n";
|
|
298 $Opt_Spreadsheet = 1;
|
|
299 }
|
|
300 if ($Mode =~ /tbl2tab/i && !$Locus_Prefix && $Opt_Spreadsheet) {
|
|
301 warn "\nOption '-s' set, but not '-l'. Please give a prefix for the locus tags: ";
|
|
302 chomp($Locus_Prefix = <>);
|
|
303 }
|
|
304
|
|
305 if ($Formula_Lang_Spreadsheet !~ /^(e|g)/i) {
|
|
306 die "\n### Fatal error: Incorrect language for option '-f' given! Please choose either 'e|eng' or 'g|ger' for '-f'!\n";
|
|
307 }
|
|
308
|
|
309 if ($Mode =~ /tab2tbl/i && !$Opt_Gene && $Opt_Tags_Full) {
|
|
310 warn "\nOption '-t' set, but not '-g'. Forcing option '-g'!\n";
|
|
311 $Opt_Gene = 1;
|
|
312 }
|
|
313
|
|
314
|
|
315
|
|
316 ### Read in tbl or tab-separated data and write to result file in the opposite format
|
|
317 my $Out_File = $Input_File;
|
|
318 $Out_File =~ s/^(.+)\.\w+$/$1/; # strip filename extension
|
|
319 my $Error_File = 'hypo_putative_genes.txt';
|
|
320 if ($Mode =~ /tbl2tab/i) {
|
|
321 my ($data_hash_ref, $tags_max_count_hash_ref) = read_tbl(); # subroutine
|
|
322 $Out_File .= '.tab';
|
|
323 write_tab($data_hash_ref, $tags_max_count_hash_ref); # subroutine
|
|
324
|
|
325 } elsif ($Mode =~ /tab2tbl/i) {
|
|
326 $Out_File .= '.tbl';
|
|
327 read_tab_write_tbl(); # subroutine
|
|
328 }
|
|
329
|
|
330
|
|
331
|
|
332 ### Message which file was created
|
|
333 if ($Mode =~ /tbl2tab/i) {
|
|
334 print "Input tbl file '$Input_File' was converted to tab output file '$Out_File'!\n";
|
|
335 } elsif ($Mode =~ /tab2tbl/i) {
|
|
336 print "Input tab file '$Input_File' was converted to tbl output file '$Out_File'!\n";
|
|
337 }
|
|
338
|
|
339 if (-e $Error_File) {
|
|
340 if (-s $Error_File >= 40) { # smaller than just the header, which should be 27 bytes
|
|
341 warn "\n### Warning: CDSs found that are annotated with 'hypothetical|putative|predicted protein' but still include a '/gene' tag, see file '$Error_File'!\n";
|
|
342 } elsif (-s $Error_File < 40) {
|
|
343 unlink $Error_File;
|
|
344 }
|
|
345 }
|
|
346
|
|
347
|
|
348 exit;
|
|
349
|
|
350 ###############
|
|
351 # Subroutines #
|
|
352 ###############
|
|
353
|
|
354 ### Subroutine to test for file existence and give warning to STDERR
|
|
355 sub file_exist {
|
|
356 my $file = shift;
|
|
357 if (-e $file) {
|
|
358 warn "\nThe result file \'$file\' exists already and will be overwritten!\n\n";
|
|
359 return 1;
|
|
360 }
|
|
361 return 0;
|
|
362 }
|
|
363
|
|
364
|
|
365
|
|
366 ### Print tag values to tab result file
|
|
367 sub print_tag2tab {
|
|
368 my ($tag, $data_hash_ref, $seq_id, $pos, $tag_max_count) = @_;
|
|
369
|
|
370 if ($Opt_Concat) { # values concatenated by '~' in $data_hash_ref
|
|
371 if ($data_hash_ref->{$seq_id}->{$pos}->{$tag}) {
|
|
372 print "\t$data_hash_ref->{$seq_id}->{$pos}->{$tag}";
|
|
373 return 1;
|
|
374 } else {
|
|
375 print "\t$Empty";
|
|
376 return 1;
|
|
377 }
|
|
378
|
|
379 } elsif (!$Opt_Concat) { # split concatenated values in individual values
|
|
380 my @values = split(/~/, $data_hash_ref->{$seq_id}->{$pos}->{$tag}) if ($data_hash_ref->{$seq_id}->{$pos}->{$tag});
|
|
381 if (@values) {
|
|
382 foreach (@values) {
|
|
383 print "\t$_";
|
|
384 }
|
|
385 print "\t$Empty" x ($tag_max_count - @values); # fill residual columns till maximum occurrence
|
|
386 } else {
|
|
387 print "\t$Empty" x $tag_max_count;
|
|
388 }
|
|
389 return 1;
|
|
390 }
|
|
391
|
|
392 return 0;
|
|
393 }
|
|
394
|
|
395
|
|
396
|
|
397 ### Print tag values to tbl result file
|
|
398 sub print_tag2tbl {
|
|
399 my ($tag, $value) = @_;
|
|
400 return 0 if ($value =~ /^$Empty$/);
|
|
401 if ($tag =~ /pseudo/) {
|
|
402 print "\t\t\t$tag\n";
|
|
403 return 1;
|
|
404 }
|
|
405
|
|
406 ### remove quotations from values introduced by Excel by saving as tab-separated file:
|
|
407 ### https://office.microsoft.com/en-001/excel-help/excel-formatting-and-features-that-are-not-transferred-to-other-file-formats-HP010014105.aspx
|
|
408 ### - if a cell contains a comma, the cell contents are enclosed in double quotation marks
|
|
409 ### - if the data contains a quotation mark, double quotation marks will replace the quotation mark, and the cell contents are also enclosed in double quotation marks
|
|
410 $value =~ s/""/"/g;
|
|
411 $value =~ s/^"//;
|
|
412 $value =~ s/"$//;
|
|
413
|
|
414 foreach (split(/~/, $value)) {
|
|
415 print "\t\t\t$tag\t$_\n";
|
|
416 }
|
|
417 return 1;
|
|
418 }
|
|
419
|
|
420
|
|
421 ### Read in data from tab input file and write it to tbl output file
|
|
422 sub read_tab_write_tbl {
|
|
423 file_exist($Error_File); # subroutine
|
|
424 open (my $error_file_fh, ">", $Error_File);
|
|
425 print $error_file_fh "row\tlocus_tag\tgene\tproduct\n";
|
|
426
|
|
427 open (my $input_file_fh, "<", $Input_File);
|
|
428 my $header = <$input_file_fh>;
|
|
429 $header =~ s/\R/\012/; # convert line to unix-style line endings
|
|
430 chomp $header;
|
|
431 if ($header !~ /^seq_id\tstart\tstop\tprimary_tag\t/) { # check if tbl file starts with mandatory header fields or quit
|
|
432 die "\n### Fatal error: Input tab file '$Input_File' doesn't start with the mandatory 'seq_id', 'start', 'stop', and 'primary_tag' tab-separated header fields. Sure this is a valid tab file?\nExiting program!\n\n";
|
|
433 }
|
|
434
|
|
435 my @tags;
|
|
436 foreach (split(/\t/, $header)) {
|
|
437 last if (/spreadsheet_locus_tag/); # skip all optional extra spreadsheet columns
|
|
438 push(@tags, $_); # store all header fields/columns to associate with each field in each line
|
|
439 }
|
|
440
|
|
441 file_exist($Out_File); # subroutine
|
|
442 open (my $out_file_fh, ">", $Out_File);
|
|
443 select $out_file_fh; # select fh for standard print/f output
|
|
444
|
|
445 my $row = 1; # count row numbers of tab input file for $Error_File (start with '1' as header already parsed above)
|
|
446 my $seq_id = ''; # store previous SeqID for multi-contig/replicon tab files
|
|
447 while (<$input_file_fh>) {
|
|
448 $row++;
|
|
449 $_ =~ s/\R/\012/; # convert line to unix-style line ending
|
|
450 chomp;
|
|
451 next if ($_ =~ /^\s+$/ || $_ =~ /^$/); # skip empty lines
|
|
452
|
|
453 my ($locus_tag, $gene, $hypo_putative) = ('', '', ''); # needed for $Error_File
|
|
454
|
|
455 my @cells = split(/\t/, $_);
|
|
456 for (my $i = 0; $i < 4; $i++) { # check each row for mandatory fields
|
|
457 if ($cells[$i] =~ /^$/) {
|
|
458 close $out_file_fh;
|
|
459 unlink $Out_File;
|
|
460 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a value for one of the mandatory fields 'seq_id', 'start', 'stop', or 'primary_tag'!\nExiting program!\n\n";
|
|
461 }
|
|
462 }
|
|
463
|
|
464 ### print SeqID
|
|
465 if ($cells[0] ne $seq_id) { # print new contig for multi-contig/replicon tab files
|
|
466 $seq_id = $cells[0];
|
|
467 $cells[0] = $Locus_Prefix."_".$cells[0] if ($cells[0] !~ /$Locus_Prefix/ && $Locus_Prefix); # append locus_tag prefix only if not present already
|
|
468 print ">Feature $cells[0]\n";
|
|
469 }
|
|
470
|
|
471 ### print accessory 'gene' primary tags with '/locus_tag', '/gene', and potential '/pseudo' tags
|
|
472 if ($Opt_Gene && $cells[3] =~ /CDS|RNA/) { # accessory 'gene' primary tags only for CDS and RNA (rRNA, tRNA ...) features
|
|
473 print "$cells[1]\t$cells[2]\tgene\n";
|
|
474 my $column_count = 0;
|
|
475 foreach my $tag (@tags) {
|
|
476 if ($tag =~ /locus_tag/ && $cells[$column_count] =~ /^$/) { # CDSs|RNAs mandatory need a '/locus_tag' with option '-g'
|
|
477 close $out_file_fh;
|
|
478 unlink $Out_File;
|
|
479 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n";
|
|
480 }
|
|
481 print_tag2tbl($tag, $cells[$column_count]) if ($tag =~ /locus_tag|^gene$|pseudo/); # subroutine; '^gene$' needed, so 'gene_desc' isn't hit (see below)
|
|
482 $column_count++;
|
|
483 }
|
|
484 }
|
|
485
|
|
486 ### print primary tag
|
|
487 print "$cells[1]\t$cells[2]"; # start\tstop
|
|
488 if ($cells[3] =~ /pseudo/) { # pseudo-gene should include '/pseudo' tag
|
|
489 print "\tgene\n";
|
|
490 print "\t\t\tpseudo\n";
|
|
491 } else {
|
|
492 print "\t$cells[3]\n";
|
|
493 }
|
|
494
|
|
495 ### print tags with values
|
|
496 for (my $i = 4; $i < @tags; $i++) { # start with field 5 of array with header fields/columns (the first 4 are mandatory, see above)
|
|
497 next if ($tags[$i] =~ /gene_desc/); # skip 'gene_desc' fields in tab file, reserved for pseudo-genes (see below)
|
|
498
|
|
499 ### enforce mandatory tags for CDS primary tags
|
|
500 if ($tags[$i] =~ /product/ && $cells[3] =~ /CDS/) {
|
|
501 if ($cells[$i] =~ /(hypothetical|putative|predicted) protein/) { # needed for $Error_File
|
|
502 $hypo_putative = $cells[$i];
|
|
503 } elsif ($cells[$i] =~ /^$/) { # CDSs mandatory need a value for '/product'
|
|
504 close $out_file_fh;
|
|
505 unlink $Out_File;
|
|
506 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for CDS primary tags!\nExiting program!\n\n";
|
|
507 }
|
|
508 }
|
|
509
|
|
510 if ($tags[$i] =~ /locus_tag/ && $cells[3] =~ /CDS/) { # '/protein_id' mandatory for 'CDS' primary tags
|
|
511 $locus_tag = $cells[$i]; # needed for $Error_File
|
|
512 my $protein_id = "$Protein_Id_Prefix".$cells[$i];
|
|
513 print_tag2tbl('protein_id', $protein_id); # subroutine
|
|
514 }
|
|
515 next if ($tags[$i] =~ /protein_id/); # skip 'protein_id' field in tab file as they should be created from the 'locus_tag' column
|
|
516
|
|
517 $gene = $cells[$i] if ($tags[$i] =~ /^gene$/ && $cells[3] =~ /CDS/); # needed for $Error_File
|
|
518
|
|
519 ### enforce mandatory tags for CDS/RNA primary tags
|
|
520 next if ($tags[$i] =~ /locus_tag|^gene$/ && $Opt_Gene && !$Opt_Tags_Full && $cells[3] =~ /CDS|RNA/); # skip '/locus_tag' and '/gene' tags if accessory gene primary tags are present for CDS|RNA features (except option '-t' is set)
|
|
521 if ($tags[$i] =~ /product/ && $cells[3] =~ /RNA/ && $cells[$i] =~ /^$/) { # RNAs mandatory need a value for '/product'
|
|
522 close $out_file_fh;
|
|
523 unlink $Out_File;
|
|
524 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for RNA primary tags!\nExiting program!\n\n";
|
|
525 }
|
|
526
|
|
527 ### enforce mandatory tag for pseudo-genes (have 'pseudo' as primary tag in tab file)
|
|
528 if ($tags[$i] =~ /locus_tag/ && $Opt_Gene && $cells[3] =~ /pseudo/ && $cells[$i] =~ /^$/) { # pseudo-genes mandatory need a '/locus_tag' with option '-g'
|
|
529 close $out_file_fh;
|
|
530 unlink $Out_File;
|
|
531 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n";
|
|
532 }
|
|
533
|
|
534 ### the rest
|
|
535 if ($tags[$i] =~ /product/ && $cells[3] =~ /pseudo/) { # write 'product' values for pseudo-genes to '/gene_desc' tags
|
|
536 print_tag2tbl('gene_desc', $cells[$i]); # subroutine
|
|
537 } elsif ($tags[$i] =~ /pseudo/ && $cells[3] =~ /pseudo/) { # skip 'pseudo' tag if pseudo-gene
|
|
538 next;
|
|
539 } else {
|
|
540 print_tag2tbl($tags[$i], $cells[$i]); # subroutine
|
|
541 }
|
|
542
|
|
543 }
|
|
544 print $error_file_fh "$row\t$locus_tag\t$gene\t$hypo_putative\n" if ($hypo_putative && $gene);
|
|
545 }
|
|
546
|
|
547 select STDOUT;
|
|
548 close $input_file_fh;
|
|
549 close $error_file_fh;
|
|
550 close $out_file_fh;
|
|
551 return 1;
|
|
552 }
|
|
553
|
|
554
|
|
555
|
|
556 ### Read in data from tbl input file
|
|
557 sub read_tbl {
|
|
558 open (my $input_file_fh, "<", $Input_File);
|
|
559
|
|
560 my $seq_id = <$input_file_fh>; # SeqID
|
|
561 $seq_id =~ s/\R/\012/; # convert line to unix-style line endings
|
|
562 chomp $seq_id;
|
|
563 if ($seq_id !~ /^>Feature/) { # check if tbl file starts with mandatory '>Feature' and get first SeqID
|
|
564 die "\n### Fatal error: tbl file doesn't start with a '>Feature SeqID' line. Sure this is a valid tbl file?\nExiting program!\n\n";
|
|
565 } else {
|
|
566 $seq_id =~ s/>Feature (\S+)\s*$/$1/; # only use non-whitespace characters as SeqID
|
|
567 }
|
|
568
|
|
569 my %data; # hash-in-hash-in-hash to store tbl input data
|
|
570 my $pos_key; # store start..stop for each primary tag and use as key in %data
|
|
571 my $primary_tag; # store previous primary tag to determine if values for repeatedly occuring tags should be concatenated
|
|
572 my %tags_max_count; # hash to store all occuring tags with maximal number of presence (within a single primary tag) in the tbl file for final tab column headers
|
|
573 my @tags; # array to store all tags of each primary tag, supplement to %tags_max_count
|
|
574
|
|
575 while (<$input_file_fh>) {
|
|
576 $_ =~ s/\R/\012/; # convert line to unix-style line ending
|
|
577 chomp;
|
|
578 next if ($_ =~ /^\s+$/); # skip empty lines
|
|
579 my @fields = split(/\t/, $_);
|
|
580
|
|
581 ### get next SeqID from '>Feature' line for multi-contig/replicon tbl files
|
|
582 if ($fields[0] =~ /^>Feature (\S+)\s*$/) {
|
|
583 $seq_id = $1;
|
|
584
|
|
585 ### get primary tags/features and fill %tags_max_count
|
|
586 } elsif ($fields[0] =~ /^\d+$/) { # $fields[2] with primary tag
|
|
587 foreach my $tag (@tags) { # fill %tags_max_count for previous primary tag
|
|
588 if ($tags_max_count{$tag}) {
|
|
589 $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags));
|
|
590 } elsif (!$tags_max_count{$tag}) {
|
|
591 $tags_max_count{$tag} = grep(/$tag/, @tags);
|
|
592 }
|
|
593 }
|
|
594 @tags = (); # empty tags array for new current primary tag
|
|
595
|
|
596 $pos_key = "$fields[0]..$fields[1]"; # position of primary tag used as key for %data, "start..stop"
|
|
597 $primary_tag = $fields[2];
|
|
598 if (!$data{$seq_id}->{$pos_key}->{'primary_tag'} || $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # if primary tag not present or overwrite accessory 'gene' primary tag
|
|
599 $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag; # store data in anonymous hash-in-hash-in-hash
|
|
600 $data{$seq_id}->{$pos_key}->{'start'} = $fields[0]; # to be able to sort afterwards via the start position
|
|
601 } elsif ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /pseudo/) { # 'gene' primary tag with '/pseudo' tag will be replaced by 'pseudo' primary tag for pseudo-genes (see below), however if 'gene' primary tag is ACCESSORY to CDS|RNA primary tag replace by this primary tag and include '/pseudo' tag
|
|
602 $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag;
|
|
603 $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T'; # value 'T' for true
|
|
604 }
|
|
605
|
|
606 ### get tags/qualifiers
|
|
607 } elsif ($fields[3] =~ /^\w+/) {
|
|
608 push(@tags, $fields[3]) if ($fields[3] !~ /gene_desc/); # store tags for current primary tag; skip '/gene_desc' as reserved for pseudo-genes (replaced by '/product' see below)
|
|
609 if ($fields[3] =~ /pseudo/) {
|
|
610 if ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # change 'gene' primary tag of pseudo-genes to 'pseudo' (if accessory 'gene' primary tag will be replaced by *actual* primary tag, see above)
|
|
611 $data{$seq_id}->{$pos_key}->{'primary_tag'} = 'pseudo';
|
|
612 } else { # else include a '/pseudo' tag with value 'T' for true
|
|
613 $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T';
|
|
614 }
|
|
615 next; # next line
|
|
616 }
|
|
617
|
|
618 ### remove quotations from values introduced by Excel by saving as tab-separated file (see above)
|
|
619 $fields[4] =~ s/""/"/g;
|
|
620 $fields[4] =~ s/^"//;
|
|
621 $fields[4] =~ s/"$//;
|
|
622
|
|
623 ### adjust '/gene' and '/product' values to NCBI standard
|
|
624 if ($fields[3] =~ /gene/) {
|
|
625 $fields[4] =~ s/(\w+)/\l$1/; # first character of gene name should be lower case
|
|
626 $fields[4] =~ s/^(\w)$/\u$1/; # one letter phage genes should be upper case
|
|
627 } elsif ($fields[3] =~ /product/) {
|
|
628 $fields[4] =~ s/\b([A-Z][a-z]{3,})/\l$1/g; # lower the case for '/protein' value initials
|
|
629 $fields[4] =~ s/(rossman|willebrand)/\u$1/; # exception to the rule
|
|
630 }
|
|
631
|
|
632 if ($fields[3] =~ /gene_desc/) { # '/gene_desc' tags from pseudo-genes replaced by '/product' for resulting tab file
|
|
633 $data{$seq_id}->{$pos_key}->{'product'} = $fields[4];
|
|
634 next;
|
|
635 }
|
|
636 if ($data{$seq_id}->{$pos_key}->{$fields[3]} && $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /$primary_tag/) { # tag already exists for this position (e.g. several EC_numbers), concatenate the additional values with '~' as separator only WITHIN the same primary tag (OTHERWISE overwrite in 'else' below)
|
|
637 $data{$seq_id}->{$pos_key}->{$fields[3]} .= '~'.$fields[4];
|
|
638 } else { # tag doesn't exist yet or overwrite if current primary tag at the same position of previous (e.g. accessory 'gene' primary tag to CDS/RNA primary tag)
|
|
639 $data{$seq_id}->{$pos_key}->{$fields[3]} = $fields[4];
|
|
640 }
|
|
641 }
|
|
642 }
|
|
643 foreach my $tag (@tags) { # fill %tags_max_count for last primary tag
|
|
644 if ($tags_max_count{$tag}) {
|
|
645 $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags));
|
|
646 } elsif (!$tags_max_count{$tag}) {
|
|
647 $tags_max_count{$tag} = grep(/$tag/, @tags);
|
|
648 }
|
|
649 }
|
|
650 close $input_file_fh;
|
|
651 return \%data, \%tags_max_count;
|
|
652 }
|
|
653
|
|
654
|
|
655
|
|
656 ### Write data to tab output file
|
|
657 sub write_tab {
|
|
658 my ($data_hash_ref, $tags_max_count_hash_ref) = @_;
|
|
659 file_exist($Out_File); # subroutine
|
|
660 open (my $out_file_fh, ">", $Out_File);
|
|
661 select $out_file_fh; # select fh for standard print/f output
|
|
662
|
|
663 ### print header for tab result file
|
|
664 print "seq_id\tstart\tstop\tprimary_tag\tlocus_tag"; # mandatory columns/fields in tab file
|
|
665 if ($Opt_Concat) {
|
|
666 foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags
|
|
667 print "\t$_" if (!/locus_tag/);
|
|
668 }
|
|
669 } elsif (!$Opt_Concat) {
|
|
670 foreach (sort keys %{$tags_max_count_hash_ref}) {
|
|
671 print "\t$_" x $tags_max_count_hash_ref->{$_} if (!/locus_tag/); # print max occurrence (in tbl) of each residual tag
|
|
672 }
|
|
673 }
|
|
674
|
|
675 print "\tspreadsheet_locus_tag\tposition\tdistance\tgene_number\tcontig_order" if ($Opt_Spreadsheet); # print optional spreadsheet header columns
|
|
676 print "\n";
|
|
677
|
|
678 ### variables for optional spreadsheet formulas
|
|
679 my @spread_columns = ("A".."AZ") if ($Opt_Spreadsheet); # columns in spreadsheet software for formulas
|
|
680 my ($tags_column_count, $spread_row_count, $spread_contig_order) = (0, 1, 1) if ($Opt_Spreadsheet);
|
|
681 if ($Opt_Spreadsheet) {
|
|
682 if ($Opt_Concat) {
|
|
683 $tags_column_count = (scalar keys %{$tags_max_count_hash_ref}) - scalar grep($_ =~ /locus_tag/, keys %{$tags_max_count_hash_ref}); # subtract tags for correct spreadsheet formulas
|
|
684 } elsif (!$Opt_Concat) {
|
|
685 foreach (keys %{$tags_max_count_hash_ref}) {
|
|
686 next if ($_ =~ /locus_tag/);
|
|
687 $tags_column_count += $tags_max_count_hash_ref->{$_};
|
|
688 }
|
|
689 }
|
|
690 }
|
|
691
|
|
692 ### print data from hash into tab result file, optional with G2L-style spreadsheet formulas
|
|
693 foreach my $seq_id (sort keys %{$data_hash_ref}) {
|
|
694 foreach my $pos (sort {$data_hash_ref->{$seq_id}->{$a}->{'start'} <=> $data_hash_ref->{$seq_id}->{$b}->{'start'}} keys $data_hash_ref->{$seq_id}) { # sort each position entry in %data via start position
|
|
695 print "$seq_id";
|
|
696 my ($start, $stop) = split(/\.\./, $pos);
|
|
697 print "\t$start\t$stop";
|
|
698 print "\t$data_hash_ref->{$seq_id}->{$pos}->{'primary_tag'}"; # primary_tag should always be present
|
|
699 print_tag2tab('locus_tag', $data_hash_ref, $seq_id, $pos, 1); # subroutine; locus_tag should occur always just one time per primary tag
|
|
700 foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags
|
|
701 print_tag2tab($_, $data_hash_ref, $seq_id, $pos, $tags_max_count_hash_ref->{$_}) if (!/locus_tag/); # subroutine
|
|
702 }
|
|
703
|
|
704 ### G2L-style spreadsheet formulas
|
|
705 if ($Opt_Spreadsheet) {
|
|
706 $spread_row_count++;
|
|
707 if ($Formula_Lang_Spreadsheet =~ /^e/i) {
|
|
708 print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ',"0000")&"0"'; # spreadsheet column 'spreadsheet_locus_tag'
|
|
709 } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) {
|
|
710 print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ';"0000")&"0"';
|
|
711 }
|
|
712 print "\t=MIN(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'position'
|
|
713 print "\t=", $spread_columns[$tags_column_count+6], $spread_row_count + 1, "-MAX(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'distance'
|
|
714 if ($Formula_Lang_Spreadsheet =~ /^e/i) {
|
|
715 print "\t=IF(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ",$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1,1)'; # spreadsheet column 'gene_number'
|
|
716 } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) {
|
|
717 print "\t=WENN(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ";$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1;1)';
|
|
718 }
|
|
719 print "\t$spread_contig_order"; # spreadsheet column 'contig_order'
|
|
720 }
|
|
721
|
|
722 print "\n";
|
|
723 }
|
|
724 $spread_contig_order++; # next contig/replicon (SeqID) in tbl file
|
|
725 }
|
|
726
|
|
727 select STDOUT;
|
|
728 close $out_file_fh;
|
|
729 return 1;
|
|
730 }
|