Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/tbl2tab/tbl2tab.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 C<tbl2tab.pl> - convert tbl to tab-separated format and back | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl tbl2tab.pl -m tbl2tab -i feature_table.tbl -s -l locus_prefix> | |
16 | |
17 B<or> | |
18 | |
19 C<perl tbl2tab.pl -m tab2tbl -i feature_table.tab -g -l locus_prefix | |
20 -p "gnl|dbname|"> | |
21 | |
22 =head1 DESCRIPTION | |
23 | |
24 NCBI's feature table (B<tbl>) format is needed for the submission of | |
25 genomic data to GenBank with the NCBI tools | |
26 L<Sequin|http://www.ncbi.nlm.nih.gov/Sequin/> or | |
27 L<tbl2asn|http://www.ncbi.nlm.nih.gov/genbank/tbl2asn2>. tbl files | |
28 can be created with automatic annotation systems like | |
29 L<Prokka|http://www.vicbioinformatics.com/software.prokka.shtml>. | |
30 C<tbl2tab.pl> can convert a tbl file to a tab-separated format (tab) | |
31 and back to the tbl format. The tab-delimited format is useful to | |
32 manipulate the data more comfortably in a spreadsheet software (e.g. | |
33 LibreOffice or MS Excel). For a conversion back to tbl format save | |
34 the file in the spreadsheet software as a tab-delimited text file. | |
35 The script is intended for microbial genomes, but might also be | |
36 useful for eukaryotes. | |
37 | |
38 Regular expressions are applied in mode B<tbl2tab> to correct gene | |
39 names and words in '/product' values to lowercase initials (with | |
40 the exception of 'Rossman' and 'Willebrand'). The resulting tab file | |
41 can then be used to check for possible errors. | |
42 | |
43 The first four header columns of the B<tab> format are mandatory, | |
44 'seq_id' for the SeqID, and for each primary tag/feature (e.g. CDS, | |
45 RNAs, repeat_region etc.), 'start', 'stop', and 'primary_tag'. These | |
46 mandatory columns have to be filled in every row in the tab file. | |
47 All the following columns will be included as tags/qualifiers (e.g. | |
48 '/locus_tag', '/product', '/EC_number', '/note' etc.) in the | |
49 conversion to the tbl file if a value is present. | |
50 | |
51 There are three special cases: | |
52 | |
53 B<First>, '/pseudo' will be included as a tag if I<any> value (the | |
54 script uses 'T' for true) is present in the B<tab> format. If a | |
55 primary tag is indicated as pseudo both the primary tag and the | |
56 accessory 'gene' primary tag (for CDS/RNA features with option | |
57 B<-g>) will include a '/pseudo' qualifier in the resulting B<tbl> | |
58 file. B<Pseudo-genes> are indicated by 'pseudo' in the 'primary_tag' | |
59 column, thus the 'pseudo' column is ignored in these cases. | |
60 | |
61 B<Second>, tag '/gene_desc' is reserved for the 'product' values of | |
62 pseudo-genes, thus a 'gene_desc' column in a tab file will be | |
63 ignored in the conversion to tbl. | |
64 | |
65 B<Third>, column 'protein_id' in a tab file will also be ignored in | |
66 the conversion. '/protein_id' values are created from option B<-p> | |
67 and the locus_tag for each CDS primary feature. | |
68 | |
69 Furthermore, with option B<-s> G2L-style spreadsheet formulas | |
70 (L<Goettingen Genomics | |
71 Laboratory|http://appmibio.uni-goettingen.de/>) can be included with | |
72 additional columns, 'spreadsheet_locus_tag', 'position', 'distance', | |
73 'gene_number', and 'contig_order'. These columns will not be | |
74 included in a conversion to the tbl format. Thus, if you want to | |
75 include e.g. the locus_tags from the formula in column | |
76 'spreadsheet_locus_tag' in the resulting tbl file copy the B<values> | |
77 to the column 'locus_tag'! | |
78 | |
79 To illustrate the process two example files are included in the | |
80 repository, F<example.tbl> and F<example2.tab>, which are | |
81 interconvertible (see L</"EXAMPLES"> below). | |
82 | |
83 B<Warning>, be aware of possible errors introduced by automatic | |
84 format conversions using a spreadsheet software like MS Excel, see | |
85 e.g. Zeeberg et al. 2004 | |
86 (L<http://www.ncbi.nlm.nih.gov/pubmed/15214961>). | |
87 | |
88 For more information regarding the feature table and the submission | |
89 process see NCBI's L<prokaryotic annotation | |
90 guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit> and the | |
91 L<bacterial genome submission | |
92 guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation>. | |
93 | |
94 =head1 OPTIONS | |
95 | |
96 =head2 Mandatory options | |
97 | |
98 =over 20 | |
99 | |
100 =item B<-m>=I<tbl2tab|tab2tbl>, B<-mode>=I<tbl2tab|tab2tbl> | |
101 | |
102 Conversion mode, either 'tbl2tab' or 'tab2tbl' [default = 'tbl2tab'] | |
103 | |
104 =item B<-i>=I<str>, B<-input>=I<str> | |
105 | |
106 Input tbl or tab file to be converted to the other format | |
107 | |
108 =back | |
109 | |
110 =head2 Optional options | |
111 | |
112 =over 20 | |
113 | |
114 =item B<-h>, B<-help> | |
115 | |
116 Help (perldoc POD) | |
117 | |
118 =item B<-v>, B<-version> | |
119 | |
120 Print version number to C<STDERR> | |
121 | |
122 =back | |
123 | |
124 =head3 Mode B<tbl2tab> | |
125 | |
126 =over 20 | |
127 | |
128 =item B<-l>=I<str>, B<-locus_prefix>=I<str> | |
129 | |
130 Only in combination with option B<-s> and there mandatory to include | |
131 the locus_tag prefix in the formula for column 'spreadsheet_locus_tag' | |
132 | |
133 =item B<-c>, B<-concat> | |
134 | |
135 Concatenate values of identical tags within one primary tag with '~' | |
136 (e.g. several '/EC_number' or '/inference' tags) | |
137 | |
138 =item B<-e>=I<str>, B<-empty>=I<str> | |
139 | |
140 String used for primary features without value for a tag [default = ''] | |
141 | |
142 =item B<-s>, B<-spreadsheet> | |
143 | |
144 Include formulas for spreadsheet editing | |
145 | |
146 =item B<-f>=I<e|g>, B<-formula_lang>=I<e|g> | |
147 | |
148 Syntax language of the spreadsheet formulas, either 'English' or | |
149 'German'. If you're still encountering problems with the formulas | |
150 set the decimal and thousands separator manually in the options of | |
151 the spreadsheet software (instead of using the operating system | |
152 separators). [default = 'e'] | |
153 | |
154 =back | |
155 | |
156 =head3 Mode B<tab2tbl> | |
157 | |
158 =over 20 | |
159 | |
160 =item B<-l>=I<str>, B<-locus_prefix>=I<str> | |
161 | |
162 Prefix to the SeqID if not present already in the SeqID | |
163 | |
164 =item B<-g>, B<-gene> | |
165 | |
166 Include accessory 'gene' primary tags (with '/gene', '/locus_tag' | |
167 and possibly '/pseudo' tags) for 'CDS/RNA' primary tags; NCBI standard | |
168 | |
169 =item B<-t>, B<-tags_full> | |
170 | |
171 Only in combination with option B<-g>, include '/gene' and | |
172 '/locus_tag' tags additionally in primary tag, not only in accessory | |
173 'gene' primary tag | |
174 | |
175 =item B<-p>=I<str>, B<-protein_id_prefix>=I<str> | |
176 | |
177 Prefix for '/protein_id' tags; don't forget the double quotes for | |
178 the string, otherwise the shell will intepret as pipe [default = | |
179 'gnl|goetting|'] | |
180 | |
181 =back | |
182 | |
183 =head1 OUTPUT | |
184 | |
185 =over 20 | |
186 | |
187 =item F<*.tab|tbl> | |
188 | |
189 Result file in the opposite format | |
190 | |
191 =item (F<hypo_putative_genes.txt>) | |
192 | |
193 Created in mode 'tab2tbl', indicates if CDSs are annotated as | |
194 'hypothetical/putative/predicted protein' but still have a gene name | |
195 | |
196 =back | |
197 | |
198 =head1 EXAMPLES | |
199 | |
200 =over | |
201 | |
202 =item C<perl tbl2tab.pl -m tbl2tab -i example.tbl -s -l EPE> | |
203 | |
204 =item C<perl tbl2tab.pl -m tab2tbl -i example2.tab -g -l EPE> | |
205 | |
206 =back | |
207 | |
208 =head1 VERSION | |
209 | |
210 0.2 update: 29-10-2014 | |
211 0.1 24-06-2014 | |
212 | |
213 =head1 AUTHOR | |
214 | |
215 Andreas Leimbach aleimba[at]gmx[dot]de | |
216 | |
217 =head1 LICENSE | |
218 | |
219 This program is free software: you can redistribute it and/or modify | |
220 it under the terms of the GNU General Public License as published by | |
221 the Free Software Foundation; either version 3 (GPLv3) of the License, | |
222 or (at your option) any later version. | |
223 | |
224 This program is distributed in the hope that it will be useful, but | |
225 WITHOUT ANY WARRANTY; without even the implied warranty of | |
226 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
227 General Public License for more details. | |
228 | |
229 You should have received a copy of the GNU General Public License | |
230 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
231 | |
232 =cut | |
233 | |
234 | |
235 ######## | |
236 # MAIN # | |
237 ######## | |
238 | |
239 use strict; | |
240 use warnings; | |
241 use autodie; | |
242 use Getopt::Long; | |
243 use Pod::Usage; | |
244 | |
245 | |
246 | |
247 ### Get options with Getopt::Long, works also abbreviated and with two "--": -i, --i, -input ... | |
248 my $Input_File; # input file | |
249 my $Mode = 'tbl2tab'; # mode of script, i.e. either convert from tbl2tab or from tab2tbl; default 'tbl2tab' | |
250 my $Locus_Prefix = ''; # required for option 'spreadsheet' in mode 'tbl2tab', in mode 'tab2tbl' optional | |
251 my $Opt_Concat; # optionally, concatenate values of the same tag within one primary tag in a tbl file in one column in the resulting tab file with '~' (e.g. several 'EC_number' tags etc.) | |
252 my $Empty = ''; # optionally, set what should be used for tags without a value in resulting tab file; default is nothing | |
253 my $Opt_Spreadsheet; # optionally, include formulas for spreadsheet editing (e.g. Libre Office, MS Excel) | |
254 my $Formula_Lang_Spreadsheet = 'e'; # optionally, either German or English formulas in Spreadsheet option; default 'e' for English | |
255 my $Opt_Gene; # optionally, include accessory gene primary tags (with '/gene' and '/locus_tag' [and '/pseudo'] tags) for CDS|RNA primary tags | |
256 my $Opt_Tags_Full; # optionally, include '/gene' and '/locus_tag' additionally in primary tag not only accessory 'gene' primary tag | |
257 my $Protein_Id_Prefix = 'gnl|goetting|'; # optionally give a different string to prefix the '/protein_id' tags | |
258 my $VERSION = 0.2; | |
259 my ($Opt_Version, $Opt_Help); | |
260 GetOptions ('input=s' => \$Input_File, | |
261 'mode=s' => \$Mode, | |
262 'locus_prefix:s' => \$Locus_Prefix, | |
263 'concat' => \$Opt_Concat, | |
264 'empty:s' => \$Empty, | |
265 'spreadsheet' => \$Opt_Spreadsheet, | |
266 'formula_lang:s' => \$Formula_Lang_Spreadsheet, | |
267 'gene' => \$Opt_Gene, | |
268 'tags_full' => \$Opt_Tags_Full, | |
269 'protein_id_prefix:s' => \$Protein_Id_Prefix, | |
270 'version' => \$Opt_Version, | |
271 'help|?' => \$Opt_Help); | |
272 | |
273 | |
274 | |
275 ### Run perldoc on POD | |
276 pod2usage(-verbose => 2) if ($Opt_Help); | |
277 die "$0 $VERSION\n" if ($Opt_Version); | |
278 if (!$Input_File) { | |
279 my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n"; | |
280 pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | |
281 } elsif (!($Mode =~ /tbl2tab/i || $Mode =~ /tab2tbl/i)) { # case-insensitive | |
282 my $warning = "\n### Fatal error: Incorrect run mode with option '-m' given! Please choose either 'tbl2tab' or 'tab2tbl' for '-m'!\n"; | |
283 pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | |
284 } | |
285 | |
286 | |
287 | |
288 ### Enforce mandatory or optional options | |
289 if ($Mode =~ /tbl2tab/i && ($Opt_Gene || $Opt_Tags_Full || $Protein_Id_Prefix ne 'gnl|goetting|')) { | |
290 warn "\nIncompatible option(s) '-g', '-p', or '-t' set with mode 'tbl2tab'. Ignoring the option(s)!\n"; | |
291 } elsif ($Mode =~ /tab2tbl/i && ($Opt_Concat || $Empty || $Opt_Spreadsheet || $Formula_Lang_Spreadsheet ne 'e')) { | |
292 warn "\nIncompatible option(s) '-c', '-e', '-f', or '-s' set with mode 'tab2tbl'. Ignoring the option(s)!\n"; | |
293 $Formula_Lang_Spreadsheet = 'e' if ($Formula_Lang_Spreadsheet ne 'e'); # avoid die with error below if option not set correctly | |
294 } | |
295 | |
296 if ($Mode =~ /tbl2tab/i && ($Locus_Prefix || $Formula_Lang_Spreadsheet ne 'e') && !$Opt_Spreadsheet) { | |
297 warn "\nOption(s) '-l' or '-f' set, but not '-s'. Forcing option '-s'!\n"; | |
298 $Opt_Spreadsheet = 1; | |
299 } | |
300 if ($Mode =~ /tbl2tab/i && !$Locus_Prefix && $Opt_Spreadsheet) { | |
301 warn "\nOption '-s' set, but not '-l'. Please give a prefix for the locus tags: "; | |
302 chomp($Locus_Prefix = <>); | |
303 } | |
304 | |
305 if ($Formula_Lang_Spreadsheet !~ /^(e|g)/i) { | |
306 die "\n### Fatal error: Incorrect language for option '-f' given! Please choose either 'e|eng' or 'g|ger' for '-f'!\n"; | |
307 } | |
308 | |
309 if ($Mode =~ /tab2tbl/i && !$Opt_Gene && $Opt_Tags_Full) { | |
310 warn "\nOption '-t' set, but not '-g'. Forcing option '-g'!\n"; | |
311 $Opt_Gene = 1; | |
312 } | |
313 | |
314 | |
315 | |
316 ### Read in tbl or tab-separated data and write to result file in the opposite format | |
317 my $Out_File = $Input_File; | |
318 $Out_File =~ s/^(.+)\.\w+$/$1/; # strip filename extension | |
319 my $Error_File = 'hypo_putative_genes.txt'; | |
320 if ($Mode =~ /tbl2tab/i) { | |
321 my ($data_hash_ref, $tags_max_count_hash_ref) = read_tbl(); # subroutine | |
322 $Out_File .= '.tab'; | |
323 write_tab($data_hash_ref, $tags_max_count_hash_ref); # subroutine | |
324 | |
325 } elsif ($Mode =~ /tab2tbl/i) { | |
326 $Out_File .= '.tbl'; | |
327 read_tab_write_tbl(); # subroutine | |
328 } | |
329 | |
330 | |
331 | |
332 ### Message which file was created | |
333 if ($Mode =~ /tbl2tab/i) { | |
334 print "Input tbl file '$Input_File' was converted to tab output file '$Out_File'!\n"; | |
335 } elsif ($Mode =~ /tab2tbl/i) { | |
336 print "Input tab file '$Input_File' was converted to tbl output file '$Out_File'!\n"; | |
337 } | |
338 | |
339 if (-e $Error_File) { | |
340 if (-s $Error_File >= 40) { # smaller than just the header, which should be 27 bytes | |
341 warn "\n### Warning: CDSs found that are annotated with 'hypothetical|putative|predicted protein' but still include a '/gene' tag, see file '$Error_File'!\n"; | |
342 } elsif (-s $Error_File < 40) { | |
343 unlink $Error_File; | |
344 } | |
345 } | |
346 | |
347 | |
348 exit; | |
349 | |
350 ############### | |
351 # Subroutines # | |
352 ############### | |
353 | |
354 ### Subroutine to test for file existence and give warning to STDERR | |
355 sub file_exist { | |
356 my $file = shift; | |
357 if (-e $file) { | |
358 warn "\nThe result file \'$file\' exists already and will be overwritten!\n\n"; | |
359 return 1; | |
360 } | |
361 return 0; | |
362 } | |
363 | |
364 | |
365 | |
366 ### Print tag values to tab result file | |
367 sub print_tag2tab { | |
368 my ($tag, $data_hash_ref, $seq_id, $pos, $tag_max_count) = @_; | |
369 | |
370 if ($Opt_Concat) { # values concatenated by '~' in $data_hash_ref | |
371 if ($data_hash_ref->{$seq_id}->{$pos}->{$tag}) { | |
372 print "\t$data_hash_ref->{$seq_id}->{$pos}->{$tag}"; | |
373 return 1; | |
374 } else { | |
375 print "\t$Empty"; | |
376 return 1; | |
377 } | |
378 | |
379 } elsif (!$Opt_Concat) { # split concatenated values in individual values | |
380 my @values = split(/~/, $data_hash_ref->{$seq_id}->{$pos}->{$tag}) if ($data_hash_ref->{$seq_id}->{$pos}->{$tag}); | |
381 if (@values) { | |
382 foreach (@values) { | |
383 print "\t$_"; | |
384 } | |
385 print "\t$Empty" x ($tag_max_count - @values); # fill residual columns till maximum occurrence | |
386 } else { | |
387 print "\t$Empty" x $tag_max_count; | |
388 } | |
389 return 1; | |
390 } | |
391 | |
392 return 0; | |
393 } | |
394 | |
395 | |
396 | |
397 ### Print tag values to tbl result file | |
398 sub print_tag2tbl { | |
399 my ($tag, $value) = @_; | |
400 return 0 if ($value =~ /^$Empty$/); | |
401 if ($tag =~ /pseudo/) { | |
402 print "\t\t\t$tag\n"; | |
403 return 1; | |
404 } | |
405 | |
406 ### remove quotations from values introduced by Excel by saving as tab-separated file: | |
407 ### https://office.microsoft.com/en-001/excel-help/excel-formatting-and-features-that-are-not-transferred-to-other-file-formats-HP010014105.aspx | |
408 ### - if a cell contains a comma, the cell contents are enclosed in double quotation marks | |
409 ### - if the data contains a quotation mark, double quotation marks will replace the quotation mark, and the cell contents are also enclosed in double quotation marks | |
410 $value =~ s/""/"/g; | |
411 $value =~ s/^"//; | |
412 $value =~ s/"$//; | |
413 | |
414 foreach (split(/~/, $value)) { | |
415 print "\t\t\t$tag\t$_\n"; | |
416 } | |
417 return 1; | |
418 } | |
419 | |
420 | |
421 ### Read in data from tab input file and write it to tbl output file | |
422 sub read_tab_write_tbl { | |
423 file_exist($Error_File); # subroutine | |
424 open (my $error_file_fh, ">", $Error_File); | |
425 print $error_file_fh "row\tlocus_tag\tgene\tproduct\n"; | |
426 | |
427 open (my $input_file_fh, "<", $Input_File); | |
428 my $header = <$input_file_fh>; | |
429 $header =~ s/\R/\012/; # convert line to unix-style line endings | |
430 chomp $header; | |
431 if ($header !~ /^seq_id\tstart\tstop\tprimary_tag\t/) { # check if tbl file starts with mandatory header fields or quit | |
432 die "\n### Fatal error: Input tab file '$Input_File' doesn't start with the mandatory 'seq_id', 'start', 'stop', and 'primary_tag' tab-separated header fields. Sure this is a valid tab file?\nExiting program!\n\n"; | |
433 } | |
434 | |
435 my @tags; | |
436 foreach (split(/\t/, $header)) { | |
437 last if (/spreadsheet_locus_tag/); # skip all optional extra spreadsheet columns | |
438 push(@tags, $_); # store all header fields/columns to associate with each field in each line | |
439 } | |
440 | |
441 file_exist($Out_File); # subroutine | |
442 open (my $out_file_fh, ">", $Out_File); | |
443 select $out_file_fh; # select fh for standard print/f output | |
444 | |
445 my $row = 1; # count row numbers of tab input file for $Error_File (start with '1' as header already parsed above) | |
446 my $seq_id = ''; # store previous SeqID for multi-contig/replicon tab files | |
447 while (<$input_file_fh>) { | |
448 $row++; | |
449 $_ =~ s/\R/\012/; # convert line to unix-style line ending | |
450 chomp; | |
451 next if ($_ =~ /^\s+$/ || $_ =~ /^$/); # skip empty lines | |
452 | |
453 my ($locus_tag, $gene, $hypo_putative) = ('', '', ''); # needed for $Error_File | |
454 | |
455 my @cells = split(/\t/, $_); | |
456 for (my $i = 0; $i < 4; $i++) { # check each row for mandatory fields | |
457 if ($cells[$i] =~ /^$/) { | |
458 close $out_file_fh; | |
459 unlink $Out_File; | |
460 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a value for one of the mandatory fields 'seq_id', 'start', 'stop', or 'primary_tag'!\nExiting program!\n\n"; | |
461 } | |
462 } | |
463 | |
464 ### print SeqID | |
465 if ($cells[0] ne $seq_id) { # print new contig for multi-contig/replicon tab files | |
466 $seq_id = $cells[0]; | |
467 $cells[0] = $Locus_Prefix."_".$cells[0] if ($cells[0] !~ /$Locus_Prefix/ && $Locus_Prefix); # append locus_tag prefix only if not present already | |
468 print ">Feature $cells[0]\n"; | |
469 } | |
470 | |
471 ### print accessory 'gene' primary tags with '/locus_tag', '/gene', and potential '/pseudo' tags | |
472 if ($Opt_Gene && $cells[3] =~ /CDS|RNA/) { # accessory 'gene' primary tags only for CDS and RNA (rRNA, tRNA ...) features | |
473 print "$cells[1]\t$cells[2]\tgene\n"; | |
474 my $column_count = 0; | |
475 foreach my $tag (@tags) { | |
476 if ($tag =~ /locus_tag/ && $cells[$column_count] =~ /^$/) { # CDSs|RNAs mandatory need a '/locus_tag' with option '-g' | |
477 close $out_file_fh; | |
478 unlink $Out_File; | |
479 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n"; | |
480 } | |
481 print_tag2tbl($tag, $cells[$column_count]) if ($tag =~ /locus_tag|^gene$|pseudo/); # subroutine; '^gene$' needed, so 'gene_desc' isn't hit (see below) | |
482 $column_count++; | |
483 } | |
484 } | |
485 | |
486 ### print primary tag | |
487 print "$cells[1]\t$cells[2]"; # start\tstop | |
488 if ($cells[3] =~ /pseudo/) { # pseudo-gene should include '/pseudo' tag | |
489 print "\tgene\n"; | |
490 print "\t\t\tpseudo\n"; | |
491 } else { | |
492 print "\t$cells[3]\n"; | |
493 } | |
494 | |
495 ### print tags with values | |
496 for (my $i = 4; $i < @tags; $i++) { # start with field 5 of array with header fields/columns (the first 4 are mandatory, see above) | |
497 next if ($tags[$i] =~ /gene_desc/); # skip 'gene_desc' fields in tab file, reserved for pseudo-genes (see below) | |
498 | |
499 ### enforce mandatory tags for CDS primary tags | |
500 if ($tags[$i] =~ /product/ && $cells[3] =~ /CDS/) { | |
501 if ($cells[$i] =~ /(hypothetical|putative|predicted) protein/) { # needed for $Error_File | |
502 $hypo_putative = $cells[$i]; | |
503 } elsif ($cells[$i] =~ /^$/) { # CDSs mandatory need a value for '/product' | |
504 close $out_file_fh; | |
505 unlink $Out_File; | |
506 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for CDS primary tags!\nExiting program!\n\n"; | |
507 } | |
508 } | |
509 | |
510 if ($tags[$i] =~ /locus_tag/ && $cells[3] =~ /CDS/) { # '/protein_id' mandatory for 'CDS' primary tags | |
511 $locus_tag = $cells[$i]; # needed for $Error_File | |
512 my $protein_id = "$Protein_Id_Prefix".$cells[$i]; | |
513 print_tag2tbl('protein_id', $protein_id); # subroutine | |
514 } | |
515 next if ($tags[$i] =~ /protein_id/); # skip 'protein_id' field in tab file as they should be created from the 'locus_tag' column | |
516 | |
517 $gene = $cells[$i] if ($tags[$i] =~ /^gene$/ && $cells[3] =~ /CDS/); # needed for $Error_File | |
518 | |
519 ### enforce mandatory tags for CDS/RNA primary tags | |
520 next if ($tags[$i] =~ /locus_tag|^gene$/ && $Opt_Gene && !$Opt_Tags_Full && $cells[3] =~ /CDS|RNA/); # skip '/locus_tag' and '/gene' tags if accessory gene primary tags are present for CDS|RNA features (except option '-t' is set) | |
521 if ($tags[$i] =~ /product/ && $cells[3] =~ /RNA/ && $cells[$i] =~ /^$/) { # RNAs mandatory need a value for '/product' | |
522 close $out_file_fh; | |
523 unlink $Out_File; | |
524 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for RNA primary tags!\nExiting program!\n\n"; | |
525 } | |
526 | |
527 ### enforce mandatory tag for pseudo-genes (have 'pseudo' as primary tag in tab file) | |
528 if ($tags[$i] =~ /locus_tag/ && $Opt_Gene && $cells[3] =~ /pseudo/ && $cells[$i] =~ /^$/) { # pseudo-genes mandatory need a '/locus_tag' with option '-g' | |
529 close $out_file_fh; | |
530 unlink $Out_File; | |
531 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n"; | |
532 } | |
533 | |
534 ### the rest | |
535 if ($tags[$i] =~ /product/ && $cells[3] =~ /pseudo/) { # write 'product' values for pseudo-genes to '/gene_desc' tags | |
536 print_tag2tbl('gene_desc', $cells[$i]); # subroutine | |
537 } elsif ($tags[$i] =~ /pseudo/ && $cells[3] =~ /pseudo/) { # skip 'pseudo' tag if pseudo-gene | |
538 next; | |
539 } else { | |
540 print_tag2tbl($tags[$i], $cells[$i]); # subroutine | |
541 } | |
542 | |
543 } | |
544 print $error_file_fh "$row\t$locus_tag\t$gene\t$hypo_putative\n" if ($hypo_putative && $gene); | |
545 } | |
546 | |
547 select STDOUT; | |
548 close $input_file_fh; | |
549 close $error_file_fh; | |
550 close $out_file_fh; | |
551 return 1; | |
552 } | |
553 | |
554 | |
555 | |
556 ### Read in data from tbl input file | |
557 sub read_tbl { | |
558 open (my $input_file_fh, "<", $Input_File); | |
559 | |
560 my $seq_id = <$input_file_fh>; # SeqID | |
561 $seq_id =~ s/\R/\012/; # convert line to unix-style line endings | |
562 chomp $seq_id; | |
563 if ($seq_id !~ /^>Feature/) { # check if tbl file starts with mandatory '>Feature' and get first SeqID | |
564 die "\n### Fatal error: tbl file doesn't start with a '>Feature SeqID' line. Sure this is a valid tbl file?\nExiting program!\n\n"; | |
565 } else { | |
566 $seq_id =~ s/>Feature (\S+)\s*$/$1/; # only use non-whitespace characters as SeqID | |
567 } | |
568 | |
569 my %data; # hash-in-hash-in-hash to store tbl input data | |
570 my $pos_key; # store start..stop for each primary tag and use as key in %data | |
571 my $primary_tag; # store previous primary tag to determine if values for repeatedly occuring tags should be concatenated | |
572 my %tags_max_count; # hash to store all occuring tags with maximal number of presence (within a single primary tag) in the tbl file for final tab column headers | |
573 my @tags; # array to store all tags of each primary tag, supplement to %tags_max_count | |
574 | |
575 while (<$input_file_fh>) { | |
576 $_ =~ s/\R/\012/; # convert line to unix-style line ending | |
577 chomp; | |
578 next if ($_ =~ /^\s+$/); # skip empty lines | |
579 my @fields = split(/\t/, $_); | |
580 | |
581 ### get next SeqID from '>Feature' line for multi-contig/replicon tbl files | |
582 if ($fields[0] =~ /^>Feature (\S+)\s*$/) { | |
583 $seq_id = $1; | |
584 | |
585 ### get primary tags/features and fill %tags_max_count | |
586 } elsif ($fields[0] =~ /^\d+$/) { # $fields[2] with primary tag | |
587 foreach my $tag (@tags) { # fill %tags_max_count for previous primary tag | |
588 if ($tags_max_count{$tag}) { | |
589 $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags)); | |
590 } elsif (!$tags_max_count{$tag}) { | |
591 $tags_max_count{$tag} = grep(/$tag/, @tags); | |
592 } | |
593 } | |
594 @tags = (); # empty tags array for new current primary tag | |
595 | |
596 $pos_key = "$fields[0]..$fields[1]"; # position of primary tag used as key for %data, "start..stop" | |
597 $primary_tag = $fields[2]; | |
598 if (!$data{$seq_id}->{$pos_key}->{'primary_tag'} || $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # if primary tag not present or overwrite accessory 'gene' primary tag | |
599 $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag; # store data in anonymous hash-in-hash-in-hash | |
600 $data{$seq_id}->{$pos_key}->{'start'} = $fields[0]; # to be able to sort afterwards via the start position | |
601 } elsif ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /pseudo/) { # 'gene' primary tag with '/pseudo' tag will be replaced by 'pseudo' primary tag for pseudo-genes (see below), however if 'gene' primary tag is ACCESSORY to CDS|RNA primary tag replace by this primary tag and include '/pseudo' tag | |
602 $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag; | |
603 $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T'; # value 'T' for true | |
604 } | |
605 | |
606 ### get tags/qualifiers | |
607 } elsif ($fields[3] =~ /^\w+/) { | |
608 push(@tags, $fields[3]) if ($fields[3] !~ /gene_desc/); # store tags for current primary tag; skip '/gene_desc' as reserved for pseudo-genes (replaced by '/product' see below) | |
609 if ($fields[3] =~ /pseudo/) { | |
610 if ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # change 'gene' primary tag of pseudo-genes to 'pseudo' (if accessory 'gene' primary tag will be replaced by *actual* primary tag, see above) | |
611 $data{$seq_id}->{$pos_key}->{'primary_tag'} = 'pseudo'; | |
612 } else { # else include a '/pseudo' tag with value 'T' for true | |
613 $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T'; | |
614 } | |
615 next; # next line | |
616 } | |
617 | |
618 ### remove quotations from values introduced by Excel by saving as tab-separated file (see above) | |
619 $fields[4] =~ s/""/"/g; | |
620 $fields[4] =~ s/^"//; | |
621 $fields[4] =~ s/"$//; | |
622 | |
623 ### adjust '/gene' and '/product' values to NCBI standard | |
624 if ($fields[3] =~ /gene/) { | |
625 $fields[4] =~ s/(\w+)/\l$1/; # first character of gene name should be lower case | |
626 $fields[4] =~ s/^(\w)$/\u$1/; # one letter phage genes should be upper case | |
627 } elsif ($fields[3] =~ /product/) { | |
628 $fields[4] =~ s/\b([A-Z][a-z]{3,})/\l$1/g; # lower the case for '/protein' value initials | |
629 $fields[4] =~ s/(rossman|willebrand)/\u$1/; # exception to the rule | |
630 } | |
631 | |
632 if ($fields[3] =~ /gene_desc/) { # '/gene_desc' tags from pseudo-genes replaced by '/product' for resulting tab file | |
633 $data{$seq_id}->{$pos_key}->{'product'} = $fields[4]; | |
634 next; | |
635 } | |
636 if ($data{$seq_id}->{$pos_key}->{$fields[3]} && $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /$primary_tag/) { # tag already exists for this position (e.g. several EC_numbers), concatenate the additional values with '~' as separator only WITHIN the same primary tag (OTHERWISE overwrite in 'else' below) | |
637 $data{$seq_id}->{$pos_key}->{$fields[3]} .= '~'.$fields[4]; | |
638 } else { # tag doesn't exist yet or overwrite if current primary tag at the same position of previous (e.g. accessory 'gene' primary tag to CDS/RNA primary tag) | |
639 $data{$seq_id}->{$pos_key}->{$fields[3]} = $fields[4]; | |
640 } | |
641 } | |
642 } | |
643 foreach my $tag (@tags) { # fill %tags_max_count for last primary tag | |
644 if ($tags_max_count{$tag}) { | |
645 $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags)); | |
646 } elsif (!$tags_max_count{$tag}) { | |
647 $tags_max_count{$tag} = grep(/$tag/, @tags); | |
648 } | |
649 } | |
650 close $input_file_fh; | |
651 return \%data, \%tags_max_count; | |
652 } | |
653 | |
654 | |
655 | |
656 ### Write data to tab output file | |
657 sub write_tab { | |
658 my ($data_hash_ref, $tags_max_count_hash_ref) = @_; | |
659 file_exist($Out_File); # subroutine | |
660 open (my $out_file_fh, ">", $Out_File); | |
661 select $out_file_fh; # select fh for standard print/f output | |
662 | |
663 ### print header for tab result file | |
664 print "seq_id\tstart\tstop\tprimary_tag\tlocus_tag"; # mandatory columns/fields in tab file | |
665 if ($Opt_Concat) { | |
666 foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags | |
667 print "\t$_" if (!/locus_tag/); | |
668 } | |
669 } elsif (!$Opt_Concat) { | |
670 foreach (sort keys %{$tags_max_count_hash_ref}) { | |
671 print "\t$_" x $tags_max_count_hash_ref->{$_} if (!/locus_tag/); # print max occurrence (in tbl) of each residual tag | |
672 } | |
673 } | |
674 | |
675 print "\tspreadsheet_locus_tag\tposition\tdistance\tgene_number\tcontig_order" if ($Opt_Spreadsheet); # print optional spreadsheet header columns | |
676 print "\n"; | |
677 | |
678 ### variables for optional spreadsheet formulas | |
679 my @spread_columns = ("A".."AZ") if ($Opt_Spreadsheet); # columns in spreadsheet software for formulas | |
680 my ($tags_column_count, $spread_row_count, $spread_contig_order) = (0, 1, 1) if ($Opt_Spreadsheet); | |
681 if ($Opt_Spreadsheet) { | |
682 if ($Opt_Concat) { | |
683 $tags_column_count = (scalar keys %{$tags_max_count_hash_ref}) - scalar grep($_ =~ /locus_tag/, keys %{$tags_max_count_hash_ref}); # subtract tags for correct spreadsheet formulas | |
684 } elsif (!$Opt_Concat) { | |
685 foreach (keys %{$tags_max_count_hash_ref}) { | |
686 next if ($_ =~ /locus_tag/); | |
687 $tags_column_count += $tags_max_count_hash_ref->{$_}; | |
688 } | |
689 } | |
690 } | |
691 | |
692 ### print data from hash into tab result file, optional with G2L-style spreadsheet formulas | |
693 foreach my $seq_id (sort keys %{$data_hash_ref}) { | |
694 foreach my $pos (sort {$data_hash_ref->{$seq_id}->{$a}->{'start'} <=> $data_hash_ref->{$seq_id}->{$b}->{'start'}} keys $data_hash_ref->{$seq_id}) { # sort each position entry in %data via start position | |
695 print "$seq_id"; | |
696 my ($start, $stop) = split(/\.\./, $pos); | |
697 print "\t$start\t$stop"; | |
698 print "\t$data_hash_ref->{$seq_id}->{$pos}->{'primary_tag'}"; # primary_tag should always be present | |
699 print_tag2tab('locus_tag', $data_hash_ref, $seq_id, $pos, 1); # subroutine; locus_tag should occur always just one time per primary tag | |
700 foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags | |
701 print_tag2tab($_, $data_hash_ref, $seq_id, $pos, $tags_max_count_hash_ref->{$_}) if (!/locus_tag/); # subroutine | |
702 } | |
703 | |
704 ### G2L-style spreadsheet formulas | |
705 if ($Opt_Spreadsheet) { | |
706 $spread_row_count++; | |
707 if ($Formula_Lang_Spreadsheet =~ /^e/i) { | |
708 print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ',"0000")&"0"'; # spreadsheet column 'spreadsheet_locus_tag' | |
709 } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) { | |
710 print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ';"0000")&"0"'; | |
711 } | |
712 print "\t=MIN(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'position' | |
713 print "\t=", $spread_columns[$tags_column_count+6], $spread_row_count + 1, "-MAX(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'distance' | |
714 if ($Formula_Lang_Spreadsheet =~ /^e/i) { | |
715 print "\t=IF(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ",$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1,1)'; # spreadsheet column 'gene_number' | |
716 } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) { | |
717 print "\t=WENN(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ";$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1;1)'; | |
718 } | |
719 print "\t$spread_contig_order"; # spreadsheet column 'contig_order' | |
720 } | |
721 | |
722 print "\n"; | |
723 } | |
724 $spread_contig_order++; # next contig/replicon (SeqID) in tbl file | |
725 } | |
726 | |
727 select STDOUT; | |
728 close $out_file_fh; | |
729 return 1; | |
730 } |