Mercurial > repos > dereeper > pangenome_explorer

diff COG/bac-genomics-scripts/tbl2tab/tbl2tab.pl @ 3:e42d30da7a74 draft
Uploaded
author: dereeper
date: Thu, 30 May 2024 11:52:25 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COG/bac-genomics-scripts/tbl2tab/tbl2tab.pl	Thu May 30 11:52:25 2024 +0000
@@ -0,0 +1,730 @@
+#!/usr/bin/perl
+
+#######
+# POD #
+#######
+
+=pod
+
+=head1 NAME
+
+C<tbl2tab.pl> - convert tbl to tab-separated format and back
+
+=head1 SYNOPSIS
+
+C<perl tbl2tab.pl -m tbl2tab -i feature_table.tbl -s -l locus_prefix>
+
+B<or>
+
+C<perl tbl2tab.pl -m tab2tbl -i feature_table.tab -g -l locus_prefix
+-p "gnl|dbname|">
+
+=head1 DESCRIPTION
+
+NCBI's feature table (B<tbl>) format is needed for the submission of
+genomic data to GenBank with the NCBI tools
+L<Sequin|http://www.ncbi.nlm.nih.gov/Sequin/> or
+L<tbl2asn|http://www.ncbi.nlm.nih.gov/genbank/tbl2asn2>. tbl files
+can be created with automatic annotation systems like
+L<Prokka|http://www.vicbioinformatics.com/software.prokka.shtml>.
+C<tbl2tab.pl> can convert a tbl file to a tab-separated format (tab)
+and back to the tbl format. The tab-delimited format is useful to
+manipulate the data more comfortably in a spreadsheet software (e.g.
+LibreOffice or MS Excel). For a conversion back to tbl format save
+the file in the spreadsheet software as a tab-delimited text file.
+The script is intended for microbial genomes, but might also be
+useful for eukaryotes.
+
+Regular expressions are applied in mode B<tbl2tab> to correct gene
+names and words in '/product' values  to lowercase initials (with
+the exception of 'Rossman' and 'Willebrand'). The resulting tab file
+can then be used to check for possible errors.
+
+The first four header columns of the B<tab> format are mandatory,
+'seq_id' for the SeqID, and for each primary tag/feature (e.g. CDS,
+RNAs, repeat_region etc.), 'start', 'stop', and 'primary_tag'. These
+mandatory columns have to be filled in every row in the tab file.
+All the following columns will be included as tags/qualifiers (e.g.
+'/locus_tag', '/product', '/EC_number', '/note' etc.) in the
+conversion to the tbl file if a value is present.
+
+There are three special cases:
+
+B<First>, '/pseudo' will be included as a tag if I<any> value (the
+script uses 'T' for true) is present in the B<tab> format. If a
+primary tag is indicated as pseudo both the primary tag and the
+accessory 'gene' primary tag (for CDS/RNA features with option
+B<-g>) will include a '/pseudo' qualifier in the resulting B<tbl>
+file. B<Pseudo-genes> are indicated by 'pseudo' in the 'primary_tag'
+column, thus the 'pseudo' column is ignored in these cases.
+
+B<Second>, tag '/gene_desc' is reserved for the 'product' values of
+pseudo-genes, thus a 'gene_desc' column in a tab file will be
+ignored in the conversion to tbl.
+
+B<Third>, column 'protein_id' in a tab file will also be ignored in
+the conversion. '/protein_id' values are created from option B<-p>
+and the locus_tag for each CDS primary feature.
+
+Furthermore, with option B<-s> G2L-style spreadsheet formulas
+(L<Goettingen Genomics
+Laboratory|http://appmibio.uni-goettingen.de/>) can be included with
+additional columns, 'spreadsheet_locus_tag', 'position', 'distance',
+'gene_number', and 'contig_order'. These columns will not be
+included in a conversion to the tbl format. Thus, if you want to
+include e.g. the locus_tags from the formula in column
+'spreadsheet_locus_tag' in the resulting tbl file copy the B<values>
+to the column 'locus_tag'!
+
+To illustrate the process two example files are included in the
+repository, F<example.tbl> and F<example2.tab>, which are
+interconvertible (see L</"EXAMPLES"> below).
+
+B<Warning>, be aware of possible errors introduced by automatic
+format conversions using a spreadsheet software like MS Excel, see
+e.g. Zeeberg et al. 2004
+(L<http://www.ncbi.nlm.nih.gov/pubmed/15214961>).
+
+For more information regarding the feature table and the submission
+process see NCBI's L<prokaryotic annotation
+guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit> and the
+L<bacterial genome submission
+guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation>.
+
+=head1 OPTIONS
+
+=head2 Mandatory options
+
+=over 20
+
+=item B<-m>=I<tbl2tab|tab2tbl>, B<-mode>=I<tbl2tab|tab2tbl>
+
+Conversion mode, either 'tbl2tab' or 'tab2tbl' [default = 'tbl2tab']
+
+=item B<-i>=I<str>, B<-input>=I<str>
+
+Input tbl or tab file to be converted to the other format
+
+=back
+
+=head2 Optional options
+
+=over 20
+
+=item B<-h>, B<-help>
+
+Help (perldoc POD)
+
+=item B<-v>, B<-version>
+
+Print version number to C<STDERR>
+
+=back
+
+=head3 Mode B<tbl2tab>
+
+=over 20
+
+=item B<-l>=I<str>, B<-locus_prefix>=I<str>
+
+Only in combination with option B<-s> and there mandatory to include
+the locus_tag prefix in the formula for column 'spreadsheet_locus_tag'
+
+=item B<-c>, B<-concat>
+
+Concatenate values of identical tags within one primary tag with '~'
+(e.g. several '/EC_number' or '/inference' tags)
+
+=item B<-e>=I<str>, B<-empty>=I<str>
+
+String used for primary features without value for a tag [default = '']
+
+=item B<-s>, B<-spreadsheet>
+
+Include formulas for spreadsheet editing
+
+=item B<-f>=I<e|g>, B<-formula_lang>=I<e|g>
+
+Syntax language of the spreadsheet formulas, either 'English' or
+'German'. If you're still encountering problems with the formulas
+set the decimal and thousands separator manually in the options of
+the spreadsheet software (instead of using the operating system
+separators). [default = 'e']
+
+=back
+
+=head3 Mode B<tab2tbl>
+
+=over 20
+
+=item B<-l>=I<str>, B<-locus_prefix>=I<str>
+
+Prefix to the SeqID if not present already in the SeqID
+
+=item B<-g>, B<-gene>
+
+Include accessory 'gene' primary tags (with '/gene', '/locus_tag'
+and possibly '/pseudo' tags) for 'CDS/RNA' primary tags; NCBI standard
+
+=item B<-t>, B<-tags_full>
+
+Only in combination with option B<-g>, include '/gene' and
+'/locus_tag' tags additionally in primary tag, not only in accessory
+'gene' primary tag
+
+=item B<-p>=I<str>, B<-protein_id_prefix>=I<str>
+
+Prefix for '/protein_id' tags; don't forget the double quotes for
+the string, otherwise the shell will intepret as pipe [default =
+'gnl|goetting|']
+
+=back
+
+=head1 OUTPUT
+
+=over 20
+
+=item F<*.tab|tbl>
+
+Result file in the opposite format
+
+=item (F<hypo_putative_genes.txt>)
+
+Created in mode 'tab2tbl', indicates if CDSs are annotated as
+'hypothetical/putative/predicted protein' but still have a gene name
+
+=back
+
+=head1 EXAMPLES
+
+=over
+
+=item C<perl tbl2tab.pl -m tbl2tab -i example.tbl -s -l EPE>
+
+=item C<perl tbl2tab.pl -m tab2tbl -i example2.tab -g -l EPE>
+
+=back
+
+=head1 VERSION
+
+ 0.2                                               update: 29-10-2014
+ 0.1                                                       24-06-2014
+
+=head1 AUTHOR
+
+ Andreas Leimbach                               aleimba[at]gmx[dot]de
+
+=head1 LICENSE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 (GPLv3) of the License,
+or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see L<http://www.gnu.org/licenses/>.
+
+=cut
+
+
+########
+# MAIN #
+########
+
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+use Pod::Usage;
+
+
+
+### Get options with Getopt::Long, works also abbreviated and with two "--": -i, --i, -input ...
+my $Input_File; # input file
+my $Mode = 'tbl2tab'; # mode of script, i.e. either convert from tbl2tab or from tab2tbl; default 'tbl2tab'
+my $Locus_Prefix = ''; # required for option 'spreadsheet' in mode 'tbl2tab', in mode 'tab2tbl' optional
+my $Opt_Concat; # optionally, concatenate values of the same tag within one primary tag in a tbl file in one column in the resulting tab file with '~' (e.g. several 'EC_number' tags etc.)
+my $Empty = ''; # optionally, set what should be used for tags without a value in resulting tab file; default is nothing
+my $Opt_Spreadsheet; # optionally, include formulas for spreadsheet editing (e.g. Libre Office, MS Excel)
+my $Formula_Lang_Spreadsheet = 'e'; # optionally, either German or English formulas in Spreadsheet option; default 'e' for English
+my $Opt_Gene; # optionally, include accessory gene primary tags (with '/gene' and '/locus_tag' [and '/pseudo'] tags) for CDS|RNA primary tags
+my $Opt_Tags_Full; # optionally, include '/gene' and '/locus_tag' additionally in primary tag not only accessory 'gene' primary tag
+my $Protein_Id_Prefix = 'gnl|goetting|'; # optionally give a different string to prefix the '/protein_id' tags
+my $VERSION = 0.2;
+my ($Opt_Version, $Opt_Help);
+GetOptions ('input=s' => \$Input_File,
+            'mode=s' => \$Mode,
+            'locus_prefix:s' => \$Locus_Prefix,
+            'concat' => \$Opt_Concat,
+            'empty:s' => \$Empty,
+            'spreadsheet' => \$Opt_Spreadsheet,
+            'formula_lang:s' => \$Formula_Lang_Spreadsheet,
+            'gene' => \$Opt_Gene,
+            'tags_full' => \$Opt_Tags_Full,
+            'protein_id_prefix:s' => \$Protein_Id_Prefix,
+            'version' => \$Opt_Version,
+            'help|?' => \$Opt_Help);
+
+
+
+### Run perldoc on POD
+pod2usage(-verbose => 2) if ($Opt_Help);
+die "$0 $VERSION\n" if ($Opt_Version);
+if (!$Input_File) {
+    my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n";
+    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
+} elsif (!($Mode =~ /tbl2tab/i || $Mode =~ /tab2tbl/i)) { # case-insensitive
+    my $warning = "\n### Fatal error: Incorrect run mode with option '-m' given! Please choose either 'tbl2tab' or 'tab2tbl' for '-m'!\n";
+    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
+}
+
+
+
+### Enforce mandatory or optional options
+if ($Mode =~ /tbl2tab/i && ($Opt_Gene || $Opt_Tags_Full || $Protein_Id_Prefix ne 'gnl|goetting|')) {
+    warn "\nIncompatible option(s) '-g', '-p', or '-t' set with mode 'tbl2tab'. Ignoring the option(s)!\n";
+} elsif ($Mode =~ /tab2tbl/i && ($Opt_Concat || $Empty || $Opt_Spreadsheet || $Formula_Lang_Spreadsheet ne 'e')) {
+    warn "\nIncompatible option(s) '-c', '-e', '-f', or '-s' set with mode 'tab2tbl'. Ignoring the option(s)!\n";
+    $Formula_Lang_Spreadsheet = 'e' if ($Formula_Lang_Spreadsheet ne 'e'); # avoid die with error below if option not set correctly
+}
+
+if ($Mode =~ /tbl2tab/i && ($Locus_Prefix || $Formula_Lang_Spreadsheet ne 'e') && !$Opt_Spreadsheet) {
+    warn "\nOption(s) '-l' or '-f' set, but not '-s'. Forcing option '-s'!\n";
+    $Opt_Spreadsheet = 1;
+}
+if ($Mode =~ /tbl2tab/i && !$Locus_Prefix && $Opt_Spreadsheet) {
+    warn "\nOption '-s' set, but not '-l'. Please give a prefix for the locus tags: ";
+    chomp($Locus_Prefix = <>);
+}
+
+if ($Formula_Lang_Spreadsheet !~ /^(e|g)/i) {
+    die "\n### Fatal error: Incorrect language for option '-f' given! Please choose either 'e|eng' or 'g|ger' for '-f'!\n";
+}
+
+if ($Mode =~ /tab2tbl/i && !$Opt_Gene && $Opt_Tags_Full) {
+    warn "\nOption '-t' set, but not '-g'. Forcing option '-g'!\n";
+    $Opt_Gene = 1;
+}
+
+
+
+### Read in tbl or tab-separated data and write to result file in the opposite format
+my $Out_File = $Input_File;
+$Out_File =~ s/^(.+)\.\w+$/$1/; # strip filename extension
+my $Error_File = 'hypo_putative_genes.txt';
+if ($Mode =~ /tbl2tab/i) {
+    my ($data_hash_ref, $tags_max_count_hash_ref) = read_tbl(); # subroutine
+    $Out_File .= '.tab';
+    write_tab($data_hash_ref, $tags_max_count_hash_ref); # subroutine
+
+} elsif ($Mode =~ /tab2tbl/i) {
+    $Out_File .= '.tbl';
+    read_tab_write_tbl(); # subroutine
+}
+
+
+
+### Message which file was created
+if ($Mode =~ /tbl2tab/i) {
+    print "Input tbl file '$Input_File' was converted to tab output file '$Out_File'!\n";
+} elsif ($Mode =~ /tab2tbl/i) {
+    print "Input tab file '$Input_File' was converted to tbl output file '$Out_File'!\n";
+}
+
+if (-e $Error_File) {
+    if (-s $Error_File >= 40) { # smaller than just the header, which should be 27 bytes
+        warn "\n### Warning: CDSs found that are annotated with 'hypothetical|putative|predicted protein' but still include a '/gene' tag, see file '$Error_File'!\n";
+    } elsif (-s $Error_File < 40) {
+        unlink $Error_File;
+    }
+}
+
+
+exit;
+
+###############
+# Subroutines #
+###############
+
+### Subroutine to test for file existence and give warning to STDERR
+sub file_exist {
+    my $file = shift;
+    if (-e $file) {
+        warn "\nThe result file \'$file\' exists already and will be overwritten!\n\n";
+        return 1;
+    }
+    return 0;
+}
+
+
+
+### Print tag values to tab result file
+sub print_tag2tab {
+    my ($tag, $data_hash_ref, $seq_id, $pos, $tag_max_count) = @_;
+
+    if ($Opt_Concat) { # values concatenated by '~' in $data_hash_ref
+        if ($data_hash_ref->{$seq_id}->{$pos}->{$tag}) {
+            print "\t$data_hash_ref->{$seq_id}->{$pos}->{$tag}";
+            return 1;
+        } else {
+            print "\t$Empty";
+            return 1;
+        }
+
+    } elsif (!$Opt_Concat) { # split concatenated values in individual values
+        my @values = split(/~/, $data_hash_ref->{$seq_id}->{$pos}->{$tag}) if ($data_hash_ref->{$seq_id}->{$pos}->{$tag});
+        if (@values) {
+            foreach (@values) {
+                print "\t$_";
+            }
+            print "\t$Empty" x ($tag_max_count - @values); # fill residual columns till maximum occurrence
+        } else {
+            print "\t$Empty" x $tag_max_count;
+        }
+        return 1;
+    }
+
+    return 0;
+}
+
+
+
+### Print tag values to tbl result file
+sub print_tag2tbl {
+    my ($tag, $value) = @_;
+    return 0 if ($value =~ /^$Empty$/);
+    if ($tag =~ /pseudo/) {
+        print "\t\t\t$tag\n";
+        return 1;
+    }
+
+    ### remove quotations from values introduced by Excel by saving as tab-separated file:
+    ### https://office.microsoft.com/en-001/excel-help/excel-formatting-and-features-that-are-not-transferred-to-other-file-formats-HP010014105.aspx
+    ### - if a cell contains a comma, the cell contents are enclosed in double quotation marks
+    ### - if the data contains a quotation mark, double quotation marks will replace the quotation mark, and the cell contents are also enclosed in double quotation marks
+    $value =~ s/""/"/g;
+    $value =~ s/^"//;
+    $value =~ s/"$//;
+
+    foreach (split(/~/, $value)) {
+        print "\t\t\t$tag\t$_\n";
+    }
+    return 1;
+}
+
+
+### Read in data from tab input file and write it to tbl output file
+sub read_tab_write_tbl {
+    file_exist($Error_File); # subroutine
+    open (my $error_file_fh, ">", $Error_File);
+    print $error_file_fh "row\tlocus_tag\tgene\tproduct\n";
+
+    open (my $input_file_fh, "<", $Input_File);
+    my $header = <$input_file_fh>;
+    $header =~ s/\R/\012/; # convert line to unix-style line endings
+    chomp $header;
+    if ($header !~ /^seq_id\tstart\tstop\tprimary_tag\t/) { # check if tbl file starts with mandatory header fields or quit
+        die "\n### Fatal error: Input tab file '$Input_File' doesn't start with the mandatory 'seq_id', 'start', 'stop', and 'primary_tag' tab-separated header fields. Sure this is a valid tab file?\nExiting program!\n\n";
+    }
+
+    my @tags;
+    foreach (split(/\t/, $header)) {
+        last if (/spreadsheet_locus_tag/); # skip all optional extra spreadsheet columns
+        push(@tags, $_); # store all header fields/columns to associate with each field in each line
+    }
+
+    file_exist($Out_File); # subroutine
+    open (my $out_file_fh, ">", $Out_File);
+    select $out_file_fh; # select fh for standard print/f output
+
+    my $row = 1; # count row numbers of tab input file for $Error_File (start with '1' as header already parsed above)
+    my $seq_id = ''; # store previous SeqID for multi-contig/replicon tab files
+    while (<$input_file_fh>) {
+        $row++;
+        $_ =~ s/\R/\012/; # convert line to unix-style line ending
+        chomp;
+        next if ($_ =~ /^\s+$/ || $_ =~ /^$/); # skip empty lines
+
+        my ($locus_tag, $gene, $hypo_putative) = ('', '', ''); # needed for $Error_File
+
+        my @cells = split(/\t/, $_);
+        for (my $i = 0; $i < 4; $i++) { # check each row for mandatory fields
+            if ($cells[$i] =~ /^$/) {
+                close $out_file_fh;
+                unlink $Out_File;
+                die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a value for one of the mandatory fields 'seq_id', 'start', 'stop', or 'primary_tag'!\nExiting program!\n\n";
+            }
+        }
+
+        ### print SeqID
+        if ($cells[0] ne $seq_id) { # print new contig for multi-contig/replicon tab files
+            $seq_id = $cells[0];
+            $cells[0] = $Locus_Prefix."_".$cells[0] if ($cells[0] !~ /$Locus_Prefix/ && $Locus_Prefix); # append locus_tag prefix only if not present already
+            print ">Feature $cells[0]\n";
+        }
+
+        ### print accessory 'gene' primary tags with '/locus_tag', '/gene', and potential '/pseudo' tags
+        if ($Opt_Gene && $cells[3] =~ /CDS|RNA/) { # accessory 'gene' primary tags only for CDS and RNA (rRNA, tRNA ...) features
+            print "$cells[1]\t$cells[2]\tgene\n";
+            my $column_count = 0;
+            foreach my $tag (@tags) {
+                if ($tag =~ /locus_tag/ && $cells[$column_count] =~ /^$/) { # CDSs|RNAs mandatory need a '/locus_tag' with option '-g'
+                    close $out_file_fh;
+                    unlink $Out_File;
+                    die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n";
+                }
+                print_tag2tbl($tag, $cells[$column_count]) if ($tag =~ /locus_tag|^gene$|pseudo/); # subroutine; '^gene$' needed, so 'gene_desc' isn't hit (see below)
+                $column_count++;
+            }
+        }
+
+        ### print primary tag
+        print "$cells[1]\t$cells[2]"; # start\tstop
+        if ($cells[3] =~ /pseudo/) { # pseudo-gene should include '/pseudo' tag
+            print "\tgene\n";
+            print "\t\t\tpseudo\n";
+        } else {
+            print "\t$cells[3]\n";
+        }
+
+        ### print tags with values
+        for (my $i = 4; $i < @tags; $i++) { # start with field 5 of array with header fields/columns (the first 4 are mandatory, see above)
+            next if ($tags[$i] =~ /gene_desc/); # skip 'gene_desc' fields in tab file, reserved for pseudo-genes (see below)
+
+            ### enforce mandatory tags for CDS primary tags
+            if ($tags[$i] =~ /product/ && $cells[3] =~ /CDS/) {
+                if ($cells[$i] =~ /(hypothetical|putative|predicted) protein/) { # needed for $Error_File
+                    $hypo_putative = $cells[$i];
+                } elsif ($cells[$i] =~ /^$/) { # CDSs mandatory need a value for '/product'
+                    close $out_file_fh;
+                    unlink $Out_File;
+                    die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for CDS primary tags!\nExiting program!\n\n";
+                }
+            }
+
+            if ($tags[$i] =~ /locus_tag/ && $cells[3] =~ /CDS/) { # '/protein_id' mandatory for 'CDS' primary tags
+                $locus_tag = $cells[$i]; # needed for $Error_File
+                my $protein_id = "$Protein_Id_Prefix".$cells[$i];
+                print_tag2tbl('protein_id', $protein_id); # subroutine
+            }
+            next if ($tags[$i] =~ /protein_id/); # skip 'protein_id' field in tab file as they should be created from the 'locus_tag' column
+
+            $gene = $cells[$i] if ($tags[$i] =~ /^gene$/ && $cells[3] =~ /CDS/); # needed for $Error_File
+
+            ### enforce mandatory tags for CDS/RNA primary tags
+            next if ($tags[$i] =~ /locus_tag|^gene$/ && $Opt_Gene && !$Opt_Tags_Full && $cells[3] =~ /CDS|RNA/); # skip '/locus_tag' and '/gene' tags if accessory gene primary tags are present for CDS|RNA features (except option '-t' is set)
+            if ($tags[$i] =~ /product/ && $cells[3] =~ /RNA/ && $cells[$i] =~ /^$/) { # RNAs mandatory need a value for '/product'
+                close $out_file_fh;
+                unlink $Out_File;
+                die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for RNA primary tags!\nExiting program!\n\n";
+            }
+
+            ### enforce mandatory tag for pseudo-genes (have 'pseudo' as primary tag in tab file)
+            if ($tags[$i] =~ /locus_tag/ && $Opt_Gene && $cells[3] =~ /pseudo/ && $cells[$i] =~ /^$/) { # pseudo-genes mandatory need a '/locus_tag' with option '-g'
+                close $out_file_fh;
+                unlink $Out_File;
+                die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n";
+            }
+
+            ### the rest
+            if ($tags[$i] =~ /product/ && $cells[3] =~ /pseudo/) { # write 'product' values for pseudo-genes to '/gene_desc' tags
+                print_tag2tbl('gene_desc', $cells[$i]); # subroutine
+            } elsif ($tags[$i] =~ /pseudo/ && $cells[3] =~ /pseudo/) { # skip 'pseudo' tag if pseudo-gene
+                next;
+            } else {
+                print_tag2tbl($tags[$i], $cells[$i]); # subroutine
+            }
+
+        }
+        print $error_file_fh "$row\t$locus_tag\t$gene\t$hypo_putative\n" if ($hypo_putative && $gene);
+    }
+
+    select STDOUT;
+    close $input_file_fh;
+    close $error_file_fh;
+    close $out_file_fh;
+    return 1;
+}
+
+
+
+### Read in data from tbl input file
+sub read_tbl {
+    open (my $input_file_fh, "<", $Input_File);
+
+    my $seq_id = <$input_file_fh>; # SeqID
+    $seq_id =~ s/\R/\012/; # convert line to unix-style line endings
+    chomp $seq_id;
+    if ($seq_id !~ /^>Feature/) { # check if tbl file starts with mandatory '>Feature' and get first SeqID
+        die "\n### Fatal error: tbl file doesn't start with a '>Feature SeqID' line. Sure this is a valid tbl file?\nExiting program!\n\n";
+    } else {
+        $seq_id =~ s/>Feature (\S+)\s*$/$1/; # only use non-whitespace characters as SeqID
+    }
+
+    my %data; # hash-in-hash-in-hash to store tbl input data
+    my $pos_key; # store start..stop for each primary tag and use as key in %data
+    my $primary_tag; # store previous primary tag to determine if values for repeatedly occuring tags should be concatenated
+    my %tags_max_count; # hash to store all occuring tags with maximal number of presence (within a single primary tag) in the tbl file for final tab column headers
+    my @tags; # array to store all tags of each primary tag, supplement to %tags_max_count
+
+    while (<$input_file_fh>) {
+        $_ =~ s/\R/\012/; # convert line to unix-style line ending
+        chomp;
+        next if ($_ =~ /^\s+$/); # skip empty lines
+        my @fields = split(/\t/, $_);
+
+        ### get next SeqID from '>Feature' line for multi-contig/replicon tbl files
+        if ($fields[0] =~ /^>Feature (\S+)\s*$/) {
+            $seq_id = $1;
+
+        ### get primary tags/features and fill %tags_max_count
+        } elsif ($fields[0] =~ /^\d+$/) { # $fields[2] with primary tag
+            foreach my $tag (@tags) { # fill %tags_max_count for previous primary tag
+                if ($tags_max_count{$tag}) {
+                    $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags));
+                } elsif (!$tags_max_count{$tag}) {
+                    $tags_max_count{$tag} = grep(/$tag/, @tags);
+                }
+            }
+            @tags = (); # empty tags array for new current primary tag
+
+            $pos_key = "$fields[0]..$fields[1]"; # position of primary tag used as key for %data, "start..stop"
+            $primary_tag = $fields[2];
+            if (!$data{$seq_id}->{$pos_key}->{'primary_tag'} || $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # if primary tag not present or overwrite accessory 'gene' primary tag
+                $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag; # store data in anonymous hash-in-hash-in-hash
+                $data{$seq_id}->{$pos_key}->{'start'} = $fields[0]; # to be able to sort afterwards via the start position
+            } elsif ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /pseudo/) { # 'gene' primary tag with '/pseudo' tag will be replaced by 'pseudo' primary tag for pseudo-genes (see below), however if 'gene' primary tag is ACCESSORY to CDS|RNA primary tag replace by this primary tag and include '/pseudo' tag
+                $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag;
+                $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T'; # value 'T' for true
+            }
+
+        ### get tags/qualifiers
+        } elsif ($fields[3] =~ /^\w+/) {
+            push(@tags, $fields[3]) if ($fields[3] !~ /gene_desc/); # store tags for current primary tag; skip '/gene_desc' as reserved for pseudo-genes (replaced by '/product' see below)
+            if ($fields[3] =~ /pseudo/) {
+                if ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # change 'gene' primary tag of pseudo-genes to 'pseudo' (if accessory 'gene' primary tag will be replaced by *actual* primary tag, see above)
+                    $data{$seq_id}->{$pos_key}->{'primary_tag'} = 'pseudo';
+                } else { # else include a '/pseudo' tag with value 'T' for true
+                    $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T';
+                }
+                next; # next line
+            }
+
+            ### remove quotations from values introduced by Excel by saving as tab-separated file (see above)
+            $fields[4] =~ s/""/"/g;
+            $fields[4] =~ s/^"//;
+            $fields[4] =~ s/"$//;
+
+            ### adjust '/gene' and '/product' values to NCBI standard
+            if ($fields[3] =~ /gene/) {
+                $fields[4] =~ s/(\w+)/\l$1/; # first character of gene name should be lower case
+                $fields[4] =~ s/^(\w)$/\u$1/; # one letter phage genes should be upper case
+            } elsif ($fields[3] =~ /product/) {
+                $fields[4] =~ s/\b([A-Z][a-z]{3,})/\l$1/g; # lower the case for '/protein' value initials
+                $fields[4] =~ s/(rossman|willebrand)/\u$1/; # exception to the rule
+            }
+
+            if ($fields[3] =~ /gene_desc/) { # '/gene_desc' tags from pseudo-genes replaced by '/product' for resulting tab file
+                $data{$seq_id}->{$pos_key}->{'product'} = $fields[4];
+                next;
+            }
+            if ($data{$seq_id}->{$pos_key}->{$fields[3]} && $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /$primary_tag/) { # tag already exists for this position (e.g. several EC_numbers), concatenate the additional values with '~' as separator only WITHIN the same primary tag (OTHERWISE overwrite in 'else' below)
+                $data{$seq_id}->{$pos_key}->{$fields[3]} .= '~'.$fields[4];
+            } else { # tag doesn't exist yet or overwrite if current primary tag at the same position of previous (e.g. accessory 'gene' primary tag to CDS/RNA primary tag)
+                $data{$seq_id}->{$pos_key}->{$fields[3]} = $fields[4];
+            }
+        }
+    }
+    foreach my $tag (@tags) { # fill %tags_max_count for last primary tag
+        if ($tags_max_count{$tag}) {
+            $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags));
+        } elsif (!$tags_max_count{$tag}) {
+            $tags_max_count{$tag} = grep(/$tag/, @tags);
+        }
+    }
+    close $input_file_fh;
+    return \%data, \%tags_max_count;
+}
+
+
+
+### Write data to tab output file
+sub write_tab {
+    my ($data_hash_ref, $tags_max_count_hash_ref) = @_;
+    file_exist($Out_File); # subroutine
+    open (my $out_file_fh, ">", $Out_File);
+    select $out_file_fh; # select fh for standard print/f output
+
+    ### print header for tab result file
+    print "seq_id\tstart\tstop\tprimary_tag\tlocus_tag"; # mandatory columns/fields in tab file
+    if ($Opt_Concat) {
+        foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags
+            print "\t$_" if (!/locus_tag/);
+        }
+    } elsif (!$Opt_Concat) {
+        foreach (sort keys %{$tags_max_count_hash_ref}) {
+            print "\t$_" x $tags_max_count_hash_ref->{$_} if (!/locus_tag/); # print max occurrence (in tbl) of each residual tag
+        }
+    }
+
+    print "\tspreadsheet_locus_tag\tposition\tdistance\tgene_number\tcontig_order" if ($Opt_Spreadsheet); # print optional spreadsheet header columns
+    print "\n";
+
+    ### variables for optional spreadsheet formulas
+    my @spread_columns = ("A".."AZ") if ($Opt_Spreadsheet); # columns in spreadsheet software for formulas
+    my ($tags_column_count, $spread_row_count, $spread_contig_order) = (0, 1, 1) if ($Opt_Spreadsheet);
+    if ($Opt_Spreadsheet) {
+        if ($Opt_Concat) {
+            $tags_column_count = (scalar keys %{$tags_max_count_hash_ref}) - scalar grep($_ =~ /locus_tag/, keys %{$tags_max_count_hash_ref}); # subtract tags for correct spreadsheet formulas
+        } elsif (!$Opt_Concat) {
+            foreach (keys %{$tags_max_count_hash_ref}) {
+                next if ($_ =~ /locus_tag/);
+                $tags_column_count += $tags_max_count_hash_ref->{$_};
+            }
+        }
+    }
+
+    ### print data from hash into tab result file, optional with G2L-style spreadsheet formulas
+    foreach my $seq_id (sort keys %{$data_hash_ref}) {
+        foreach my $pos (sort {$data_hash_ref->{$seq_id}->{$a}->{'start'} <=> $data_hash_ref->{$seq_id}->{$b}->{'start'}} keys $data_hash_ref->{$seq_id}) { # sort each position entry in %data via start position
+            print "$seq_id";
+            my ($start, $stop) = split(/\.\./, $pos);
+            print "\t$start\t$stop";
+            print "\t$data_hash_ref->{$seq_id}->{$pos}->{'primary_tag'}"; # primary_tag should always be present
+            print_tag2tab('locus_tag', $data_hash_ref, $seq_id, $pos, 1); # subroutine; locus_tag should occur always just one time per primary tag
+            foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags
+                print_tag2tab($_, $data_hash_ref, $seq_id, $pos, $tags_max_count_hash_ref->{$_}) if (!/locus_tag/); # subroutine
+            }
+
+            ### G2L-style spreadsheet formulas
+            if ($Opt_Spreadsheet) {
+                $spread_row_count++;
+                if ($Formula_Lang_Spreadsheet =~ /^e/i) {
+                    print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ',"0000")&"0"'; # spreadsheet column 'spreadsheet_locus_tag'
+                } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) {
+                    print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ';"0000")&"0"';
+                }
+                print "\t=MIN(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'position'
+                print "\t=", $spread_columns[$tags_column_count+6], $spread_row_count + 1, "-MAX(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'distance'
+                if ($Formula_Lang_Spreadsheet =~ /^e/i) {
+                    print "\t=IF(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ",$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1,1)'; # spreadsheet column 'gene_number'
+                } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) {
+                    print "\t=WENN(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ";$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1;1)';
+                }
+                print "\t$spread_contig_order"; # spreadsheet column 'contig_order'
+            }
+
+            print "\n";
+        }
+        $spread_contig_order++; # next contig/replicon (SeqID) in tbl file
+    }
+
+    select STDOUT;
+    close $out_file_fh;
+    return 1;
+}
author	dereeper
date	Thu, 30 May 2024 11:52:25 +0000
parents
children