diff COG/bac-genomics-scripts/rename_fasta_id/rename_fasta_id.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/COG/bac-genomics-scripts/rename_fasta_id/rename_fasta_id.pl	Thu May 30 11:52:25 2024 +0000
@@ -0,0 +1,265 @@
+#!/usr/bin/perl
+
+#######
+# POD #
+#######
+
+=pod
+
+=head1 NAME
+
+C<rename_fasta_id.pl> - rename fasta IDs according to regular expressions
+
+=head1 SYNOPSIS
+
+C<perl rename_fasta_id.pl -i file.fasta -p "NODE_.+$" -r "K-12_" -n -a c E<gt> out.fasta>
+
+B<or>
+
+C<zcat file.fasta.gz | perl rename_fasta_id.pl -i - -p "coli" -r "" -o E<gt> out.fasta>
+
+=head1 DESCRIPTION
+
+This script uses the built-in Perl substitution operator C<s///> to
+replace strings in FASTA IDs. To do this, a B<pattern> and a
+B<replacement> have to be provided (Perl regular expression syntax
+can be used). The leading '>' character for the FASTA ID will be
+removed before the substitution and added again afterwards. FASTA
+IDs will be searched for matches with the B<pattern>, and if found
+the B<pattern> will be replaced by the B<replacement>.
+
+B<IMPORTANT>: Enclose the B<pattern> and the B<replacement> in
+quotation marks (' or ") if they contain characters that would be
+interpreted by the shell (e.g. pipes '|', brackets etc.).
+
+For substitutions without any appendices in a UNIX OS you can of
+course just use the great
+L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
+C<man sed>), e.g.:
+
+C<sed 's/^E<gt>pattern/E<gt>replacement/' file.fasta>
+
+=head1 OPTIONS
+
+=head2 Mandatory options
+
+=over 20
+
+=item B<-i>=I<str>, B<-input>=I<str>
+
+Input FASTA file or piped STDIN (-) from a gzipped file
+
+=item B<-p>=I<str>, B<-pattern>=I<str>
+
+Pattern to be replaced in FASTA ID
+
+=item B<-r>=I<str>, B<-replacement>=I<str>
+
+Replacement to replace the pattern with. To entirely remove the
+pattern use '' or "" as input for B<-r>.
+
+=back
+
+=head2 Optional options
+
+=over 20
+
+=item B<-h>, B<-help>
+
+Help (perldoc POD)
+
+=item B<-c>, B<-case-insensitive>
+
+Match pattern case-insensitive
+
+=item B<-g>, B<-global>
+
+Replace pattern globally in the string
+
+=item B<-n>, B<-numerate>
+
+Append a numeration/the count of the pattern hits to the
+replacement. This is e.g. useful to number contigs consecutively in
+a draft genome.
+
+=item B<-a>=I<str>, B<-append>=I<str>
+
+Append a string after the numeration, e.g. 'c' for chromosome
+
+=item B<-o>, B<-output>
+
+Verbose output of the substitutions that were carried out, printed
+to C<STDERR>
+
+=item B<-v>, B<-version>
+
+Print version number to C<STDERR>
+
+=back
+
+=head1 OUTPUT
+
+=over 20
+
+=item C<STDOUT>
+
+The FASTA file with substituted ID lines is printed to C<STDOUT>.
+Redirect or pipe into another tool as needed.
+
+=back
+
+=head1 EXAMPLES
+
+=over
+
+=item C<perl rename_fasta_id.pl -i file.fasta -p "T" -r "a" -c -g -o>
+
+=back
+
+=head1 VERSION
+
+ 0.1                                                 09-11-2014
+
+=head1 AUTHOR
+
+ Andreas Leimbach                                    aleimba[at]gmx[dot]de
+
+=head1 LICENSE
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 (GPLv3) of the License,
+or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see L<http://www.gnu.org/licenses/>.
+
+=cut
+
+
+########
+# MAIN #
+########
+
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+use Pod::Usage;
+
+### Get the options with Getopt::Long
+my $Input_File; # input fasta file
+my $Pattern; # pattern to search for in the FASTA IDs
+my $Replacement; # regex to replace pattern with
+my $Opt_Case; # substitute case-insensitive
+my $Opt_Global; # substitute pattern globally in string
+my $Opt_Numerate; # append the count of the performed substitions to each replacement regex
+my $Append; # append an additional string after $Opt_Numerate
+my $Opt_Output; # print substitutions to STDERR
+my $VERSION = 0.1;
+my ($Opt_Version, $Opt_Help);
+GetOptions ('input=s' => \$Input_File,
+            'pattern=s' => \$Pattern,
+            'replacement=s' => \$Replacement,
+            'case-insensitive' => \$Opt_Case,
+            'global' => \$Opt_Global,
+            'numerate' => \$Opt_Numerate,
+            'append:s' => \$Append,
+            'output' => \$Opt_Output,
+            'version' => \$Opt_Version,
+            'help|?' => \$Opt_Help);
+
+
+
+### Run perldoc on POD
+pod2usage(-verbose => 2) if ($Opt_Help);
+die "$0 $VERSION\n" if ($Opt_Version);
+if (!$Input_File || !$Pattern) {
+    my $warning = "\n### Fatal error: Options '-i' or '-p' or their arguments are missing!\n";
+    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
+}
+
+
+
+### Pipe input from STDIN or open input file
+my $Input_Fh;
+if ($Input_File eq '-') { # file input via STDIN
+    $Input_Fh = *STDIN; # capture typeglob of STDIN
+} else { # input via input file
+    open ($Input_Fh, "<", "$Input_File");
+}
+
+
+
+### Parse FASTA file
+my $Substitution_Count = 0; # count substitutions
+while (<$Input_Fh>) {
+    chomp;
+
+    # only substitute in FASTA ID lines
+    if (/^>/) {
+        # only substitute if pattern found, case-sensitive or case-INsensitive
+        if (/$Pattern/ || (/$Pattern/i && $Opt_Case)) {
+            $_ = substitute_string($_); # subroutine
+
+        # "reprint" FASTA IDs, which don't fit the pattern
+        } else {
+            print "$_\n";
+        }
+
+    # "reprint" sequence/non-ID lines of FASTA files
+    } else {
+        print "$_\n";
+    }
+}
+print STDERR "$Substitution_Count substitutions have been carried out\n";
+
+exit;
+
+
+#############
+#Subroutines#
+#############
+
+### Subroutine to rename headers/ID lines of the FASTA file
+sub substitute_string {
+    my $string = shift;
+    $string =~ s/^>//; # get rid of '>', append afterwards
+
+    print STDERR "$string " if ($Opt_Output); # optional verbose output to STDERR
+    $Substitution_Count++; # count occurences of carried out substitutions
+
+    # substitutions
+    if ($Opt_Global && $Opt_Case) {
+        $string =~ s/$Pattern/$Replacement/gi;
+    } elsif ($Opt_Case) {
+        $string =~ s/$Pattern/$Replacement/i;
+    } elsif ($Opt_Global) {
+        $string =~ s/$Pattern/$Replacement/g;
+    } else {
+        $string =~ s/$Pattern/$Replacement/;
+    }
+
+    # output to STDOUT, optionally STDERR
+    print ">$string";
+    print STDERR "-> $string" if ($Opt_Output);
+    if ($Opt_Numerate) {
+        print "$Substitution_Count";
+        print STDERR "$Substitution_Count" if ($Opt_Output);
+    }
+
+    if ($Append) {
+        print "$Append";
+        print STDERR "$Append" if ($Opt_Output);
+    }
+
+    print "\n";
+    print STDERR "\n" if ($Opt_Output);
+
+    return 1;
+}