view COG/bac-genomics-scripts/rename_fasta_id/rename_fasta_id.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line source

#!/usr/bin/perl

#######
# POD #
#######

=pod

=head1 NAME

C<rename_fasta_id.pl> - rename fasta IDs according to regular expressions

=head1 SYNOPSIS

C<perl rename_fasta_id.pl -i file.fasta -p "NODE_.+$" -r "K-12_" -n -a c E<gt> out.fasta>

B<or>

C<zcat file.fasta.gz | perl rename_fasta_id.pl -i - -p "coli" -r "" -o E<gt> out.fasta>

=head1 DESCRIPTION

This script uses the built-in Perl substitution operator C<s///> to
replace strings in FASTA IDs. To do this, a B<pattern> and a
B<replacement> have to be provided (Perl regular expression syntax
can be used). The leading '>' character for the FASTA ID will be
removed before the substitution and added again afterwards. FASTA
IDs will be searched for matches with the B<pattern>, and if found
the B<pattern> will be replaced by the B<replacement>.

B<IMPORTANT>: Enclose the B<pattern> and the B<replacement> in
quotation marks (' or ") if they contain characters that would be
interpreted by the shell (e.g. pipes '|', brackets etc.).

For substitutions without any appendices in a UNIX OS you can of
course just use the great
L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
C<man sed>), e.g.:

C<sed 's/^E<gt>pattern/E<gt>replacement/' file.fasta>

=head1 OPTIONS

=head2 Mandatory options

=over 20

=item B<-i>=I<str>, B<-input>=I<str>

Input FASTA file or piped STDIN (-) from a gzipped file

=item B<-p>=I<str>, B<-pattern>=I<str>

Pattern to be replaced in FASTA ID

=item B<-r>=I<str>, B<-replacement>=I<str>

Replacement to replace the pattern with. To entirely remove the
pattern use '' or "" as input for B<-r>.

=back

=head2 Optional options

=over 20

=item B<-h>, B<-help>

Help (perldoc POD)

=item B<-c>, B<-case-insensitive>

Match pattern case-insensitive

=item B<-g>, B<-global>

Replace pattern globally in the string

=item B<-n>, B<-numerate>

Append a numeration/the count of the pattern hits to the
replacement. This is e.g. useful to number contigs consecutively in
a draft genome.

=item B<-a>=I<str>, B<-append>=I<str>

Append a string after the numeration, e.g. 'c' for chromosome

=item B<-o>, B<-output>

Verbose output of the substitutions that were carried out, printed
to C<STDERR>

=item B<-v>, B<-version>

Print version number to C<STDERR>

=back

=head1 OUTPUT

=over 20

=item C<STDOUT>

The FASTA file with substituted ID lines is printed to C<STDOUT>.
Redirect or pipe into another tool as needed.

=back

=head1 EXAMPLES

=over

=item C<perl rename_fasta_id.pl -i file.fasta -p "T" -r "a" -c -g -o>

=back

=head1 VERSION

 0.1                                                 09-11-2014

=head1 AUTHOR

 Andreas Leimbach                                    aleimba[at]gmx[dot]de

=head1 LICENSE

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 (GPLv3) of the License,
or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.

=cut


########
# MAIN #
########

use strict;
use warnings;
use autodie;
use Getopt::Long;
use Pod::Usage;

### Get the options with Getopt::Long
my $Input_File; # input fasta file
my $Pattern; # pattern to search for in the FASTA IDs
my $Replacement; # regex to replace pattern with
my $Opt_Case; # substitute case-insensitive
my $Opt_Global; # substitute pattern globally in string
my $Opt_Numerate; # append the count of the performed substitions to each replacement regex
my $Append; # append an additional string after $Opt_Numerate
my $Opt_Output; # print substitutions to STDERR
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);
GetOptions ('input=s' => \$Input_File,
            'pattern=s' => \$Pattern,
            'replacement=s' => \$Replacement,
            'case-insensitive' => \$Opt_Case,
            'global' => \$Opt_Global,
            'numerate' => \$Opt_Numerate,
            'append:s' => \$Append,
            'output' => \$Opt_Output,
            'version' => \$Opt_Version,
            'help|?' => \$Opt_Help);



### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);
die "$0 $VERSION\n" if ($Opt_Version);
if (!$Input_File || !$Pattern) {
    my $warning = "\n### Fatal error: Options '-i' or '-p' or their arguments are missing!\n";
    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
}



### Pipe input from STDIN or open input file
my $Input_Fh;
if ($Input_File eq '-') { # file input via STDIN
    $Input_Fh = *STDIN; # capture typeglob of STDIN
} else { # input via input file
    open ($Input_Fh, "<", "$Input_File");
}



### Parse FASTA file
my $Substitution_Count = 0; # count substitutions
while (<$Input_Fh>) {
    chomp;

    # only substitute in FASTA ID lines
    if (/^>/) {
        # only substitute if pattern found, case-sensitive or case-INsensitive
        if (/$Pattern/ || (/$Pattern/i && $Opt_Case)) {
            $_ = substitute_string($_); # subroutine

        # "reprint" FASTA IDs, which don't fit the pattern
        } else {
            print "$_\n";
        }

    # "reprint" sequence/non-ID lines of FASTA files
    } else {
        print "$_\n";
    }
}
print STDERR "$Substitution_Count substitutions have been carried out\n";

exit;


#############
#Subroutines#
#############

### Subroutine to rename headers/ID lines of the FASTA file
sub substitute_string {
    my $string = shift;
    $string =~ s/^>//; # get rid of '>', append afterwards

    print STDERR "$string " if ($Opt_Output); # optional verbose output to STDERR
    $Substitution_Count++; # count occurences of carried out substitutions

    # substitutions
    if ($Opt_Global && $Opt_Case) {
        $string =~ s/$Pattern/$Replacement/gi;
    } elsif ($Opt_Case) {
        $string =~ s/$Pattern/$Replacement/i;
    } elsif ($Opt_Global) {
        $string =~ s/$Pattern/$Replacement/g;
    } else {
        $string =~ s/$Pattern/$Replacement/;
    }

    # output to STDOUT, optionally STDERR
    print ">$string";
    print STDERR "-> $string" if ($Opt_Output);
    if ($Opt_Numerate) {
        print "$Substitution_Count";
        print STDERR "$Substitution_Count" if ($Opt_Output);
    }

    if ($Append) {
        print "$Append";
        print STDERR "$Append" if ($Opt_Output);
    }

    print "\n";
    print STDERR "\n" if ($Opt_Output);

    return 1;
}