comparison COG/bac-genomics-scripts/rename_fasta_id/rename_fasta_id.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 #######
4 # POD #
5 #######
6
7 =pod
8
9 =head1 NAME
10
11 C<rename_fasta_id.pl> - rename fasta IDs according to regular expressions
12
13 =head1 SYNOPSIS
14
15 C<perl rename_fasta_id.pl -i file.fasta -p "NODE_.+$" -r "K-12_" -n -a c E<gt> out.fasta>
16
17 B<or>
18
19 C<zcat file.fasta.gz | perl rename_fasta_id.pl -i - -p "coli" -r "" -o E<gt> out.fasta>
20
21 =head1 DESCRIPTION
22
23 This script uses the built-in Perl substitution operator C<s///> to
24 replace strings in FASTA IDs. To do this, a B<pattern> and a
25 B<replacement> have to be provided (Perl regular expression syntax
26 can be used). The leading '>' character for the FASTA ID will be
27 removed before the substitution and added again afterwards. FASTA
28 IDs will be searched for matches with the B<pattern>, and if found
29 the B<pattern> will be replaced by the B<replacement>.
30
31 B<IMPORTANT>: Enclose the B<pattern> and the B<replacement> in
32 quotation marks (' or ") if they contain characters that would be
33 interpreted by the shell (e.g. pipes '|', brackets etc.).
34
35 For substitutions without any appendices in a UNIX OS you can of
36 course just use the great
37 L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
38 C<man sed>), e.g.:
39
40 C<sed 's/^E<gt>pattern/E<gt>replacement/' file.fasta>
41
42 =head1 OPTIONS
43
44 =head2 Mandatory options
45
46 =over 20
47
48 =item B<-i>=I<str>, B<-input>=I<str>
49
50 Input FASTA file, or '-' to read from piped STDIN (e.g. a gzipped file decompressed with C<zcat>)
51
52 =item B<-p>=I<str>, B<-pattern>=I<str>
53
54 Pattern to be replaced in FASTA ID
55
56 =item B<-r>=I<str>, B<-replacement>=I<str>
57
58 Replacement to replace the pattern with. To entirely remove the
59 pattern use '' or "" as input for B<-r>.
60
61 =back
62
63 =head2 Optional options
64
65 =over 20
66
67 =item B<-h>, B<-help>
68
69 Help (perldoc POD)
70
71 =item B<-c>, B<-case-insensitive>
72
73 Match the pattern case-insensitively
74
75 =item B<-g>, B<-global>
76
77 Replace pattern globally in the string
78
79 =item B<-n>, B<-numerate>
80
81 Append a numeration (the running count of FASTA IDs that matched the
82 pattern) to the end of the renamed ID. This is useful, e.g., to number
83 contigs consecutively in a draft genome.
84
85 =item B<-a>=I<str>, B<-append>=I<str>
86
87 Append a string after the numeration, e.g. 'c' for chromosome
88
89 =item B<-o>, B<-output>
90
91 Verbose output of the substitutions that were carried out, printed
92 to C<STDERR>
93
94 =item B<-v>, B<-version>
95
96 Print version number to C<STDERR>
97
98 =back
99
100 =head1 OUTPUT
101
102 =over 20
103
104 =item C<STDOUT>
105
106 The FASTA file with substituted ID lines is printed to C<STDOUT>.
107 Redirect or pipe into another tool as needed.
108
109 =back
110
111 =head1 EXAMPLES
112
113 =over
114
115 =item C<perl rename_fasta_id.pl -i file.fasta -p "T" -r "a" -c -g -o>
116
117 =back
118
119 =head1 VERSION
120
121 0.1 09-11-2014
122
123 =head1 AUTHOR
124
125 Andreas Leimbach aleimba[at]gmx[dot]de
126
127 =head1 LICENSE
128
129 This program is free software: you can redistribute it and/or modify
130 it under the terms of the GNU General Public License as published by
131 the Free Software Foundation; either version 3 (GPLv3) of the License,
132 or (at your option) any later version.
133
134 This program is distributed in the hope that it will be useful, but
135 WITHOUT ANY WARRANTY; without even the implied warranty of
136 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
137 General Public License for more details.
138
139 You should have received a copy of the GNU General Public License
140 along with this program. If not, see L<http://www.gnu.org/licenses/>.
141
142 =cut
143
144
145 ########
146 # MAIN #
147 ########
148
149 use strict;
150 use warnings;
151 use autodie;
152 use Getopt::Long;
153 use Pod::Usage;
154
### Get the options with Getopt::Long
my $Input_File;   # input FASTA file, or '-' to read from STDIN
my $Pattern;      # pattern (Perl regex) to search for in the FASTA IDs
my $Replacement;  # string to replace the matched pattern with
my $Opt_Case;     # substitute case-insensitively
my $Opt_Global;   # substitute pattern globally in the ID
my $Opt_Numerate; # append the count of the performed substitutions to each renamed ID
my $Append;       # append an additional string after the numeration
my $Opt_Output;   # print substitutions to STDERR
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);

# GetOptions returns false when it met an unknown or malformed option; stop
# with the usage message instead of silently continuing with partial options.
GetOptions ('input=s' => \$Input_File,
            'pattern=s' => \$Pattern,
            'replacement=s' => \$Replacement,
            'case-insensitive' => \$Opt_Case,
            'global' => \$Opt_Global,
            'numerate' => \$Opt_Numerate,
            'append:s' => \$Append,
            'output' => \$Opt_Output,
            'version' => \$Opt_Version,
            'help|?' => \$Opt_Help)
    or pod2usage(-verbose => 1,
                 -message => "\n### Fatal error: Invalid command line options!\n",
                 -exitval => 2);



### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);
die "$0 $VERSION\n" if ($Opt_Version);

# '-i', '-p' and '-r' are all mandatory. Test with defined/length rather than
# plain truth, so that '0' is accepted for '-i'/'-p' and the empty string is
# accepted for '-r' (which means "remove the pattern"). An empty pattern is
# still rejected.
if (!defined $Input_File || !length $Input_File
    || !defined $Pattern || !length $Pattern
    || !defined $Replacement) {
    my $warning = "\n### Fatal error: Options '-i', '-p' or '-r' or their arguments are missing!\n";
    pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
}
186
187
188
### Select the input source: the named file, or STDIN when '-' was given
my $Input_Fh;
if ($Input_File ne '-') { # regular input file
    # no explicit error check here: the file loads autodie
    open ($Input_Fh, '<', $Input_File);
}
else { # input is piped in via STDIN
    $Input_Fh = *STDIN; # capture typeglob of STDIN
}
196
197
198
### Parse FASTA file
# Reads the input line by line. Sequence/non-ID lines and ID lines that do
# not match the pattern are printed unchanged to STDOUT; matching ID lines
# are handed to substitute_string(), which prints the renamed ID itself.
my $Substitution_Count = 0; # count substitutions
while (my $line = <$Input_Fh>) {
    chomp $line;

    # "reprint" sequence/non-ID lines of FASTA files
    if ($line !~ /^>/) {
        print "$line\n";
        next;
    }

    # Test the pattern against the ID WITHOUT its leading '>', because that
    # is the string substitute_string() performs the substitution on.
    # Testing the raw line would make patterns anchored with '^' never match
    # and would count a substitution for patterns that only match the '>'.
    my $id = substr($line, 1);
    my $is_hit = $Opt_Case ? ($id =~ /$Pattern/i) : ($id =~ /$Pattern/);

    if ($is_hit) {
        substitute_string($line); # subroutine, expects the line incl. '>'
    }
    else {
        # "reprint" FASTA IDs, which don't fit the pattern
        print "$line\n";
    }
}
print STDERR "$Substitution_Count substitutions have been carried out\n";

exit;
223
224
225 #############
226 #Subroutines#
227 #############
228
### Subroutine to rename one header/ID line of the FASTA file.
### Argument: the ID line (a leading '>' is stripped before substituting).
### Uses the file-level option variables ($Pattern, $Replacement, $Opt_Case,
### $Opt_Global, $Opt_Numerate, $Append, $Opt_Output) and increments
### $Substitution_Count once per call.
### Prints '>' + renamed ID [+ count] [+ append string] to STDOUT and, with
### verbose output enabled, 'old ID -> new ID' to STDERR. Always returns 1.
sub substitute_string {
    my $string = shift;
    $string =~ s/^>//; # get rid of '>', added again on output

    my $original = $string; # kept for the optional verbose report
    $Substitution_Count++; # count occurrences of carried out substitutions

    # An undefined replacement is treated as the empty string (i.e. the
    # pattern is removed) instead of raising "uninitialized" warnings.
    my $replacement = defined $Replacement ? $Replacement : '';

    # Compile the pattern once with the requested case sensitivity; only the
    # /g modifier has to be chosen at the substitution itself.
    my $regex = $Opt_Case ? qr/$Pattern/i : qr/$Pattern/;
    if ($Opt_Global) {
        $string =~ s/$regex/$replacement/g;
    }
    else {
        $string =~ s/$regex/$replacement/;
    }

    # Numeration and the append string go to the END of the renamed ID.
    my $renamed = $string;
    $renamed .= $Substitution_Count if ($Opt_Numerate);
    # defined/length instead of truth, so that e.g. '-a 0' is honoured
    $renamed .= $Append if (defined $Append && length $Append);

    # output to STDOUT, optionally STDERR
    print ">$renamed\n";
    print STDERR "$original -> $renamed\n" if ($Opt_Output);

    return 1;
}