3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<rename_fasta_id.pl> - rename fasta IDs according to regular expressions
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl rename_fasta_id.pl -i file.fasta -p "NODE_.+$" -r "K-12_" -n -a c E<gt> out.fasta>
|
|
16
|
|
17 B<or>
|
|
18
|
|
19 C<zcat file.fasta.gz | perl rename_fasta_id.pl -i - -p "coli" -r "" -o E<gt> out.fasta>
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22
|
|
23 This script uses the built-in Perl substitution operator C<s///> to
|
|
24 replace strings in FASTA IDs. To do this, a B<pattern> and a
|
|
25 B<replacement> have to be provided (Perl regular expression syntax
|
|
26 can be used). The leading '>' character for the FASTA ID will be
|
|
27 removed before the substitution and added again afterwards. FASTA
|
|
28 IDs will be searched for matches with the B<pattern>, and if found
|
|
29 the B<pattern> will be replaced by the B<replacement>.
|
|
30
|
|
31 B<IMPORTANT>: Enclose the B<pattern> and the B<replacement> in
|
|
32 quotation marks (' or ") if they contain characters that would be
|
|
33 interpreted by the shell (e.g. pipes '|', brackets etc.).
|
|
34
|
|
35 For substitutions without any appendices in a UNIX OS you can of
|
|
36 course just use the great
|
|
37 L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see
|
|
38 C<man sed>), e.g.:
|
|
39
|
|
40 C<sed 's/^E<gt>pattern/E<gt>replacement/' file.fasta>
|
|
41
|
|
42 =head1 OPTIONS
|
|
43
|
|
44 =head2 Mandatory options
|
|
45
|
|
46 =over 20
|
|
47
|
|
48 =item B<-i>=I<str>, B<-input>=I<str>
|
|
49
|
|
50 Input FASTA file or piped STDIN (-) from a gzipped file
|
|
51
|
|
52 =item B<-p>=I<str>, B<-pattern>=I<str>
|
|
53
|
|
54 Pattern to be replaced in FASTA ID
|
|
55
|
|
56 =item B<-r>=I<str>, B<-replacement>=I<str>
|
|
57
|
|
58 Replacement to replace the pattern with. To entirely remove the
|
|
59 pattern use '' or "" as input for B<-r>.
|
|
60
|
|
61 =back
|
|
62
|
|
63 =head2 Optional options
|
|
64
|
|
65 =over 20
|
|
66
|
|
67 =item B<-h>, B<-help>
|
|
68
|
|
69 Help (perldoc POD)
|
|
70
|
|
71 =item B<-c>, B<-case-insensitive>
|
|
72
|
|
73 Match pattern case-insensitive
|
|
74
|
|
75 =item B<-g>, B<-global>
|
|
76
|
|
77 Replace pattern globally in the string
|
|
78
|
|
79 =item B<-n>, B<-numerate>
|
|
80
|
|
81 Append a numeration/the count of the pattern hits to the
|
|
82 replacement. This is e.g. useful to number contigs consecutively in
|
|
83 a draft genome.
|
|
84
|
|
85 =item B<-a>=I<str>, B<-append>=I<str>
|
|
86
|
|
87 Append a string after the numeration, e.g. 'c' for chromosome
|
|
88
|
|
89 =item B<-o>, B<-output>
|
|
90
|
|
91 Verbose output of the substitutions that were carried out, printed
|
|
92 to C<STDERR>
|
|
93
|
|
94 =item B<-v>, B<-version>
|
|
95
|
|
96 Print version number to C<STDERR>
|
|
97
|
|
98 =back
|
|
99
|
|
100 =head1 OUTPUT
|
|
101
|
|
102 =over 20
|
|
103
|
|
104 =item C<STDOUT>
|
|
105
|
|
106 The FASTA file with substituted ID lines is printed to C<STDOUT>.
|
|
107 Redirect or pipe into another tool as needed.
|
|
108
|
|
109 =back
|
|
110
|
|
111 =head1 EXAMPLES
|
|
112
|
|
113 =over
|
|
114
|
|
115 =item C<perl rename_fasta_id.pl -i file.fasta -p "T" -r "a" -c -g -o>
|
|
116
|
|
117 =back
|
|
118
|
|
119 =head1 VERSION
|
|
120
|
|
121 0.1 09-11-2014
|
|
122
|
|
123 =head1 AUTHOR
|
|
124
|
|
125 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
126
|
|
127 =head1 LICENSE
|
|
128
|
|
129 This program is free software: you can redistribute it and/or modify
|
|
130 it under the terms of the GNU General Public License as published by
|
|
131 the Free Software Foundation; either version 3 (GPLv3) of the License,
|
|
132 or (at your option) any later version.
|
|
133
|
|
134 This program is distributed in the hope that it will be useful, but
|
|
135 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
136 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
137 General Public License for more details.
|
|
138
|
|
139 You should have received a copy of the GNU General Public License
|
|
140 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
141
|
|
142 =cut
|
|
143
|
|
144
|
|
145 ########
|
|
146 # MAIN #
|
|
147 ########
|
|
148
|
|
149 use strict;
|
|
150 use warnings;
|
|
151 use autodie;
|
|
152 use Getopt::Long;
|
|
153 use Pod::Usage;
|
|
154
|
|
155 ### Get the options with Getopt::Long
|
|
156 my $Input_File; # input fasta file
|
|
157 my $Pattern; # pattern to search for in the FASTA IDs
|
|
158 my $Replacement; # regex to replace pattern with
|
|
159 my $Opt_Case; # substitute case-insensitive
|
|
160 my $Opt_Global; # substitute pattern globally in string
|
|
161 my $Opt_Numerate; # append the count of the performed substitions to each replacement regex
|
|
162 my $Append; # append an additional string after $Opt_Numerate
|
|
163 my $Opt_Output; # print substitutions to STDERR
|
|
164 my $VERSION = 0.1;
|
|
165 my ($Opt_Version, $Opt_Help);
|
|
166 GetOptions ('input=s' => \$Input_File,
|
|
167 'pattern=s' => \$Pattern,
|
|
168 'replacement=s' => \$Replacement,
|
|
169 'case-insensitive' => \$Opt_Case,
|
|
170 'global' => \$Opt_Global,
|
|
171 'numerate' => \$Opt_Numerate,
|
|
172 'append:s' => \$Append,
|
|
173 'output' => \$Opt_Output,
|
|
174 'version' => \$Opt_Version,
|
|
175 'help|?' => \$Opt_Help);
|
|
176
|
|
177
|
|
178
|
|
179 ### Run perldoc on POD
|
|
180 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
181 die "$0 $VERSION\n" if ($Opt_Version);
|
|
182 if (!$Input_File || !$Pattern) {
|
|
183 my $warning = "\n### Fatal error: Options '-i' or '-p' or their arguments are missing!\n";
|
|
184 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
185 }
|
|
186
|
|
187
|
|
188
|
|
189 ### Pipe input from STDIN or open input file
|
|
190 my $Input_Fh;
|
|
191 if ($Input_File eq '-') { # file input via STDIN
|
|
192 $Input_Fh = *STDIN; # capture typeglob of STDIN
|
|
193 } else { # input via input file
|
|
194 open ($Input_Fh, "<", "$Input_File");
|
|
195 }
|
|
196
|
|
197
|
|
198
|
|
199 ### Parse FASTA file
|
|
200 my $Substitution_Count = 0; # count substitutions
|
|
201 while (<$Input_Fh>) {
|
|
202 chomp;
|
|
203
|
|
204 # only substitute in FASTA ID lines
|
|
205 if (/^>/) {
|
|
206 # only substitute if pattern found, case-sensitive or case-INsensitive
|
|
207 if (/$Pattern/ || (/$Pattern/i && $Opt_Case)) {
|
|
208 $_ = substitute_string($_); # subroutine
|
|
209
|
|
210 # "reprint" FASTA IDs, which don't fit the pattern
|
|
211 } else {
|
|
212 print "$_\n";
|
|
213 }
|
|
214
|
|
215 # "reprint" sequence/non-ID lines of FASTA files
|
|
216 } else {
|
|
217 print "$_\n";
|
|
218 }
|
|
219 }
|
|
220 print STDERR "$Substitution_Count substitutions have been carried out\n";
|
|
221
|
|
222 exit;
|
|
223
|
|
224
|
|
225 #############
|
|
226 #Subroutines#
|
|
227 #############
|
|
228
|
|
229 ### Subroutine to rename headers/ID lines of the FASTA file
|
|
230 sub substitute_string {
|
|
231 my $string = shift;
|
|
232 $string =~ s/^>//; # get rid of '>', append afterwards
|
|
233
|
|
234 print STDERR "$string " if ($Opt_Output); # optional verbose output to STDERR
|
|
235 $Substitution_Count++; # count occurences of carried out substitutions
|
|
236
|
|
237 # substitutions
|
|
238 if ($Opt_Global && $Opt_Case) {
|
|
239 $string =~ s/$Pattern/$Replacement/gi;
|
|
240 } elsif ($Opt_Case) {
|
|
241 $string =~ s/$Pattern/$Replacement/i;
|
|
242 } elsif ($Opt_Global) {
|
|
243 $string =~ s/$Pattern/$Replacement/g;
|
|
244 } else {
|
|
245 $string =~ s/$Pattern/$Replacement/;
|
|
246 }
|
|
247
|
|
248 # output to STDOUT, optionally STDERR
|
|
249 print ">$string";
|
|
250 print STDERR "-> $string" if ($Opt_Output);
|
|
251 if ($Opt_Numerate) {
|
|
252 print "$Substitution_Count";
|
|
253 print STDERR "$Substitution_Count" if ($Opt_Output);
|
|
254 }
|
|
255
|
|
256 if ($Append) {
|
|
257 print "$Append";
|
|
258 print STDERR "$Append" if ($Opt_Output);
|
|
259 }
|
|
260
|
|
261 print "\n";
|
|
262 print STDERR "\n" if ($Opt_Output);
|
|
263
|
|
264 return 1;
|
|
265 }
|