Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/rename_fasta_id/rename_fasta_id.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 C<rename_fasta_id.pl> - rename fasta IDs according to regular expressions | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl rename_fasta_id.pl -i file.fasta -p "NODE_.+$" -r "K-12_" -n -a c E<gt> out.fasta> | |
16 | |
17 B<or> | |
18 | |
19 C<zcat file.fasta.gz | perl rename_fasta_id.pl -i - -p "coli" -r "" -o E<gt> out.fasta> | |
20 | |
21 =head1 DESCRIPTION | |
22 | |
23 This script uses the built-in Perl substitution operator C<s///> to | |
24 replace strings in FASTA IDs. To do this, a B<pattern> and a | |
25 B<replacement> have to be provided (Perl regular expression syntax | |
26 can be used). The leading '>' character for the FASTA ID will be | |
27 removed before the substitution and added again afterwards. FASTA | |
28 IDs will be searched for matches with the B<pattern>, and if found | |
29 the B<pattern> will be replaced by the B<replacement>. | |
30 | |
31 B<IMPORTANT>: Enclose the B<pattern> and the B<replacement> in | |
32 quotation marks (' or ") if they contain characters that would be | |
33 interpreted by the shell (e.g. pipes '|', brackets etc.). | |
34 | |
35 For substitutions without any appendices in a UNIX OS you can of | |
36 course just use the great | |
37 L<C<sed>|https://www.gnu.org/software/sed/manual/sed.html> (see | |
38 C<man sed>), e.g.: | |
39 | |
40 C<sed 's/^E<gt>pattern/E<gt>replacement/' file.fasta> | |
41 | |
42 =head1 OPTIONS | |
43 | |
44 =head2 Mandatory options | |
45 | |
46 =over 20 | |
47 | |
48 =item B<-i>=I<str>, B<-input>=I<str> | |
49 | |
50 Input FASTA file or piped STDIN (-) from a gzipped file | |
51 | |
52 =item B<-p>=I<str>, B<-pattern>=I<str> | |
53 | |
54 Pattern to be replaced in FASTA ID | |
55 | |
56 =item B<-r>=I<str>, B<-replacement>=I<str> | |
57 | |
58 Replacement to replace the pattern with. To entirely remove the | |
59 pattern use '' or "" as input for B<-r>. | |
60 | |
61 =back | |
62 | |
63 =head2 Optional options | |
64 | |
65 =over 20 | |
66 | |
67 =item B<-h>, B<-help> | |
68 | |
69 Help (perldoc POD) | |
70 | |
71 =item B<-c>, B<-case-insensitive> | |
72 | |
73 Match pattern case-insensitive | |
74 | |
75 =item B<-g>, B<-global> | |
76 | |
77 Replace pattern globally in the string | |
78 | |
79 =item B<-n>, B<-numerate> | |
80 | |
81 Append a numeration/the count of the pattern hits to the | |
82 replacement. This is e.g. useful to number contigs consecutively in | |
83 a draft genome. | |
84 | |
85 =item B<-a>=I<str>, B<-append>=I<str> | |
86 | |
87 Append a string after the numeration, e.g. 'c' for chromosome | |
88 | |
89 =item B<-o>, B<-output> | |
90 | |
91 Verbose output of the substitutions that were carried out, printed | |
92 to C<STDERR> | |
93 | |
94 =item B<-v>, B<-version> | |
95 | |
96 Print version number to C<STDERR> | |
97 | |
98 =back | |
99 | |
100 =head1 OUTPUT | |
101 | |
102 =over 20 | |
103 | |
104 =item C<STDOUT> | |
105 | |
106 The FASTA file with substituted ID lines is printed to C<STDOUT>. | |
107 Redirect or pipe into another tool as needed. | |
108 | |
109 =back | |
110 | |
111 =head1 EXAMPLES | |
112 | |
113 =over | |
114 | |
115 =item C<perl rename_fasta_id.pl -i file.fasta -p "T" -r "a" -c -g -o> | |
116 | |
117 =back | |
118 | |
119 =head1 VERSION | |
120 | |
121 0.1 09-11-2014 | |
122 | |
123 =head1 AUTHOR | |
124 | |
125 Andreas Leimbach aleimba[at]gmx[dot]de | |
126 | |
127 =head1 LICENSE | |
128 | |
129 This program is free software: you can redistribute it and/or modify | |
130 it under the terms of the GNU General Public License as published by | |
131 the Free Software Foundation; either version 3 (GPLv3) of the License, | |
132 or (at your option) any later version. | |
133 | |
134 This program is distributed in the hope that it will be useful, but | |
135 WITHOUT ANY WARRANTY; without even the implied warranty of | |
136 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
137 General Public License for more details. | |
138 | |
139 You should have received a copy of the GNU General Public License | |
140 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
141 | |
142 =cut | |
143 | |
144 | |
145 ######## | |
146 # MAIN # | |
147 ######## | |
148 | |
149 use strict; | |
150 use warnings; | |
151 use autodie; | |
152 use Getopt::Long; | |
153 use Pod::Usage; | |
154 | |
155 ### Get the options with Getopt::Long | |
156 my $Input_File; # input fasta file | |
157 my $Pattern; # pattern to search for in the FASTA IDs | |
158 my $Replacement; # regex to replace pattern with | |
159 my $Opt_Case; # substitute case-insensitive | |
160 my $Opt_Global; # substitute pattern globally in string | |
161 my $Opt_Numerate; # append the count of the performed substitions to each replacement regex | |
162 my $Append; # append an additional string after $Opt_Numerate | |
163 my $Opt_Output; # print substitutions to STDERR | |
164 my $VERSION = 0.1; | |
165 my ($Opt_Version, $Opt_Help); | |
166 GetOptions ('input=s' => \$Input_File, | |
167 'pattern=s' => \$Pattern, | |
168 'replacement=s' => \$Replacement, | |
169 'case-insensitive' => \$Opt_Case, | |
170 'global' => \$Opt_Global, | |
171 'numerate' => \$Opt_Numerate, | |
172 'append:s' => \$Append, | |
173 'output' => \$Opt_Output, | |
174 'version' => \$Opt_Version, | |
175 'help|?' => \$Opt_Help); | |
176 | |
177 | |
178 | |
179 ### Run perldoc on POD | |
180 pod2usage(-verbose => 2) if ($Opt_Help); | |
181 die "$0 $VERSION\n" if ($Opt_Version); | |
182 if (!$Input_File || !$Pattern) { | |
183 my $warning = "\n### Fatal error: Options '-i' or '-p' or their arguments are missing!\n"; | |
184 pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | |
185 } | |
186 | |
187 | |
188 | |
189 ### Pipe input from STDIN or open input file | |
190 my $Input_Fh; | |
191 if ($Input_File eq '-') { # file input via STDIN | |
192 $Input_Fh = *STDIN; # capture typeglob of STDIN | |
193 } else { # input via input file | |
194 open ($Input_Fh, "<", "$Input_File"); | |
195 } | |
196 | |
197 | |
198 | |
199 ### Parse FASTA file | |
200 my $Substitution_Count = 0; # count substitutions | |
201 while (<$Input_Fh>) { | |
202 chomp; | |
203 | |
204 # only substitute in FASTA ID lines | |
205 if (/^>/) { | |
206 # only substitute if pattern found, case-sensitive or case-INsensitive | |
207 if (/$Pattern/ || (/$Pattern/i && $Opt_Case)) { | |
208 $_ = substitute_string($_); # subroutine | |
209 | |
210 # "reprint" FASTA IDs, which don't fit the pattern | |
211 } else { | |
212 print "$_\n"; | |
213 } | |
214 | |
215 # "reprint" sequence/non-ID lines of FASTA files | |
216 } else { | |
217 print "$_\n"; | |
218 } | |
219 } | |
220 print STDERR "$Substitution_Count substitutions have been carried out\n"; | |
221 | |
222 exit; | |
223 | |
224 | |
225 ############# | |
226 #Subroutines# | |
227 ############# | |
228 | |
229 ### Subroutine to rename headers/ID lines of the FASTA file | |
230 sub substitute_string { | |
231 my $string = shift; | |
232 $string =~ s/^>//; # get rid of '>', append afterwards | |
233 | |
234 print STDERR "$string " if ($Opt_Output); # optional verbose output to STDERR | |
235 $Substitution_Count++; # count occurences of carried out substitutions | |
236 | |
237 # substitutions | |
238 if ($Opt_Global && $Opt_Case) { | |
239 $string =~ s/$Pattern/$Replacement/gi; | |
240 } elsif ($Opt_Case) { | |
241 $string =~ s/$Pattern/$Replacement/i; | |
242 } elsif ($Opt_Global) { | |
243 $string =~ s/$Pattern/$Replacement/g; | |
244 } else { | |
245 $string =~ s/$Pattern/$Replacement/; | |
246 } | |
247 | |
248 # output to STDOUT, optionally STDERR | |
249 print ">$string"; | |
250 print STDERR "-> $string" if ($Opt_Output); | |
251 if ($Opt_Numerate) { | |
252 print "$Substitution_Count"; | |
253 print STDERR "$Substitution_Count" if ($Opt_Output); | |
254 } | |
255 | |
256 if ($Append) { | |
257 print "$Append"; | |
258 print STDERR "$Append" if ($Opt_Output); | |
259 } | |
260 | |
261 print "\n"; | |
262 print STDERR "\n" if ($Opt_Output); | |
263 | |
264 return 1; | |
265 } |