Mercurial > repos > bgruening > text_processing
annotate find_and_replace @ 21:86755160afbf draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit c2b1677d1c94433f777c2dc28ac8eec0a99cc6a7
author | bgruening |
---|---|
date | Fri, 16 Aug 2024 10:41:54 +0000 |
parents | fb4ff3c42cd3 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env perl |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Std; | |
5 | |
6 sub parse_command_line(); | |
7 sub build_regex_string(); | |
8 sub usage(); | |
9 | |
10 my $input_file ; | |
11 my $output_file; | |
12 my $find_pattern ; | |
13 my $replace_pattern ; | |
14 my $find_complete_words ; | |
15 my $find_pattern_is_regex ; | |
16 my $find_in_specific_column ; | |
17 my $find_case_insensitive ; | |
18 my $replace_global ; | |
19 my $skip_first_line ; | |
20 | |
21 | |
22 ## | |
23 ## Program Start | |
24 ## | |
25 usage() if @ARGV<2; | |
26 parse_command_line(); | |
27 my $regex_string = build_regex_string() ; | |
28 | |
29 # Allow first line to pass without filtering? | |
30 if ( $skip_first_line ) { | |
31 my $line = <$input_file>; | |
32 print $output_file $line ; | |
33 } | |
34 | |
35 | |
36 ## | |
37 ## Main loop | |
38 ## | |
39 | |
40 ## I LOVE PERL (and hate it, at the same time...) | |
41 ## | |
42 ## So what's going on with the self-compiling perl code? | |
43 ## | |
44 ## 1. The program gets the find-pattern and the replace-pattern from the user (as strings). | |
45 ## 2. If both the find-pattern and replace-pattern are simple strings (not regex), | |
46 ## it would be possible to pre-compile a regex (with qr//) and use it in a 's///' | |
47 ## 3. If the find-pattern is a regex but the replace-pattern is a simple text string (with out back-references) | |
48 ## it is still possible to pre-compile the regex and use it in a 's///' | |
49 ## However, | |
50 ## 4. If the replace-pattern contains back-references, pre-compiling is not possible. | |
51 ## (in perl, you can't precompile a substitute regex). | |
52 ## See these examples: | |
53 ## http://www.perlmonks.org/?node_id=84420 | |
54 ## http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl | |
55 ## | |
56 ## The solution: | |
57 ## we build the regex string as valid perl code (in 'build_regex()', stored in $regex_string ), | |
58 ## Then eval() a new perl code that contains the substitution regex as inlined code. | |
59 ## Gotta love perl! | |
60 | |
61 my $perl_program ; | |
62 if ( $find_in_specific_column ) { | |
63 # Find & replace in specific column | |
64 | |
65 $perl_program = <<EOF; | |
66 while ( <STDIN> ) { | |
67 chomp ; | |
14
fb4ff3c42cd3
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit 09b22cceacb34dd4c6c1b42890f93232df128208"
bgruening
parents:
0
diff
changeset
|
68 my \@columns = split /\t/; |
0 | 69 |
70 #not enough columns in this line - skip it | |
71 next if ( \@columns < $find_in_specific_column ) ; | |
72 | |
73 \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ; | |
74 | |
75 print STDOUT join("\t", \@columns), "\n" ; | |
76 } | |
77 EOF | |
78 | |
79 } else { | |
80 # Find & replace the entire line | |
81 $perl_program = <<EOF; | |
82 while ( <STDIN> ) { | |
83 $regex_string ; | |
84 print STDOUT; | |
85 } | |
86 EOF | |
87 } | |
88 | |
89 | |
90 # The dynamic perl code reads from STDIN and writes to STDOUT, | |
91 # so connect these handles (if the user didn't specifiy input / output | |
92 # file names, these might be already be STDIN/OUT, so the whole could be a no-op). | |
93 *STDIN = $input_file ; | |
94 *STDOUT = $output_file ; | |
95 eval $perl_program ; | |
96 | |
97 | |
98 ## | |
99 ## Program end | |
100 ## | |
101 | |
102 | |
103 sub parse_command_line() | |
104 { | |
105 my %opts ; | |
106 getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n"; | |
107 | |
108 die "$0: missing Find-Pattern argument\n" if (@ARGV==0); | |
109 $find_pattern = $ARGV[0]; | |
110 die "$0: missing Replace-Pattern argument\n" if (@ARGV==1); | |
111 $replace_pattern = $ARGV[1]; | |
112 | |
113 $find_complete_words = ( exists $opts{w} ) ; | |
114 $find_case_insensitive = ( exists $opts{i} ) ; | |
115 $skip_first_line = ( exists $opts{s} ) ; | |
116 $find_pattern_is_regex = ( exists $opts{r} ) ; | |
117 $replace_global = ( exists $opts{g} ) ; | |
118 | |
119 # Search in specific column ? | |
120 if ( defined $opts{c} ) { | |
121 $find_in_specific_column = $opts{c}; | |
122 | |
123 die "$0: invalid column number ($find_in_specific_column).\n" | |
124 unless $find_in_specific_column =~ /^\d+$/ ; | |
125 | |
126 die "$0: invalid column number ($find_in_specific_column).\n" | |
127 if $find_in_specific_column <= 0; | |
128 } | |
129 else { | |
130 $find_in_specific_column = 0 ; | |
131 } | |
132 | |
133 # Output File specified (instead of STDOUT) ? | |
134 if ( defined $opts{o} ) { | |
135 my $filename = $opts{o}; | |
136 open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ; | |
137 } else { | |
138 $output_file = *STDOUT ; | |
139 } | |
140 | |
141 | |
142 # Input file Specified (instead of STDIN) ? | |
143 if ( @ARGV>2 ) { | |
144 my $filename = $ARGV[2]; | |
145 open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ; | |
146 } else { | |
147 $input_file = *STDIN; | |
148 } | |
149 } | |
150 | |
151 sub build_regex_string() | |
152 { | |
153 my $find_string ; | |
154 my $replace_string ; | |
155 | |
156 if ( $find_pattern_is_regex ) { | |
157 $find_string = $find_pattern ; | |
158 $replace_string = $replace_pattern ; | |
159 } else { | |
160 $find_string = quotemeta $find_pattern ; | |
161 $replace_string = quotemeta $replace_pattern; | |
162 } | |
163 | |
164 if ( $find_complete_words ) { | |
165 $find_string = "\\b($find_string)\\b"; | |
166 } | |
167 | |
168 my $regex_string = "s/$find_string/$replace_string/"; | |
169 | |
170 $regex_string .= "i" if ( $find_case_insensitive ); | |
171 $regex_string .= "g" if ( $replace_global ) ; | |
172 | |
173 | |
174 return $regex_string; | |
175 } | |
176 | |
177 sub usage() | |
178 { | |
179 print <<EOF; | |
180 | |
181 Find and Replace | |
182 Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu ) | |
183 | |
184 Usage: $0 [-o OUTPUT] [-g] [-r] [-w] [-i] [-c N] [-l] FIND-PATTERN REPLACE-PATTERN [INPUT-FILE] | |
185 | |
186 -g - Global replace - replace all occurences in line/column. | |
187 Default - replace just the first instance. | |
188 -w - search for complete words (not partial sub-strings). | |
189 -i - case insensitive search. | |
190 -c N - check only column N, instead of entire line (line split by whitespace). | |
191 -l - skip first line (don't replace anything in it) | |
192 -r - FIND-PATTERN and REPLACE-PATTERN are perl regular expression, | |
193 usable inside a 's///' statement. | |
194 By default, they are used as verbatim text strings. | |
195 -o OUT - specify output file (default = STDOUT). | |
196 INPUT-FILE - (optional) read from file (default = from STDIN). | |
197 | |
198 | |
199 EOF | |
200 | |
201 exit; | |
202 } |