annotate find_and_replace @ 7:01ca80da2266 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit 4379e712f76f2bb12ee2cc270dd8a0e806df2cd6
author bgruening
date Mon, 22 May 2017 07:41:58 -0400
parents 5314e5d6f040
children fb4ff3c42cd3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
1 #!/usr/bin/env perl
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
2 use strict;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
3 use warnings;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
4 use Getopt::Std;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
5
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
6 sub parse_command_line();
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
7 sub build_regex_string();
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
8 sub usage();
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
9
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
10 my $input_file ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
11 my $output_file;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
12 my $find_pattern ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
13 my $replace_pattern ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
14 my $find_complete_words ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
15 my $find_pattern_is_regex ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
16 my $find_in_specific_column ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
17 my $find_case_insensitive ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
18 my $replace_global ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
19 my $skip_first_line ;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
20
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
21
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Program Start
##
usage() if @ARGV < 2;
parse_command_line();
my $regex_string = build_regex_string();

# Allow the first line to pass without filtering?
# Reading from an empty input returns undef, so only echo a line that was
# actually read (printing undef would emit an "uninitialized value" warning).
if ($skip_first_line) {
    my $header_line = <$input_file>;
    print {$output_file} $header_line if defined $header_line;
}


##
## Main loop
##

## I LOVE PERL (and hate it, at the same time...)
##
## So what's going on with the self-compiling perl code?
##
## 1. The program gets the find-pattern and the replace-pattern from the user (as strings).
## 2. If both the find-pattern and replace-pattern are simple strings (not regex),
##    it would be possible to pre-compile a regex (with qr//) and use it in a 's///'
## 3. If the find-pattern is a regex but the replace-pattern is a simple text string
##    (without back-references) it is still possible to pre-compile the regex and
##    use it in a 's///'
## However,
## 4. If the replace-pattern contains back-references, pre-compiling is not possible.
##    (in perl, you can't precompile a substitute regex).
##    See these examples:
##    http://www.perlmonks.org/?node_id=84420
##    http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl
##
## The solution:
##   we build the regex string as valid perl code (in 'build_regex_string()',
##   stored in $regex_string ),
##   Then eval() a new perl code that contains the substitution regex as inlined code.
##   Gotta love perl!
##
## NOTE(security): $regex_string is derived from command-line arguments and is
## executed through string eval below. Only run this tool with patterns that
## come from a trusted source.

my $perl_program;
if ($find_in_specific_column) {
    # Find & replace in a specific column.
    # The generated loop splits each line on whitespace, skips (drops) lines
    # that have fewer than the requested number of columns, applies the
    # substitution to that column only and re-joins the columns with tabs.
    $perl_program = <<EOF;
while ( <STDIN> ) {
    chomp ;
    my \@columns = split ;

    #not enough columns in this line - skip it
    next if ( \@columns < $find_in_specific_column ) ;

    \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ;

    print STDOUT join("\t", \@columns), "\n" ;
}
EOF

} else {
    # Find & replace in the entire line.
    $perl_program = <<EOF;
while ( <STDIN> ) {
    $regex_string ;
    print STDOUT;
}
EOF
}


# The dynamic perl code reads from STDIN and writes to STDOUT,
# so connect these handles (if the user didn't specify input / output
# file names, these might already be STDIN/OUT, so the whole thing could be a no-op).
*STDIN  = $input_file;
*STDOUT = $output_file;
eval $perl_program;

# A malformed pattern makes the generated code fail to compile; without this
# check the program would silently write nothing and still exit successfully.
die "$0: find-and-replace failed: $@" if $@;

# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close(STDOUT) or die "$0: Failed to write output: $!\n";
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
96
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
97
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
98 ##
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
99 ## Program end
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
100 ##
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
101
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
102
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
# Parse @ARGV and populate the file-level option variables.
#
# Options (via Getopt::Std): -g -r -s -i -w, -c N, -o FILE.
# Positional arguments remaining after option parsing:
#   $ARGV[0]  find pattern     (required)
#   $ARGV[1]  replace pattern  (required)
#   $ARGV[2]  input file       (optional; otherwise STDIN is used)
#
# Sets $find_pattern, $replace_pattern, the boolean flags,
# $find_in_specific_column (0 when -c is not given), $input_file and
# $output_file. Dies with a message on an invalid option, a missing pattern,
# an invalid column number, or a file that cannot be opened.
sub parse_command_line()
{
    my %opts;
    getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n";

    die "$0: missing Find-Pattern argument\n" if (@ARGV==0);
    $find_pattern = $ARGV[0];
    die "$0: missing Replace-Pattern argument\n" if (@ARGV==1);
    $replace_pattern = $ARGV[1];

    $find_complete_words   = ( exists $opts{w} );
    $find_case_insensitive = ( exists $opts{i} );
    $skip_first_line       = ( exists $opts{s} );
    $find_pattern_is_regex = ( exists $opts{r} );
    $replace_global        = ( exists $opts{g} );

    # Search in specific column ?
    if ( defined $opts{c} ) {
        $find_in_specific_column = $opts{c};

        # The value is later interpolated into generated perl code, so it must
        # be digits only; \A and \z (unlike ^ and $) also reject a trailing newline.
        die "$0: invalid column number ($find_in_specific_column).\n"
            unless $find_in_specific_column =~ /\A\d+\z/;

        die "$0: invalid column number ($find_in_specific_column).\n"
            if $find_in_specific_column <= 0;
    }
    else {
        $find_in_specific_column = 0;
    }

    # Output File specified (instead of STDOUT) ?
    if ( defined $opts{o} ) {
        my $filename = $opts{o};
        # Three-argument open: the file name is never parsed for a mode or
        # pipe character and surrounding whitespace is preserved.
        open $output_file, '>', $filename
            or die "$0: Failed to create output file '$filename': $!\n";
    } else {
        $output_file = *STDOUT;
    }

    # Input file Specified (instead of STDIN) ?
    if ( @ARGV>2 ) {
        my $filename = $ARGV[2];
        open $input_file, '<', $filename
            or die "$0: Failed to open input file '$filename': $!\n";
    } else {
        $input_file = *STDIN;
    }
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
150
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
# Build the perl substitution statement ("s/FIND/REPLACE/flags") as a string,
# from the file-level option variables set by parse_command_line().
#
# - Plain-text mode (default): both patterns are passed through quotemeta so
#   they are matched / inserted literally.
# - Regex mode (-r): both patterns are used as given, except that unescaped
#   '/' characters are escaped so they cannot terminate the s/// early.
# - Whole-word mode (-w): the find pattern is wrapped in \b(...)\b.
#   Note that this wrapper is a capturing group, so in regex mode the user's
#   own groups start at $2 when -w is in effect.
# - "i" is appended for case-insensitive search, "g" for global replace.
#
# Returns the statement as a string, ready to be embedded in eval'd code.
sub build_regex_string()
{
    my $find_string;
    my $replace_string;

    if ( $find_pattern_is_regex ) {
        $find_string    = _escape_unescaped_slashes($find_pattern);
        $replace_string = _escape_unescaped_slashes($replace_pattern);
    } else {
        $find_string    = quotemeta $find_pattern;
        $replace_string = quotemeta $replace_pattern;
    }

    if ( $find_complete_words ) {
        $find_string = "\\b($find_string)\\b";
    }

    my $regex_string = "s/$find_string/$replace_string/";

    $regex_string .= "i" if ( $find_case_insensitive );
    $regex_string .= "g" if ( $replace_global );

    return $regex_string;
}

# Escape every '/' that is not already preceded by an escaping backslash.
# Backslash escapes are consumed as two-character units, so "\/" is left
# untouched while the '/' in "\\/" (escaped backslash, then slash) is escaped.
sub _escape_unescaped_slashes
{
    my ($text) = @_;
    $text =~ s{(\\.)|/}{ defined $1 ? $1 : '\\/' }gse;
    return $text;
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
176
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
# Print the command-line help text to STDOUT and terminate the program.
# The "skip first line" switch is documented as -s, which is the letter
# the option parser (getopts 'grsiwc:o:') actually accepts.
sub usage()
{
    print <<EOF;

Find and Replace
Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )

Usage: $0 [-o OUTPUT] [-g] [-r] [-w] [-i] [-c N] [-s] FIND-PATTERN REPLACE-PATTERN [INPUT-FILE]

   -g   - Global replace - replace all occurences in line/column.
          Default - replace just the first instance.
   -w   - search for complete words (not partial sub-strings).
   -i   - case insensitive search.
   -c N - check only column N, instead of entire line (line split by whitespace).
   -s   - skip first line (don't replace anything in it)
   -r   - FIND-PATTERN and REPLACE-PATTERN are perl regular expression,
          usable inside a 's///' statement.
          By default, they are used as verbatim text strings.
   -o OUT - specify output file (default = STDOUT).
   INPUT-FILE - (optional) read from file (default = from STDIN).


EOF

    exit;
}