Mercurial > repos > bgruening > text_processing
view find_and_replace @ 7:01ca80da2266 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit 4379e712f76f2bb12ee2cc270dd8a0e806df2cd6
author | bgruening |
---|---|
date | Mon, 22 May 2017 07:41:58 -0400 |
parents | 5314e5d6f040 |
children | fb4ff3c42cd3 |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; use Getopt::Std; sub parse_command_line(); sub build_regex_string(); sub usage(); my $input_file ; my $output_file; my $find_pattern ; my $replace_pattern ; my $find_complete_words ; my $find_pattern_is_regex ; my $find_in_specific_column ; my $find_case_insensitive ; my $replace_global ; my $skip_first_line ; ## ## Program Start ## usage() if @ARGV<2; parse_command_line(); my $regex_string = build_regex_string() ; # Allow first line to pass without filtering? if ( $skip_first_line ) { my $line = <$input_file>; print $output_file $line ; } ## ## Main loop ## ## I LOVE PERL (and hate it, at the same time...) ## ## So what's going on with the self-compiling perl code? ## ## 1. The program gets the find-pattern and the replace-pattern from the user (as strings). ## 2. If both the find-pattern and replace-pattern are simple strings (not regex), ## it would be possible to pre-compile a regex (with qr//) and use it in a 's///' ## 3. If the find-pattern is a regex but the replace-pattern is a simple text string (with out back-references) ## it is still possible to pre-compile the regex and use it in a 's///' ## However, ## 4. If the replace-pattern contains back-references, pre-compiling is not possible. ## (in perl, you can't precompile a substitute regex). ## See these examples: ## http://www.perlmonks.org/?node_id=84420 ## http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl ## ## The solution: ## we build the regex string as valid perl code (in 'build_regex()', stored in $regex_string ), ## Then eval() a new perl code that contains the substitution regex as inlined code. ## Gotta love perl! my $perl_program ; if ( $find_in_specific_column ) { # Find & replace in specific column $perl_program = <<EOF; while ( <STDIN> ) { chomp ; my \@columns = split ; #not enough columns in this line - skip it next if ( \@columns < $find_in_specific_column ) ; \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ; print STDOUT join("\t", \@columns), "\n" ; } EOF } else { # Find & replace the entire line $perl_program = <<EOF; while ( <STDIN> ) { $regex_string ; print STDOUT; } EOF } # The dynamic perl code reads from STDIN and writes to STDOUT, # so connect these handles (if the user didn't specifiy input / output # file names, these might be already be STDIN/OUT, so the whole could be a no-op). *STDIN = $input_file ; *STDOUT = $output_file ; eval $perl_program ; ## ## Program end ## sub parse_command_line() { my %opts ; getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n"; die "$0: missing Find-Pattern argument\n" if (@ARGV==0); $find_pattern = $ARGV[0]; die "$0: missing Replace-Pattern argument\n" if (@ARGV==1); $replace_pattern = $ARGV[1]; $find_complete_words = ( exists $opts{w} ) ; $find_case_insensitive = ( exists $opts{i} ) ; $skip_first_line = ( exists $opts{s} ) ; $find_pattern_is_regex = ( exists $opts{r} ) ; $replace_global = ( exists $opts{g} ) ; # Search in specific column ? if ( defined $opts{c} ) { $find_in_specific_column = $opts{c}; die "$0: invalid column number ($find_in_specific_column).\n" unless $find_in_specific_column =~ /^\d+$/ ; die "$0: invalid column number ($find_in_specific_column).\n" if $find_in_specific_column <= 0; } else { $find_in_specific_column = 0 ; } # Output File specified (instead of STDOUT) ? if ( defined $opts{o} ) { my $filename = $opts{o}; open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ; } else { $output_file = *STDOUT ; } # Input file Specified (instead of STDIN) ? if ( @ARGV>2 ) { my $filename = $ARGV[2]; open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ; } else { $input_file = *STDIN; } } sub build_regex_string() { my $find_string ; my $replace_string ; if ( $find_pattern_is_regex ) { $find_string = $find_pattern ; $replace_string = $replace_pattern ; } else { $find_string = quotemeta $find_pattern ; $replace_string = quotemeta $replace_pattern; } if ( $find_complete_words ) { $find_string = "\\b($find_string)\\b"; } my $regex_string = "s/$find_string/$replace_string/"; $regex_string .= "i" if ( $find_case_insensitive ); $regex_string .= "g" if ( $replace_global ) ; return $regex_string; } sub usage() { print <<EOF; Find and Replace Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu ) Usage: $0 [-o OUTPUT] [-g] [-r] [-w] [-i] [-c N] [-l] FIND-PATTERN REPLACE-PATTERN [INPUT-FILE] -g - Global replace - replace all occurences in line/column. Default - replace just the first instance. -w - search for complete words (not partial sub-strings). -i - case insensitive search. -c N - check only column N, instead of entire line (line split by whitespace). -l - skip first line (don't replace anything in it) -r - FIND-PATTERN and REPLACE-PATTERN are perl regular expression, usable inside a 's///' statement. By default, they are used as verbatim text strings. -o OUT - specify output file (default = STDOUT). INPUT-FILE - (optional) read from file (default = from STDIN). EOF exit; }