Mercurial > repos > bgruening > text_processing
diff easyjoin @ 0:5314e5d6f040 draft
Imported from capsule None
author | bgruening |
---|---|
date | Thu, 29 Jan 2015 07:53:17 -0500 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env perl
## EASY Join -
## Join with automatic pre-sorting of both files
## Copyright (C) 2010 A. Gordon (gordon@cshl.edu)
## license: AGPLv3+
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long qw(:config bundling no_ignore_case_always);
use File::Temp qw/tempfile/;
use POSIX qw(locale_h);

sub show_help();
sub show_version();
sub show_examples();
sub parse_commandline_options();
sub sort_file($$$);
sub join_files($$);
sub cleanup_files(@);


my $PROGRAM="easyjoin";
my $VERSION="0.6.1";

my $debug=undef;
my $HEADER=undef;
my $IGNORE_CASE=undef;
my $FIELD_SEP=undef;
my $FILE1_KEY_COLUMN=1;
my $FILE2_KEY_COLUMN=1;
my @OUTPUT_SPECIFIERS=();
my $OUTPUT_FORMAT=undef;
my $EMPTY_FILLER=undef;
my $SORT_BUFFER_SIZE=undef;
my $SORT_TEMP_DIR=undef;
my $input_filename1;
my $input_filename2;

## Temporary files holding the sorted copies of the two inputs.
## They are removed by the END block below, so they are cleaned up
## even when sorting or joining dies half-way through.
my @tmp_filenames;

##
## Program Start
##

## "C" locale is critical for sorting and joining correctly.
## LC_ALL overrides LANG (and every LC_* category), so setting LANG alone
## is not enough when the caller's environment already defines LC_ALL.
$ENV{'LANG'}   = "C";
$ENV{'LC_ALL'} = "C";

parse_commandline_options();

## Create the temporary files atomically (instead of only reserving a name),
## then close the handles: 'sort --output' re-opens the files by name.
foreach my $input_number (1, 2) {
    my ($tmp_fh, $tmp_filename) = tempfile();
    push @tmp_filenames, $tmp_filename;
    close($tmp_fh)
        or die "$PROGRAM: Error: failed to close temporary file '$tmp_filename': $!\n";
}

sort_file($input_filename1, $tmp_filenames[0], $FILE1_KEY_COLUMN);
sort_file($input_filename2, $tmp_filenames[1], $FILE2_KEY_COLUMN);
my $join_exit_code = join_files($tmp_filenames[0], $tmp_filenames[1]);
exit($join_exit_code);

END {
    if (@tmp_filenames) {
        local $?;    ## keep the exit code chosen by exit()/die intact
        cleanup_files(@tmp_filenames);
    }
}

##
## Program end
##


## Prints the usage screen to STDOUT and exits with status 0.
sub show_help()
{
print<<EOF;
${PROGRAM}: Wrapper for GNU join+sort, automatically sorts files before joining them.

Usage: ${PROGRAM} [OPTIONS] [JOIN-OPTIONS] [SORT-OPTIONS] FILE1 FILE2

OPTIONS: Options specific to this program:

   --header = Both input files have a header line as the first line.
              The header line will be joined properly, without being sorted.

   --version = Print ${PROGRAM}'s version.

   --debug = Print debug messages (relating to ${PROGRAM}'s operation).

   --help = Show this help screen.

   --examples = Show usage examples.

   --all = Short-cut for:
              -a 1 -a 2 -o auto -e . -t <TAB>
           This will show all values (paired and unpaired) from both files,
           Automatically formatting the columns, and using TAB as field separator.
           You can override the empty filler (-e X) on the command line.

   --allh = Short-cut for:
              -a 1 -a 2 -o auto -e . -t <TAB> --header
            Same as above, but will also respect the header line from both input files.

JOIN-OPTIONS:
   All of GNU join options are supported.
   Run:
       join --help
   To see all possible joining options.

SORT-OPTIONS:
   The following options are supported for the intermediate sorting step:

   -S SIZE
   --buffer-size SIZE = GNU sort's --buffer-size option.

   -T DIR
   --temporary-directory DIR = GNU sort's --temporary-directory option.

   Run:
       sort --help
   To learn about these options. They might improve sorting performances for big files.

FILE1 FILE2:
   The two input files to be sorted, joined.
   Unlike GNU join, joining STDIN is not supported. Both files must be real files.


NOTE About "--header" and "--auto-format":
   The "--header" feature requires GNU coreutils version 8.6 or later.
   The "-o auto" feature requires GNU coreutils version 8.10 or later.

EOF
    exit(0);
}

## Prints the program version and license to STDOUT and exits with status 0.
sub show_version()
{
print<<EOF;
$PROGRAM $VERSION
Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)
License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)

To see the GNU's join version, run:
    join --version
EOF
    exit(0);
}

## Prints a worked usage example to STDOUT and exits with status 0.
sub show_examples()
{
print<<EOF;
Example of joining two unsorted files (each file having a header line):

\$ cat input1.txt
Fruit Color
Apple red
Banana yellow
Orange orange
Melon green

\$ cat input2.txt
Fruit Price
Orange 7
Avocado 8
Apple 4
Banana 3

\$ easyjoin -j 1 -a 1 -a 2 --header -e . -o auto input1.txt input2.txt
Fruit Color Price
Apple red 4
Avocado . 8
Banana yellow 3
Melon green .
Orange orange 7

## A short-cut for all the options above:
\$ easyjoin --allh input1.txt input2.txt
Fruit Color Price
Apple red 4
Avocado . 8
Banana yellow 3
Melon green .
Orange orange 7

EOF
    exit(0);
}

## Parses @ARGV into the file-level option variables and validates that
## exactly two existing, non-STDIN input files were given.
## Dies with a diagnostic on any invalid usage.
sub parse_commandline_options()
{
    ##
    ## Parse command line
    ##
    my $rc = GetOptions(
        ## -a / -v may be repeated; they are forwarded to join in order
        "a=i" => sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
        "e=s" => \$EMPTY_FILLER,
        "ignore-case|i" => \$IGNORE_CASE,
        ## -j sets the key column of both files at once
        "j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
        "o=s" => \$OUTPUT_FORMAT,
        "t=s" => \$FIELD_SEP,
        "v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
        "1=i" => \$FILE1_KEY_COLUMN,
        "2=i" => \$FILE2_KEY_COLUMN,
        "debug" => \$debug,
        "header" => \$HEADER,
        "help" => \&show_help,
        "version" => \&show_version,
        "examples" => \&show_examples,
        "buffer-size|S=s" => \$SORT_BUFFER_SIZE,
        "temporary-directory|T=s" => \$SORT_TEMP_DIR,
        "all" => sub {
                push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
                $FIELD_SEP = "\t";
                $OUTPUT_FORMAT = "auto";
                ## an earlier "-e X" on the command line wins over the default
                $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
            },
        "allh" => sub {
                push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
                $FIELD_SEP = "\t";
                $OUTPUT_FORMAT = "auto";
                $HEADER=1;
                $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
            },
        );
    die "$PROGRAM: invalid command-line arguments.\n" unless $rc;

    ## We need two file names to join
    my @INPUT_FILES = @ARGV;
    die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2);
    die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2);
    die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";
    die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!" unless -e $INPUT_FILES[0];
    die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!" unless -e $INPUT_FILES[1];

    $input_filename1 = $INPUT_FILES[0];
    $input_filename2 = $INPUT_FILES[1];
}

## Returns true when an option value was actually supplied, i.e. it is
## defined and not the empty string. Unlike a plain truth test this accepts
## the string "0", which is a legitimate value (e.g. join's "-o 0").
sub has_value
{
    my ($value) = @_;
    return (defined($value) && length($value)) ? 1 : 0;
}

## Runs an external command (list form, no shell) and decodes its status.
##   $name    - program name used in error messages ('sort' or 'join')
##   @command - the command and its arguments
## Returns the child's exit code.
## Dies if the command could not be executed or was killed by a signal.
sub run_child_process
{
    my ($name, @command) = @_;

    system(@command);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute '$name': $!\n";
    }

    my $signal = ($? & 127);
    if ($signal) {
        ## if the child was interrupted (CTRL-C) - just pass it on and commit suicide
        kill 2, $$ if $signal == 2;
        die "$PROGRAM: Error: '$name' child-process died with signal $signal\n";
    }

    return ($? >> 8);
}

## Sorts one input file on its join key into a temporary file.
##   $input_filename  - file to sort
##   $output_filename - where the sorted copy is written
##   $key_column      - 1-based column number used as the sort key
## Dies if the sort process cannot be run or exits with a non-zero code.
sub sort_file($$$)
{
    my ($input_filename, $output_filename, $key_column) = @_;

    my @SORT_COMMAND;
    ## NOTE(review): "./sort-header" is an external helper that is not part of
    ## this file and is looked up relative to the current working directory;
    ## presumably it sorts while keeping the header line first - confirm.
    push @SORT_COMMAND, $HEADER ? "./sort-header" : "sort" ;
    push @SORT_COMMAND, "-f" if $IGNORE_CASE;
    push @SORT_COMMAND, "-k${key_column},${key_column}" ;
    push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if has_value($SORT_BUFFER_SIZE);
    push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if has_value($SORT_TEMP_DIR);
    push @SORT_COMMAND, "--output", $output_filename;
    push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
    push @SORT_COMMAND, "-t", $FIELD_SEP if has_value($FIELD_SEP);
    push @SORT_COMMAND, $input_filename;

    if ($debug) {
        warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
        warn "$PROGRAM: Sort command line:\n";
        print STDERR Dumper(\@SORT_COMMAND), "\n";
    }

    my $sort_exit_code = run_child_process('sort', @SORT_COMMAND);
    die "$PROGRAM: Error: 'sort' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0;
}

## Runs GNU join on the two (already sorted) files; join writes its result
## to this program's STDOUT.
##   $file1, $file2 - sorted input files
## Returns join's exit code.
## Dies if join cannot be run or is killed by a signal.
sub join_files($$)
{
    my ($file1, $file2) = @_;

    my @join_command = qw/join/;
    push @join_command, "--header" if $HEADER;
    push @join_command, "--ignore-case" if $IGNORE_CASE;
    push @join_command, "-t", $FIELD_SEP if has_value($FIELD_SEP);
    push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
    push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
    push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
    ## "0" is a valid output format (print the join field only),
    ## so a plain truth test must not be used here
    push @join_command, "-o", $OUTPUT_FORMAT if has_value($OUTPUT_FORMAT);
    push @join_command, @OUTPUT_SPECIFIERS;
    push @join_command, $file1, $file2;

    if ($debug) {
        warn "$PROGRAM: Running join on '$file1' and '$file2'\n";
        warn "$PROGRAM: join command line:\n";
        print STDERR Dumper(\@join_command), "\n";
    }

    return run_child_process('join', @join_command);
}

## Deletes the given temporary files.
## In debug mode the files are kept and a notice is printed instead.
## A failed deletion only produces a warning.
sub cleanup_files(@)
{
    my (@files) = @_;

    foreach my $file (@files) {
        if ($debug) {
            warn "$PROGRAM: debug mode, not deleting temporary file '$file'\n";
        } else {
            my $count = unlink $file;
            warn "$PROGRAM: Error: failed to delete temporary file '$file': $!\n" if ($count != 1);
        }
    }
}