| 0 | 1 #!/usr/bin/env perl | 
|  | 2 ## EASY Join - | 
|  | 3 ## Join with automatic pre-sorting of both files | 
|  | 4 ## Copyright (C) 2010 A. Gordon (gordon@cshl.edu) | 
|  | 5 ## license: AGPLv3+ | 
|  | 6 use strict; | 
|  | 7 use warnings; | 
|  | 8 use Data::Dumper; | 
|  | 9 use Getopt::Long qw(:config bundling no_ignore_case_always); | 
|  | 10 use File::Temp qw/tempfile/; | 
|  | 11 use POSIX qw(locale_h); | 
|  | 12 | 
## Forward declarations (with prototypes) of the subroutines defined
## further down, so the main program section can call them.
sub show_help();
sub show_version();
sub show_examples();
sub parse_commandline_options();
sub sort_file($$$);
sub join_files($$);
sub cleanup_files(@);


## Program identity, used in every message and by --version
my $PROGRAM = 'easyjoin';
my $VERSION = '0.6.1';

## Settings filled in by parse_commandline_options().
## Anything left undefined is simply not passed on to sort/join.
my $debug;                  # --debug: trace commands, keep temporary files
my $HEADER;                 # --header / --allh: first line is a header
my $IGNORE_CASE;            # -i / --ignore-case
my $FIELD_SEP;              # -t: field separator
my ($FILE1_KEY_COLUMN, $FILE2_KEY_COLUMN) = (1, 1);    # -1 / -2 / -j
my @OUTPUT_SPECIFIERS;      # accumulated "-a N" / "-v N" arguments for join
my $OUTPUT_FORMAT;          # -o
my $EMPTY_FILLER;           # -e
my $SORT_BUFFER_SIZE;       # -S / --buffer-size (sort step only)
my $SORT_TEMP_DIR;          # -T / --temporary-directory (sort step only)

## The two input file names (the non-option command-line arguments)
my $input_filename1;
my $input_filename2;
|  | 38 | 
##
## Program Start
##

## The "C" locale is critical for sorting and joining correctly.
## LC_ALL takes precedence over LANG (and over LC_COLLATE), so it has to be
## forced as well - otherwise a LC_ALL inherited from the user's environment
## would still make 'sort' and 'join' use a non-C collation order.
$ENV{'LANG'}="C";
$ENV{'LC_ALL'}="C";

## Names of the two intermediate (sorted) files.
## They stay undefined until the files are actually created below.
my ($tmp_filename1, $tmp_filename2);

## Remove the intermediate files whichever way the program ends: a normal
## exit() or a die() from one of the steps. cleanup_files() keeps the files
## in debug mode. Nothing is done if the files were never created
## (e.g. --help, or invalid command-line arguments).
END {
	cleanup_files(grep { defined } $tmp_filename1, $tmp_filename2);
}

parse_commandline_options();

## Let File::Temp create (not just name) the files: asking only for a name
## (OPEN=>0) leaves a window in which somebody else can create that file.
## The handles are not needed, 'sort --output' re-opens the files by name.
(my $tmp_fh1, $tmp_filename1) = tempfile();
(my $tmp_fh2, $tmp_filename2) = tempfile();
foreach my $tmp_fh ($tmp_fh1, $tmp_fh2) {
	close($tmp_fh) or die "$PROGRAM: Error: failed to close temporary file: $!\n";
}

sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN);
sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN);
my $join_exit_code = join_files($tmp_filename1, $tmp_filename2);

## The program's exit code is join's exit code (the END block above
## takes care of the temporary files).
exit($join_exit_code);
|  | 51 | 
|  | 52 ## | 
|  | 53 ## Program end | 
|  | 54 ## | 
|  | 55 | 
|  | 56 | 
##
## Print the usage screen to STDOUT and terminate the program (exit code 0).
## Takes no arguments; also used directly as the "--help" option handler.
##
sub show_help()
{
print<<EOF;
${PROGRAM}: Wrapper for GNU join+sort, automatically sorts files before joining them.

Usage: $PROGRAM [OPTIONS] [JOIN-OPTIONS] [SORT-OPTIONS] FILE1 FILE2

OPTIONS: Options specific to this program:

   --header      =  Both input files have a header line as the first line.
                    The header line will be joined properly, without being sorted.

   --version     =  Print ${PROGRAM}'s version.

   --debug       =  Print debug messages (relating to ${PROGRAM}'s operation).

   --help        =  Show this help screen.

   --examples    =  Show usage examples.

   --all         =  Short-cut for:
                      -a 1 -a 2 -o auto -e . -t <TAB>
                    This will show all values (paired and unpaired) from both files,
                    Automatically formatting the columns, and using TAB as field separator.
                    You can override the empty filler (-e X) on the command line.

   --allh        =  Short-cut for:
                       -a 1 -a 2 -o auto -e . -t <TAB> --header
                    Same as above, but will also respect the header line from both input files.

JOIN-OPTIONS:
   The following GNU join options are supported:
       -a FILENUM   -e EMPTY   -i, --ignore-case   -j FIELD   -o FORMAT
       -t CHAR      -v FILENUM -1 FIELD            -2 FIELD
   Run:
       join --help
   To learn what each of them does.

SORT-OPTIONS:
   The following options are supported for the intermediate sorting step:

   -S SIZE
   --buffer-size SIZE   = GNU sort's --buffer-size option.

   -T DIR
   --temporary-directory DIR = GNU sort's --temporary-directory option.

   Run:
      sort --help
   To learn about these options. They might improve sorting performance for big files.

FILE1 FILE2:
   The two input files to be sorted and joined.
   Unlike GNU join,  joining STDIN is not supported. Both files must be real files.


NOTE About "--header" and "-o auto":
   The "--header" feature requires GNU coreutils version 8.6 or later.
   The "-o auto" feature requires GNU coreutils version 8.10 or later.

EOF
	exit(0);
}
|  | 118 | 
##
## Print the program name, version and license to STDOUT, then terminate
## the program (exit code 0). Also used as the "--version" option handler.
##
sub show_version()
{
	## Only the first line needs interpolation; the others are literal text.
	my @version_lines = (
		"$PROGRAM $VERSION",
		'Copyright (C) 2010 A. Gordon (gordon@cshl.edu)',
		'License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)',
		'',
		"To see the GNU's join version, run:",
		"\tjoin --version",
	);
	print map { "$_\n" } @version_lines;
	exit(0);
}
|  | 131 | 
##
## Print a worked usage example to STDOUT, then terminate the program
## (exit code 0). Also used as the "--examples" option handler.
##
sub show_examples()
{
	## Nothing in this text is interpolated, so a single-quoted here-document
	## is used and the shell prompts ('$') need no escaping.
	## The two sample input files are TAB separated.
	my $examples_text = <<'EOF';
Example of joining two unsorted files (each file having a header line):

$ cat input1.txt
Fruit	Color
Apple	red
Banana	yellow
Orange	orange
Melon	green

$ cat input2.txt
Fruit	Price
Orange	7
Avocado	8
Apple	4
Banana	3

$ easyjoin -j 1 -a 1 -a 2 --header -e . -o auto input1.txt input2.txt
Fruit   Color   Price
Apple   red     4
Avocado .       8
Banana  yellow  3
Melon   green   .
Orange  orange  7

## A short-cut for all the options above:
$ easyjoin --allh input1.txt input2.txt
Fruit   Color   Price
Apple   red     4
Avocado .       8
Banana  yellow  3
Melon   green   .
Orange  orange  7

EOF
	print $examples_text;
	exit(0);
}
|  | 171 | 
##
## Parse the command line (@ARGV) into the file-level settings.
##
## Takes no arguments and returns nothing useful. On success the two input
## file names are stored in $input_filename1 / $input_filename2.
## Dies with a message on invalid options, on a wrong number of file names,
## when a file name is "-" (STDIN), or when an input file does not exist.
## The --help, --version and --examples handlers exit the program.
##
sub parse_commandline_options()
{
	## Common part of the "--all" and "--allh" short-cuts:
	##   -a 1 -a 2 -o auto -t <TAB>, and "-e ." unless -e was already given.
	my $select_all = sub {
		push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
		$FIELD_SEP = "\t";
		$OUTPUT_FORMAT = "auto";
		$EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
	};

	##
	## Parse command line
	##
	my $rc = GetOptions(
			"a=i" => sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
			"e=s" => \$EMPTY_FILLER,
			"ignore-case|i" => \$IGNORE_CASE,
			## -j sets the key column of both files
			"j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
			"o=s" => \$OUTPUT_FORMAT,
			"t=s" => \$FIELD_SEP,
			"v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
			"1=i" => \$FILE1_KEY_COLUMN,
			"2=i" => \$FILE2_KEY_COLUMN,
			"debug" => \$debug,
			"header" => \$HEADER,
			"help" => \&show_help,
			"version" => \&show_version,
			"examples" => \&show_examples,
			"buffer-size|S=s" => \$SORT_BUFFER_SIZE,
			"temporary-directory|T=s" => \$SORT_TEMP_DIR,
			"all" => $select_all,
			"allh" => sub {
					$select_all->();
					$HEADER=1;
				},
		);
	die "$PROGRAM: invalid command-line arguments.\n" unless $rc;

	## We need two file names to join
	my @INPUT_FILES = @ARGV;
	die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2);
	die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2);
	die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";

	## The trailing newline stops perl from appending "at <script> line N."
	## to the message, like in all the other error messages of this program.
	die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!\n" unless -e $INPUT_FILES[0];
	die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!\n" unless -e $INPUT_FILES[1];

	$input_filename1 = $INPUT_FILES[0];
	$input_filename2 = $INPUT_FILES[1];
}
|  | 221 | 
##
## Sort one input file by its key column, writing the result to a new file.
##
## Arguments:
##   $input_filename  - file to sort
##   $output_filename - file to write the sorted lines to (sort's --output)
##   $key_column      - 1-based column to sort by (sort's -kN,N)
##
## With --header the "sort-header" helper is run instead of plain "sort".
## Returns nothing; dies if the command can not be executed, is killed by a
## signal, or exits with a non-zero code. If the command was interrupted
## (SIGINT), the same signal is sent to this process.
##
sub sort_file($$$)
{
	my ($input_filename, $output_filename, $key_column) = @_;

	my $sort_program = "sort";
	if ($HEADER) {
		## "./sort-header" only exists when running from the directory that
		## holds the helper. Keep using it when it is there; otherwise look
		## next to this script, and finally fall back to a PATH search.
		$sort_program = "./sort-header";
		unless (-x $sort_program && ! -d _) {
			(my $script_dir = $0) =~ s{[^/]*\z}{};
			my $sibling = $script_dir . "sort-header";
			$sort_program = (length($script_dir) && -x $sibling && ! -d _)
						? $sibling
						: "sort-header";
		}
	}

	my @SORT_COMMAND = ($sort_program);
	push @SORT_COMMAND, "-f" if $IGNORE_CASE;
	push @SORT_COMMAND, "-k${key_column},${key_column}" ;
	push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if $SORT_BUFFER_SIZE;
	push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if $SORT_TEMP_DIR;
	push @SORT_COMMAND, "--output", $output_filename;
	push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
	## Test for a non-empty string, not for truth: "0" is a valid separator
	push @SORT_COMMAND, "-t", $FIELD_SEP if defined $FIELD_SEP && length $FIELD_SEP;
	push @SORT_COMMAND, $input_filename;

	if ($debug) {
		warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
		warn "$PROGRAM: Sort command line:\n";
		print STDERR Dumper(\@SORT_COMMAND), "\n";
	}

	## List form of system(): no shell is involved, file names are passed as-is
	my $sort_exit_code=1;
	system(@SORT_COMMAND);
	if ($? == -1) {
		die "$PROGRAM: Error: failed to execute '$sort_program': $!\n";
	}
	elsif ($? & 127) {
		my $signal = ($? & 127);
		kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
		die "$PROGRAM: Error: '$sort_program' child-process died with signal $signal\n";
	}
	else {
		$sort_exit_code = ($? >> 8);
	}
	die "$PROGRAM: Error: '$sort_program' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0;
}
|  | 258 | 
##
## Run 'join' on two (already sorted) files; join writes to this program's
## STDOUT.
##
## Arguments:
##   $file1, $file2 - the two sorted files to join
##
## Returns join's exit code. Dies if 'join' can not be executed or is killed
## by a signal. If join was interrupted (SIGINT), the same signal is sent to
## this process.
##
sub join_files($$)
{
	my ($file1, $file2) = @_;

	my @join_command = qw/join/;
	push @join_command, "--header" if $HEADER;
	push @join_command, "--ignore-case" if $IGNORE_CASE;
	## Test for a non-empty string, not for truth: "0" is a valid separator
	push @join_command, "-t", $FIELD_SEP if defined $FIELD_SEP && length $FIELD_SEP;
	push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
	push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
	push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
	## Same here: "-o 0" (print only the join field) is a valid format,
	## a plain truth test would silently drop it.
	push @join_command, "-o", $OUTPUT_FORMAT if defined $OUTPUT_FORMAT && length $OUTPUT_FORMAT;
	push @join_command, @OUTPUT_SPECIFIERS;
	push @join_command, $file1, $file2;

	if ($debug) {
		warn "$PROGRAM: Running join on '$file1'  and '$file2'\n";
		warn "$PROGRAM: join command line:\n";
		print STDERR Dumper(\@join_command), "\n";
	}

	## List form of system(): no shell is involved, arguments are passed as-is
	my $join_exit_code=1;
	system(@join_command);
	if ($? == -1) {
		die "$PROGRAM: Error: failed to execute 'join': $!\n";
	}
	elsif ($? & 127) {
		my $signal = ($? & 127);
		kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide
		die "$PROGRAM: Error: 'join' child-process died with signal $signal\n";
	}
	else {
		$join_exit_code = ($? >> 8);
	}
	return $join_exit_code;
}
|  | 295 | 
##
## Delete the given temporary files.
## In debug mode nothing is deleted, each file is only reported on STDERR.
## A file that can not be deleted produces a warning, not a fatal error.
##
sub cleanup_files(@)
{
	my @temp_paths = @_;

	for my $path (@temp_paths) {
		if ($debug) {
			warn "$PROGRAM: debug mode, not deleting temporary file '$path'\n";
			next;
		}

		## unlink returns the number of files it removed
		unlink($path) == 1
			or warn "$PROGRAM: Error: failed to delete temporary file '$path': $!\n";
	}
}