annotate easyjoin @ 21:86755160afbf draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/text_processing commit c2b1677d1c94433f777c2dc28ac8eec0a99cc6a7
author bgruening
date Fri, 16 Aug 2024 10:41:54 +0000
parents 5314e5d6f040
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
1 #!/usr/bin/env perl
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
2 ## EASY Join -
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
3 ## Join with automatic pre-sorting of both files
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
4 ## Copyright (C) 2010 A. Gordon (gordon@cshl.edu)
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
5 ## license: AGPLv3+
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
6 use strict;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
7 use warnings;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
8 use Data::Dumper;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
9 use Getopt::Long qw(:config bundling no_ignore_case_always);
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
10 use File::Temp qw/tempfile/;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
11 use POSIX qw(locale_h);
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
12
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
## Forward declarations (with prototypes), so that the subs can be called
## by the main program before their definitions appear further down.
sub show_help();
sub show_version();
sub show_examples();
sub parse_commandline_options();
sub sort_file($$$);
sub join_files($$);
sub cleanup_files(@);


## Program identity, used in the help/version screens and in every message.
my $PROGRAM = "easyjoin";
my $VERSION = "0.6.1";

## Settings filled in by parse_commandline_options().
## Everything except the key columns starts out undefined (= "not given").
my $debug;                  ## --debug
my $HEADER;                 ## --header (also switched on by --allh)
my $IGNORE_CASE;            ## -i / --ignore-case
my $FIELD_SEP;              ## -t
my $FILE1_KEY_COLUMN = 1;   ## -1 (or -j); defaults to the first field
my $FILE2_KEY_COLUMN = 1;   ## -2 (or -j); defaults to the first field
my @OUTPUT_SPECIFIERS;      ## accumulated "-a N" / "-v N" pairs, passed to join
my $OUTPUT_FORMAT;          ## -o
my $EMPTY_FILLER;           ## -e
my $SORT_BUFFER_SIZE;       ## -S / --buffer-size
my $SORT_TEMP_DIR;          ## -T / --temporary-directory

## The two input files named on the command line.
my $input_filename1;
my $input_filename2;
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
38
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Program Start
##

## The "C" locale is critical for sorting and joining correctly: 'sort' and
## 'join' must agree on the collation order. LC_ALL takes precedence over
## LANG, so it is forced as well; otherwise an LC_ALL inherited from the
## caller's environment would still be used by the child processes.
$ENV{'LANG'}   = "C";
$ENV{'LC_ALL'} = "C";
parse_commandline_options();

## Create the temporary files up-front (and close them right away) instead
## of merely asking for unused names: this reserves the names safely, and
## guarantees that both files exist when cleanup_files() runs.
my ($tmp_fh1, $tmp_filename1) = tempfile();
my ($tmp_fh2, $tmp_filename2) = tempfile();
close($tmp_fh1);
close($tmp_fh2);

## sort_file() and join_files() die on failure. Trap that, so the temporary
## files are removed on the failure path too, then re-raise the error.
my $join_exit_code = eval {
    sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN);
    sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN);
    join_files($tmp_filename1, $tmp_filename2);
};
my $failure = $@;
cleanup_files($tmp_filename1, $tmp_filename2);
die $failure if $failure;

## The program's exit code is the exit code of the 'join' child process.
exit($join_exit_code);

##
## Program end
##
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
55
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
56
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Print the usage screen to STDOUT, then terminate with exit code 0.
## Installed as the handler of the "help" command-line option.
##
sub show_help()
{
## ${PROGRAM} is written with braces where it is followed by an apostrophe:
## "$PROGRAM's" would be parsed as the package variable $PROGRAM::s.
print<<EOF;
${PROGRAM}: Wrapper for GNU join+sort, automatically sorts files before joining them.

Usage: $PROGRAM [OPTIONS] [JOIN-OPTIONS] [SORT-OPTIONS] FILE1 FILE2

OPTIONS: Options specific to this program:

   --header      =  Both input files have a header line as the first line.
                    The header line will be joined properly, without being sorted.

   --version     =  Print ${PROGRAM}'s version.

   --debug       =  Print debug messages (relating to ${PROGRAM}'s operation).

   --help        =  Show this help screen.

   --examples    =  Show usage examples.

   --all         =  Short-cut for:
                      -a 1 -a 2 -o auto -e . -t <TAB>
                    This will show all values (paired and unpaired) from both files,
                    Automatically formatting the columns, and using TAB as field separator.
                    You can override the empty filler (-e X) on the command line.

   --allh        =  Short-cut for:
                      -a 1 -a 2 -o auto -e . -t <TAB> --header
                    Same as above, but will also respect the header line from both input files.

JOIN-OPTIONS:
   The following GNU join options are supported:
     -a, -e, -i (--ignore-case), -j, -o, -t, -v, -1, -2
   Run:
       join --help
   To learn about these options.

SORT-OPTIONS:
   The following options are supported for the intermediate sorting step:

   -S SIZE
   --buffer-size SIZE         = GNU sort's --buffer-size option.

   -T DIR
   --temporary-directory DIR  = GNU sort's --temporary-directory option.

   Run:
       sort --help
   To learn about these options. They might improve sorting performance for big files.

FILE1 FILE2:
   The two input files to be sorted, joined.
   Unlike GNU join, joining STDIN is not supported. Both files must be real files.


NOTE About "--header" and "-o auto":
   The "--header" feature requires GNU coreutils version 8.6 or later.
   The "-o auto" feature requires GNU coreutils version 8.10 or later.

EOF
    exit(0);
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
118
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Print the program name, version, copyright and license to STDOUT,
## then terminate with exit code 0.
## Installed as the handler of the "version" command-line option.
##
sub show_version()
{
    my @banner_lines = (
        "$PROGRAM $VERSION",
        "Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)",
        "License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)",
        "",
        "To see the GNU's join version, run:",
        "join --version",
    );

    ## One output line per list element, each terminated by a newline.
    print map { "$_\n" } @banner_lines;
    exit(0);
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
131
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Print a worked usage example (two small input files and the joined
## result) to STDOUT, then terminate with exit code 0.
## Installed as the handler of the "examples" command-line option.
##
sub show_examples()
{
## Nothing in this text is interpolated, so a single-quoted heredoc is
## used and the shell-prompt '$' characters need no escaping.
print <<'EOF';
Example of joining two unsorted files (each file having a header line):

$ cat input1.txt
Fruit Color
Apple red
Banana yellow
Orange orange
Melon green

$ cat input2.txt
Fruit Price
Orange 7
Avocado 8
Apple 4
Banana 3

$ easyjoin -j 1 -a 1 -a 2 --header -e . -o auto input1.txt input2.txt
Fruit Color Price
Apple red 4
Avocado . 8
Banana yellow 3
Melon green .
Orange orange 7

## A short-cut for all the options above:
$ easyjoin --allh input1.txt input2.txt
Fruit Color Price
Apple red 4
Avocado . 8
Banana yellow 3
Melon green .
Orange orange 7

EOF
    exit(0);
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
171
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Parse @ARGV into the file-level option variables and validate the two
## input file operands.
##
## Takes no arguments and returns nothing useful; on success
## $input_filename1 / $input_filename2 hold the two file names.
## Dies with a "$PROGRAM: ..." message on invalid options, a key column
## below 1, a wrong number of operands, "-" (STDIN) as an operand, or a
## non-existing input file.
## The help / version / examples handlers exit the program themselves.
##
sub parse_commandline_options()
{
    ## Common part of the --all / --allh short-cuts:
    ##   -a 1 -a 2 -o auto -t <TAB>, plus "-e ." unless a filler was
    ##   already given earlier on the command line (a later -e still wins).
    my $enable_all = sub {
        push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
        $FIELD_SEP = "\t";
        $OUTPUT_FORMAT = "auto";
        $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
    };

    ##
    ## Parse command line
    ## (Getopt::Long calls code-ref handlers with the option name in $_[0]
    ##  and the option value in $_[1].)
    ##
    my $rc = GetOptions(
        "a=i" => sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
        "e=s" => \$EMPTY_FILLER,
        "ignore-case|i" => \$IGNORE_CASE,
        "j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
        "o=s" => \$OUTPUT_FORMAT,
        "t=s" => \$FIELD_SEP,
        "v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
        "1=i" => \$FILE1_KEY_COLUMN,
        "2=i" => \$FILE2_KEY_COLUMN,
        "debug" => \$debug,
        "header" => \$HEADER,
        "help" => \&show_help,
        "version" => \&show_version,
        "examples" => \&show_examples,
        "buffer-size|S=s" => \$SORT_BUFFER_SIZE,
        "temporary-directory|T=s" => \$SORT_TEMP_DIR,
        "all" => sub { $enable_all->(); },
        "allh" => sub { $enable_all->(); $HEADER = 1; },
    );
    die "$PROGRAM: invalid command-line arguments.\n" unless $rc;

    ## Field numbers are 1-based; reject anything else here, with a clear
    ## message, instead of letting the sort step fail on it later.
    foreach my $key_column ($FILE1_KEY_COLUMN, $FILE2_KEY_COLUMN) {
        die "$PROGRAM: error: invalid key column '$key_column' (must be 1 or greater)\n"
            if $key_column < 1;
    }

    ## We need two file names to join
    my @INPUT_FILES = @ARGV;
    die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2);
    die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2);
    die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";

    ## The trailing "\n" keeps Perl from appending "at <script> line <N>."
    ## to the message, consistent with all the other error messages.
    die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!\n" unless -e $INPUT_FILES[0];
    die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!\n" unless -e $INPUT_FILES[1];

    $input_filename1 = $INPUT_FILES[0];
    $input_filename2 = $INPUT_FILES[1];
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
221
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Sort one input file on its key column, writing the result to a new file.
##
## Arguments:
##   $input_filename  - file to sort
##   $output_filename - file the sorted output is written to (--output)
##   $key_column      - 1-based field number used as the only sort key
##
## Runs "sort", or "./sort-header" (resolved relative to the current
## working directory) when $HEADER is set.
## Returns nothing useful. Dies if the command can not be executed, is
## killed by a signal, or exits with a non-zero exit code.
##
sub sort_file($$$)
{
    my ($input_filename, $output_filename, $key_column) = @_;

    ## NOTE(review): sort-header is an external helper not visible here;
    ## presumably it sorts while keeping the header line first - confirm.
    my $sort_program = $HEADER ? "./sort-header" : "sort";

    my @SORT_COMMAND = ($sort_program);
    push @SORT_COMMAND, "-f" if $IGNORE_CASE;
    ## Restrict the sort key to exactly the join field
    push @SORT_COMMAND, "-k${key_column},${key_column}" ;
    push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if $SORT_BUFFER_SIZE;
    push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if $SORT_TEMP_DIR;
    push @SORT_COMMAND, "--output", $output_filename;
    ## --debugheader is only ever passed to sort-header, never to plain sort
    push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
    ## Test for a non-empty string rather than for truth, so that the
    ## separator "0" is not silently dropped.
    push @SORT_COMMAND, "-t", $FIELD_SEP if defined $FIELD_SEP && length $FIELD_SEP;
    push @SORT_COMMAND, $input_filename;

    if ($debug) {
        warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
        warn "$PROGRAM: Sort command line:\n";
        print STDERR Dumper(\@SORT_COMMAND), "\n";
    }

    ## List-form system(): the arguments are passed as-is, without a shell.
    my $sort_exit_code=1;
    system(@SORT_COMMAND);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute '$sort_program': $!\n";
    }
    elsif ($? & 127) {
        ## The low 7 bits of $? hold the signal that killed the child
        my $signal = ($? & 127);
        kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
        die "$PROGRAM: Error: '$sort_program' child-process died with signal $signal\n";
    }
    else {
        ## The high byte of $? holds the child's exit code
        $sort_exit_code = ($? >> 8);
    }
    die "$PROGRAM: Error: '$sort_program' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0;
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
258
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Run "join" on two (already sorted) files; join's output goes to this
## program's STDOUT.
##
## Arguments:
##   $file1, $file2 - the two files to join
##
## The join options are built from the file-level settings
## (--header, --ignore-case, -t, -1, -2, -e, -o and the -a/-v specifiers).
## Returns the exit code of the join process.
## Dies if join can not be executed or is killed by a signal.
##
sub join_files($$)
{
    my ($file1, $file2) = @_;

    my @join_command = qw/join/;
    push @join_command, "--header" if $HEADER;
    push @join_command, "--ignore-case" if $IGNORE_CASE;
    ## -t and -o are tested for "non-empty string" rather than for truth:
    ## "0" is a valid separator, and "0" is a valid output format
    ## (it denotes the join field), so neither may be silently dropped.
    push @join_command, "-t", $FIELD_SEP if defined $FIELD_SEP && length $FIELD_SEP;
    push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
    push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
    push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
    push @join_command, "-o", $OUTPUT_FORMAT if defined $OUTPUT_FORMAT && length $OUTPUT_FORMAT;
    push @join_command, @OUTPUT_SPECIFIERS;
    push @join_command, $file1, $file2;

    if ($debug) {
        warn "$PROGRAM: Running join on '$file1' and '$file2'\n";
        warn "$PROGRAM: join command line:\n";
        print STDERR Dumper(\@join_command), "\n";
    }

    ## List-form system(): the arguments are passed as-is, without a shell.
    my $join_exit_code=1;
    system(@join_command);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute 'join': $!\n";
    }
    elsif ($? & 127) {
        ## The low 7 bits of $? hold the signal that killed the child
        my $signal = ($? & 127);
        kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide
        die "$PROGRAM: Error: 'join' child-process died with signal $signal\n";
    }
    else {
        ## The high byte of $? holds the child's exit code
        $join_exit_code = ($? >> 8);
    }
    return $join_exit_code;
}
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
295
5314e5d6f040 Imported from capsule None
bgruening
parents:
diff changeset
##
## Delete the given (temporary) files.
##
## Arguments: a list of file names.
## In debug mode nothing is deleted; a notice is printed for each file
## instead, so the intermediate files can be inspected.
## A failed deletion only produces a warning, it is never fatal.
##
sub cleanup_files(@)
{
    for my $path (@_) {
        if ($debug) {
            warn "$PROGRAM: debug mode, not deleting temporary file '$path'\n";
            next;
        }

        ## unlink returns the number of files it removed
        unlink($path) == 1
            or warn "$PROGRAM: Error: failed to delete temporary file '$path': $!\n";
    }
}