Mercurial > repos > bgruening > text_processing
comparison sort-header @ 0:5314e5d6f040 draft
Imported from capsule None
| author | bgruening |
|---|---|
| date | Thu, 29 Jan 2015 07:53:17 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5314e5d6f040 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 ## | |
| 3 ## Sort-header - wrapper for GNU sort with header-line support | |
| 4 ## | |
| 5 ## Copyright(C) A. Gordon | |
| 6 ## license AGPLv3+ | |
| 7 ## | |
| 8 use strict; | |
| 9 use warnings; | |
| 10 use Data::Dumper; | |
| 11 use IO::Handle; | |
| 12 use Getopt::Long qw(:config bundling no_ignore_case_always); | |
| 13 | |
## Forward declarations.
## Each prototype here has to agree with the one on the matching definition.
sub add_standard_sort_param(@);
sub add_standard_sort_param_value(@);
sub forbidden_sort_param(@);
sub show_help();
sub show_version();
sub show_examples();
sub parse_commandline_options();
sub reassign_input_output();
sub process_header_lines();
sub run_sort();
sub read_line_non_buffered();


##
## Runtime options
##
my $PROGRAM = "sort-header";
my $VERSION = 0.4;

## All of these start out undefined.
my ($check_only, $input_file, $output_file, $field_separator, $debug);

my $header_lines   = 1;    # number of leading lines copied to the output unsorted
my $sort_exit_code = 1;    # by default, assume some error

my @sort_options;          # arguments handed to the 'sort' child process

##
## Program Start
##
parse_commandline_options();
reassign_input_output();
process_header_lines();
run_sort();
exit($sort_exit_code);
##
## Program End
##
| 55 | |
##
## Print a worked usage example to STDOUT, then terminate with exit status 0.
##
sub show_examples()
{
    # Non-interpolating heredoc: the '$' shell prompts need no escaping.
    my $examples = <<'EOF';
Sorting a file with a header line:

$ cat input.txt
Fruit Color Price
Banana Yellow 4.1
Avocado Green 8.0
Apple Red 3.0
Melon Green 6.1

# By default, 'sort-header' assumes 1 header line
# (no need to use --header in this case).

$ sort-header -k3,3nr input.txt
Fruit Color Price
Avocado Green 8.0
Melon Green 6.1
Banana Yellow 4.1
Apple Red 3.0

EOF
    print $examples;
    exit(0);
}
| 81 | |
##
## Print the help screen to STDOUT, then terminate with exit status 0.
##
## The list of unsupported GNU sort options is kept in step with the
## options that parse_commandline_options() routes to forbidden_sort_param().
##
sub show_help()
{
    # ${PROGRAM} is written with braces wherever an apostrophe follows,
    # because "$PROGRAM's" would be parsed as the package variable $PROGRAM::s.
    print <<"EOF";
${PROGRAM}: Wrapper for GNU sort, allowing sorting files with header lines.

Usage: $PROGRAM [HEADER-OPTIONS] [GNU sort Options] [INPUT-FILE]

HEADER-OPTIONS: the following options are supported by '${PROGRAM}':

--header N = Treat the first N lines as header lines.
These lines will NOT be sorted. They will be passed
directly to the output file. (default: 1)

--version = Print ${PROGRAM}'s version.

--debugheader = Print debug messages (relating to ${PROGRAM}'s operation).

--help = Show this help screen.

--examples = Show usage examples.

GNU sort options:
Most of the standard GNU sort options are supported and passed to GNU sort.
The following options can not be used with '${PROGRAM}':

-m --merge => ${PROGRAM} can only sort one file, not merge multiple files.
-c -C --check => Currently not supported
--files0-from => Currently not supported
-z --zero-terminated => Currently not supported
--batch-size => Currently not supported

INPUT-FILE:
If INPUT-FILE is not specified, $PROGRAM will use STDIN (just like GNU sort).

EOF
    exit(0);
}
| 118 | |
##
## Print the program name, version and licence notice to STDOUT,
## then terminate with exit status 0.
##
sub show_version()
{
    my $banner = <<"EOF";
$PROGRAM $VERSION
Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)
License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)

To see the GNU's sort version, run:
sort --version
EOF
    print $banner;
    exit(0);
}
| 131 | |
##
## Parse @ARGV.
##
## Options understood by GNU sort are collected into @sort_options through
## the add_standard_sort_param* callbacks. Options this wrapper cannot
## support abort the program through forbidden_sort_param. The wrapper's
## own options (--header, --debugheader, --output) are stored in the
## file-level variables. At most one positional argument is accepted and
## stored in $input_file.
##
## Exits with status 1 if GetOptions reports a parsing error; dies on a
## negative --header value or on more than one input file.
##
sub parse_commandline_options()
{
    my $rc = GetOptions(
        # Ordering options, passed through to sort
        "ignore-leading-blanks|b" => \&add_standard_sort_param,
        "dictionary-order|d"      => \&add_standard_sort_param,
        "ignore-case|f"           => \&add_standard_sort_param,
        "general-numeric-sort|g"  => \&add_standard_sort_param,
        "ignore-nonprinting|i"    => \&add_standard_sort_param,
        "month-sort|M"            => \&add_standard_sort_param,
        "human-numeric-sort|h"    => \&add_standard_sort_param,
        "numeric-sort|n"          => \&add_standard_sort_param,
        "random-source=s"         => \&add_standard_sort_param_value,
        "random-sort|R"           => \&add_standard_sort_param,
        "reverse|r"               => \&add_standard_sort_param,
        "sort=s"                  => \&add_standard_sort_param_value,
        "version-sort|V"          => \&add_standard_sort_param,

        "check|c"                 => \&forbidden_sort_param,
        "C"                       => \&forbidden_sort_param,
        "compress-program=s"      => \&add_standard_sort_param_value,
        "debug"                   => \&add_standard_sort_param,

        "files0-from=s"           => \&forbidden_sort_param,

        "key|k=s"                 => \&add_standard_sort_param_value,
        "merge|m"                 => \&forbidden_sort_param,
        "batch-size=i"            => \&forbidden_sort_param,

        "parallel=i"              => \&add_standard_sort_param_value,

        # Handled by this wrapper (see reassign_input_output), not passed on
        "output|o=s"              => \$output_file,

        "stable|s"                => \&add_standard_sort_param,
        "buffer-size|S=s"         => \&add_standard_sort_param_value,

        "field-separator|t=s"     => \&add_standard_sort_param_value,
        "temporary-directory|T=s" => \&add_standard_sort_param_value,
        "unique|u"                => \&add_standard_sort_param,

        "zero-terminated|z"       => \&forbidden_sort_param,

        # Informational options; each callback prints and exits
        "help"                    => \&show_help,
        "version"                 => \&show_version,
        "examples"                => \&show_examples,

        # Wrapper-specific options
        "header=i"                => \$header_lines,
        "debugheader"             => \$debug,
    );

    exit 1 unless $rc;

    # Whatever GetOptions left in @ARGV are positional arguments.
    my @input_files = @ARGV;

    die "$PROGRAM: error: invalid number of header lines ($header_lines)\n"
        if $header_lines < 0;
    die "$PROGRAM: error: Multiple input files specified. This program can sort only a single file.\n"
        if @input_files > 1;
    $input_file = $input_files[0] if @input_files == 1;

    if ($debug) {
        warn "$PROGRAM: number of header lines = $header_lines\n";
        warn "$PROGRAM: PASS-to-Sort options:\n", Dumper(\@sort_options), "\n";
    }
}
| 194 | |
##
## Re-point STDOUT and/or STDIN at the files named on the command line.
##
## The 'sort' child process started later is given no file arguments, so
## it works on whatever STDIN/STDOUT refer to once this has run.
##
## Dies if a file cannot be opened or a standard handle cannot be
## reassigned.
##
sub reassign_input_output()
{
    # Test defined/length rather than truth, so that a file literally
    # named "0" is honoured instead of being silently ignored.
    if (defined $output_file && length $output_file) {
        warn "$PROGRAM: Re-assigning STDOUT to '$output_file'\n" if $debug;
        open my $out_fh, '>', $output_file
            or die "$PROGRAM: Error: failed to create output file '$output_file': $!\n";
        # fdopen duplicates the descriptor onto STDOUT; the lexical handle
        # can then be closed safely when it goes out of scope.
        STDOUT->fdopen($out_fh, 'w')
            or die "$PROGRAM: Error: failed to reassign STDOUT to '$output_file': $!\n";
    }

    if (defined $input_file && length $input_file) {
        warn "$PROGRAM: Re-assigning STDIN to '$input_file'\n" if $debug;
        open my $in_fh, '<', $input_file
            or die "$PROGRAM: Error: failed to open input file '$input_file': $!\n";
        STDIN->fdopen($in_fh, 'r')
            or die "$PROGRAM: Error: failed to reassign STDIN to '$input_file': $!\n";
    }
}
| 210 | |
##
## Copy the first $header_lines lines from STDIN to STDOUT unchanged.
##
## If the input ends before that many lines have been read, the program
## exits immediately.
##
sub process_header_lines()
{
    warn "$PROGRAM: Reading $header_lines header lines...\n" if $debug;

    my $copied = 0;
    while ($copied < $header_lines) {
        my $header = read_line_non_buffered();
        exit if !defined $header;    # ran out of input
        print $header;
        $copied++;
    }
}
| 220 | |
##
## Run GNU sort on the rest of STDIN, passing it the collected @sort_options.
##
## On a normal exit, sort's exit status is stored in $sort_exit_code.
## Dies if 'sort' cannot be executed or is killed by a signal. If that
## signal is SIGINT (2), it is first re-sent to this process.
##
sub run_sort()
{
    warn "$PROGRAM: Running GNU sort...\n" if $debug;

    # The header lines were written through Perl's buffered STDOUT, while
    # the child writes straight to the inherited descriptor. Flush
    # explicitly so the header cannot end up after sort's output on
    # platforms where Perl does not flush automatically before a fork.
    STDOUT->flush()
        or die "$PROGRAM: Error: failed to write header lines: $!\n";

    # List form: no shell is involved, options are passed verbatim.
    system('sort', @sort_options);

    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute 'sort': $!\n";
    }
    elsif ($? & 127) {
        my $signal = ($? & 127);
        ## if sort was interrupted (CTRL-C) - just pass it on and commit suicide
        kill 2, $$ if $signal == 2;
        die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
    }
    else {
        $sort_exit_code = ($? >> 8);
    }
}
| 237 | |
| 238 | |
##
## Getopt::Long callback for sort options that take no argument.
## Records the option for 'sort' without any accompanying value.
##
sub add_standard_sort_param(@)
{
    my $option_name = shift;
    return add_standard_sort_param_value($option_name, undef);
}
| 244 | |
##
## Getopt::Long callback for sort options that carry a value.
##
## Parameters:
##   $obj   - the option as passed by Getopt::Long; stringifying it gives
##            the option name
##   $value - the option's value, or undef for an option without one
##
## Appends "-x" (one-letter name) or "--name" to @sort_options, followed
## by the value when one was supplied.
##
sub add_standard_sort_param_value(@)
{
    my ($obj, $value) = @_;

    my $name   = "$obj";    # stringify the option object, get the option name
    my $dashes = length($name) == 1 ? "-" : "--";

    push @sort_options, $dashes . $name;

    # Test definedness, not truth: values such as "0" (e.g. '-t 0') are
    # legitimate and must reach sort.
    push @sort_options, $value if defined $value;

    return;
}
| 259 | |
##
## Getopt::Long callback for sort options this wrapper does not support.
## Always dies, naming the offending option.
##
sub forbidden_sort_param(@)
{
    my ($obj, $value) = @_;

    # stringify the option object, get the option name.
    my $option = "$obj";

    die "$PROGRAM: Error: option '$option' can not be used with this program. If you must use it, run GNU sort directly. see --help for more details.\n";
}
| 267 | |
##
## Read one line from STDIN, one byte at a time, using sysread.
##
## Reading this way bypasses Perl's input buffering, so nothing beyond the
## returned line is consumed from the underlying descriptor. The rest of
## the input is left for the 'sort' child process, which inherits STDIN.
##
## Returns the line including its trailing newline, or the final
## unterminated fragment if the input ends without one. Returns undef at
## end of input when nothing was read. Dies on a read error.
##
sub read_line_non_buffered()
{
    my $line = '';
    while (1) {
        my $rc = sysread STDIN, my $c, 1;
        die "$PROGRAM: STDIN Read error: $!" unless defined $rc;

        if ($rc == 0) {
            # End of input. Test length, not truth: a final unterminated
            # line consisting of "0" is still data and must be returned.
            return length($line) ? $line : undef;
        }

        $line .= $c;
        return $line if $c eq "\n";
    }
}
| 281 |
