annotate tools/unix_tools/word_list_grep.pl @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/perl
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 use strict;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 use warnings;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 use Getopt::Std;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Forward declarations (the subs are defined after the main program).
sub parse_command_line();
sub load_word_list();
sub compile_regex(@);
sub usage();

# Run-time settings; all of them are assigned by parse_command_line().
my $word_list_file;             # path of the file holding the search words
my $input_file;                 # filehandle lines are read from
my $output_file;                # filehandle matching lines are written to
my $find_complete_words;        # -w : wrap the pattern in \b...\b
my $find_inverse;               # -v : print the lines that do NOT match
my $find_in_specific_column;    # -c N : 1-based column to test, 0 = whole line
my $find_case_insensitive;      # -i : compile the pattern with /i
my $skip_first_line;            # -s : copy the first input line unfiltered


##
## Program Start
##
usage() if @ARGV == 0;
parse_command_line();

my @words = load_word_list();

my $regex = compile_regex(@words);

# Allow first line to pass without filtering?
if ($skip_first_line) {
    my $first_line = <$input_file>;

    # An empty input yields undef here; printing it would only produce an
    # "uninitialized value" warning, so skip it.
    print {$output_file} $first_line if defined $first_line;
}


##
## Main loop
##
LINE:
while ( my $line = <$input_file> ) {
    my $target = $line;

    # If searching in a specific column (and not in the entire line)
    # extract the content of that one column
    if ($find_in_specific_column) {
        my @columns = split ' ', $line;

        # not enough columns in this line - skip it
        next LINE if @columns < $find_in_specific_column;

        $target = $columns[ $find_in_specific_column - 1 ];
    }

    # Match ?  Logical xor: print when exactly one of "matched" and
    # "inverse requested" is true.
    if ( ( $target =~ $regex ) xor $find_inverse ) {
        print {$output_file} $line;
    }
}

close $input_file;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result must be checked.
close $output_file
    or die "$0: Failed to write output: $!\n";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 ##
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 ## Program end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 ##
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Parse the command line and fill in the file-level settings.
#
# Reads the switches -s -i -w -v, the valued options -c N and -o FILE, and the
# positional arguments WORD-LIST-FILE [INPUT-FILE] left in @ARGV by getopts.
# Sets $word_list_file, the four boolean flags, $find_in_specific_column
# (0 when -c is absent), and opens $output_file / $input_file (STDOUT / STDIN
# when no file is given).
#
# Dies with a message on an invalid option, a missing or non-existent
# word-list file, a column number that is not a positive integer, or a file
# that cannot be opened.
sub parse_command_line()
{
    my %opts;
    getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n";

    die "$0: missing word-list file name\n" if (@ARGV==0);

    $word_list_file = $ARGV[0];
    die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ;

    $find_complete_words   = ( exists $opts{w} );
    $find_inverse          = ( exists $opts{v} );
    $find_case_insensitive = ( exists $opts{i} );
    $skip_first_line       = ( exists $opts{s} );

    # Search in specific column ?
    if ( defined $opts{c} ) {
        $find_in_specific_column = $opts{c};

        die "$0: invalid column number ($find_in_specific_column).\n"
            unless $find_in_specific_column =~ /^\d+$/ ;

        # Columns are 1-based; 0 is reserved for "search the whole line".
        die "$0: invalid column number ($find_in_specific_column).\n"
            if $find_in_specific_column <= 0;
    }
    else {
        $find_in_specific_column = 0 ;
    }

    # Output File specified (instead of STDOUT) ?
    # Three-argument open keeps characters such as '>', '|' or surrounding
    # blanks in the name from being interpreted as part of the open mode.
    # A name of '-' still selects the standard stream, as it did before.
    if ( defined $opts{o} && $opts{o} ne '-' ) {
        my $filename = $opts{o};
        open $output_file, '>', $filename
            or die "$0: Failed to create output file '$filename': $!\n" ;
    }
    else {
        $output_file = *STDOUT ;
    }

    # Input file Specified (instead of STDIN) ?
    if ( @ARGV>1 && $ARGV[1] ne '-' ) {
        my $filename = $ARGV[1];
        open $input_file, '<', $filename
            or die "$0: Failed to open input file '$filename': $!\n" ;
    }
    else {
        $input_file = *STDIN;
    }
}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Read the search words from $word_list_file.
#
# Each line is stripped of leading and trailing whitespace; lines that are
# empty afterwards are ignored. Every remaining word is passed through
# quotemeta so that it is matched literally once joined into a pattern.
#
# Returns the list of escaped words.
# Dies if the file cannot be opened or contains no words.
sub load_word_list()
{
    # Lexical handle and three-argument open: the file name is never parsed
    # for mode characters, and no global bareword handle is left behind.
    open my $wordlist_fh, '<', $word_list_file
        or die "$0: Failed to open word-list file '$word_list_file': $!\n" ;

    my @words ;
    while ( my $word = <$wordlist_fh> ) {
        chomp $word;
        $word =~ s/^\s+//;
        $word =~ s/\s+$//;
        next if length($word) == 0;
        push @words, quotemeta $word;
    }
    close $wordlist_fh;

    die "$0: Error: word-list file '$word_list_file' is empty!\n"
        unless @words;

    return @words;
}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Build one compiled pattern that matches any of the given words.
#
# Arguments: the words, already escaped for use inside a regex.
# The words are joined into a single alternation. When $find_complete_words
# is set the alternation is wrapped in \b(...)\b; when $find_case_insensitive
# is set the pattern is compiled with /i.
#
# Returns the compiled (qr//) pattern.
sub compile_regex(@)
{
    my $pattern = join '|', @_;

    $pattern = "\\b($pattern)\\b" if $find_complete_words;

    return $find_case_insensitive ? qr/$pattern/i : qr/$pattern/;
}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Print the help text to standard output and terminate the program.
# $0 is interpolated into the usage line; the sub never returns.
sub usage()
{
    my $help_text = <<EOF;

Word-List Grep
Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )

Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]

-s - do not filter first line - always output the first line from the input file.
-w - search for complete words (not partial sub-strings).
-i - case insensitive search.
-v - inverse - output lines NOT matching the word list.
-c N - check only column N, instead of entire line (line split by whitespace).
-o OUT - specify output file (default = STDOUT).
WORD-LIST-FILE - file containing one word per line. These will be used
for the search.
INPUT-FILE - (optional) read from file (default = from STDIN).



EOF

    print $help_text;

    exit;
}