annotate GrepFile.pl @ 1:dba6ffec8e2e draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:37:30 -0500
parents
children 606e24c6fda0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
1 #!/usr/bin/perl
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
2
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
3 # load modules
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
use Getopt::Std;
use threads;
use Thread::Queue;
use threads::shared;
use File::Copy qw(copy);
use File::Path qw(remove_tree);
use File::Temp qw(tempdir);
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
8
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
use strict;
use warnings;

# Wall-clock start time; used for the run-time report at the very end.
my $now = time;

# opts
# i : infile
# f : patternfile (type 'file') or the pattern itself (type 'single')
# o : output file
# t : type (file/single)
# I : Insensitive to case
# P : Perl-Based Grep (boolean)
# A : number of extra lines to fetch
my %opts;
getopts('i:f:o:t:PIA:', \%opts);

## variables read by the worker threads.
my $infile :shared;
my $outfile :shared;
my $args :shared;
my $tmpdir :shared;

## queues are package variables so the worker subs (started by name) see them.
our ($grepqueue, $printqueue);
## scratch package variables the worker subs may assign to; declared here so
## those subs compile under strict.
our ($command, $counter, $output);

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;
## nr of patterns written to each pattern batch file.
my $batch_size = 100;

## validate all options before anything is created on disk.
if (!defined($opts{'i'})) { die('Input file is mandatory'); }
$infile = $opts{'i'};
if (!defined($opts{'o'})) { die('Output file is mandatory'); }
$outfile = $opts{'o'};
if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)'); }
my $type = $opts{'t'};
if ($type ne 'file' && $type ne 'single') {
    die('only "file" and "single" are supported as value of the -t flag');
}
if (!defined($opts{'f'})) { die('Pattern (or pattern file) is mandatory'); }

## grep options, kept both as a list (direct exec) and as a string (threads).
my @grep_opts;
if (defined($opts{'I'})) {
    push(@grep_opts, '-i');
}
if (defined($opts{'P'})) {
    push(@grep_opts, '-P');
}
if (defined($opts{'A'})) {
    # \A..\z instead of ^..$ so a trailing newline is not accepted.
    if ($opts{'A'} =~ m/\A[0-9]+\z/) {
        push(@grep_opts, '-A', $opts{'A'});
    }
    else {
        die("Invalid amount of lines to fetch provided (must be integer)");
    }
}
# Same layout as before: every option preceded by a single space.
$args = join('', map { " $_" } @grep_opts);

## create tmp location; tempdir() picks a unique name and creates the
## directory in one step, and croaks if it cannot.
$tmpdir = tempdir('GrepFile.XXXXXX', DIR => '/tmp');
my $result_file = "$tmpdir/result.txt";

if ($type eq 'single') {
    ## single pattern => direct processing, no threads.
    # List-form pipe open: no shell is involved, so the pattern and the
    # file name are passed to grep verbatim.
    my @cmd = ('grep', @grep_opts, '-e', $opts{'f'}, '--', $infile);
    open(my $grep_fh, '-|', @cmd)
        or die("Could not start grep: $!");
    open(my $res_fh, '>', $result_file)
        or die("Could not write $result_file: $!");
    while (my $line = <$grep_fh>) {
        print {$res_fh} $line;
    }
    close($res_fh) or die("Could not close $result_file: $!");
    close($grep_fh);
    # grep exits with 1 when nothing matched; anything higher is an error.
    if (($? >> 8) > 1) {
        die('grep failed with exit status ' . ($? >> 8));
    }
}
else {
    ## pattern file => batches of patterns handled by a pool of threads.
    my $pattfile = $opts{'f'};
    if (!-r $pattfile) { die("Pattern file '$pattfile' is not readable"); }

    ## copy infile to local system for speed
    copy($infile, "$tmpdir/infile")
        or die("Could not copy '$infile' to $tmpdir/infile: $!");
    mkdir("$tmpdir/pattfiles")
        or die("Could not create $tmpdir/pattfiles: $!");

    ## make sure the result file exists even when nothing matches.
    open(my $touch_fh, '>', $result_file)
        or die("Could not create $result_file: $!");
    close($touch_fh);

    ## start the workers before the patterns are loaded, so the pattern
    ## list is not part of what each new thread copies.
    $grepqueue  = Thread::Queue->new();
    $printqueue = Thread::Queue->new();
    my @grep_threads;
    for my $i (1 .. $nrgrep) {
        my $thr = threads->create('grep')
            or die("Could not create grep thread $i");
        push(@grep_threads, $thr);
    }
    my $print_thread = threads->create('printout')
        or die('Could not create print thread');

    ## make sure patterns are unique (first occurrence wins).
    open(my $patt_fh, '<', $pattfile)
        or die("Could not open pattern file '$pattfile': $!");
    my %seen;
    my @patterns;
    while (my $line = <$patt_fh>) {
        chomp($line);
        if (!$seen{$line}++) {
            push(@patterns, $line);
        }
    }
    close($patt_fh);

    ## run in batches of $batch_size patterns; the queue carries the batch nr.
    my $idx = 0;
    while (my @subset = splice(@patterns, 0, $batch_size)) {
        $idx++;
        my $batch_file = "$tmpdir/pattfiles/$idx";
        open(my $batch_fh, '>', $batch_file)
            or die("Could not write $batch_file: $!");
        print {$batch_fh} join("\n", @subset);
        close($batch_fh) or die("Could not close $batch_file: $!");
        $grepqueue->enqueue($idx);
    }
    ## one undef per grep thread tells it to stop.
    for my $i (1 .. $nrgrep) {
        $grepqueue->enqueue(undef);
    }

    ## wait for the grep threads, then let the printer drain and stop.
    for my $thr (@grep_threads) {
        $thr->join();
    }
    $printqueue->enqueue(undef);
    $print_thread->join();
}

## deliver the result and clean up.
copy($result_file, $outfile)
    or die("Could not copy results to '$outfile': $!");
remove_tree($tmpdir);

##################
# PRINT RUN-TIME #
##################
$now = time - $now;
printf("\n\nRunning time:%02d:%02d:%02d\n",int($now/3600),int(($now % 3600)/60),int($now % 60));
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
132
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
# Body of a grep worker thread.
#
# Takes batch numbers from $grepqueue until it receives undef. For each batch
# it greps "$tmpdir/infile" with the patterns in "$tmpdir/pattfiles/<nr>",
# drops lines consisting only of "--", and puts the captured output on
# $printqueue. Takes no arguments; the return value is not meaningful.
sub grep {
    # thread-local copies of the shared settings
    my $largs = defined($args) ? $args : '';
    my $ltmp  = $tmpdir;
    while (defined(my $idx = $grepqueue->dequeue())) {
        my $command = "grep $largs -f '$ltmp/pattfiles/$idx' '$ltmp/infile' | grep -v '^--\$'";
        my $out = `$command`;
        # Backticks yield undef when the command could not be started. An
        # undef on the print queue is the printer's stop signal, so it must
        # never be forwarded from here.
        if (!defined($out)) {
            warn("Could not run grep for pattern batch $idx: $!\n");
            next;
        }
        $printqueue->enqueue($out);
    }
    return;
}
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
144
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
# Body of the printer thread.
#
# Appends every non-empty result taken from $printqueue to
# "$tmpdir/result.txt" and stops when it dequeues undef. The file is opened
# once, up front, so it exists even when no result ever arrives. Dies (ending
# this thread) when the file cannot be opened, written or closed. Takes no
# arguments; the return value is not meaningful.
sub printout {
    my $result_file = "$tmpdir/result.txt";
    open(my $out_fh, '>>', $result_file)
        or die("Could not open $result_file for appending: $!");
    while (defined(my $result = $printqueue->dequeue())) {
        # skip batches that produced no output
        next if $result eq '' || $result eq "\n";
        print {$out_fh} $result
            or die("Could not write to $result_file: $!");
    }
    # buffered write errors only surface here
    close($out_fh) or die("Could not close $result_file: $!");
    return;
}
dba6ffec8e2e Uploaded
geert-vandeweyer
parents:
diff changeset
167