comparison GrepFile.pl @ 1:dba6ffec8e2e draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:37:30 -0500
parents
children 606e24c6fda0
comparison
equal deleted inserted replaced
0:fd6b71c81011 1:dba6ffec8e2e
1 #!/usr/bin/perl
2
3 # load modules
4 use Getopt::Std;
5 use threads;
6 use Thread::Queue;
7 use threads::shared;
8
# Start of the run; used at the end of the script to report the running time.
$now = time;

# Command line options:
#   -i : input file to search (mandatory)
#   -f : the pattern itself (-t single) or a file with one pattern per line (-t file)
#   -o : output file (mandatory)
#   -t : pattern source type (file/single)
#   -I : insensitive to case (boolean)
#   -P : Perl-based grep (boolean, handed to grep as -P)
#   -A : number of extra lines to fetch after each match
getopts('i:f:o:t:PIA:', \%opts);

## variables for threads.
my $infile :shared;
my $outfile :shared;
my $args :shared;
my $rand :shared;
my $tmpdir :shared;

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;

## infile && outfile check
if (!defined($opts{'i'})) {
    die('Input file is mandatory');
}
$infile = $opts{'i'};
if (!defined($opts{'o'})) {
    die('Output file is mandatory');
}
$outfile = $opts{'o'};

## Both pattern modes read -f, so refuse to start without it instead of
## silently searching for nothing (file mode) or for everything (single mode).
if (!defined($opts{'f'})) {
    die('Pattern (or pattern file) is mandatory');
}

## Extra switches handed to grep, as one space separated string.
## Start from an empty string so later interpolation never sees undef.
$args = '';
if (defined($opts{'I'})) {
    $args .= ' -i';
}
if (defined($opts{'P'})) {
    $args .= ' -P';
}
if (defined($opts{'A'})) {
    # \A...\z instead of ^...$ : the latter accepts a trailing newline,
    # which would end up inside a shell command further down.
    if ($opts{'A'} =~ m/\A\d+\z/) {
        $args .= " -A $opts{'A'}";
    }
    else {
        die("Invalid amount of lines to fetch provided (must be integer)");
    }
}
53
## create tmp location & outfile.
# mkdir either creates the directory or fails, so it doubles as the
# "name is free" test. Checking with -d first and creating afterwards leaves
# a window in which another process can grab the same name.
my $tmp_attempts = 0;
while (1) {
    $rand   = int(rand(10000));
    $tmpdir = "/tmp/GrepFile.$rand";
    # 0700: the directory holds a copy of the input, keep it private.
    last if (mkdir($tmpdir, 0700));
    # Only a name collision is worth another try; anything else (permissions,
    # full disk, ...) will not go away by picking a different number.
    if (!$!{EEXIST} || ++$tmp_attempts >= 1000) {
        die("Could not create temporary directory $tmpdir: $!");
    }
}
61
62
## type?
# The temporary directory already exists (and is still empty) at this point,
# so remove it again when the run cannot start. The value is validated here,
# before any thread is created, rather than after the fact.
$type = $opts{'t'};
if (!defined($type)) {
    rmdir($tmpdir) if (defined($tmpdir) && -d $tmpdir);
    die('Pattern source is mandatory (file/single)');
}
if ($type ne 'file' && $type ne 'single') {
    rmdir($tmpdir) if (defined($tmpdir) && -d $tmpdir);
    die('only "file" and "single" are supported as value of the -t flag');
}

## only threads for file based patterns
if ($type eq 'file') {
    # Queues must exist before the threads start: each thread works on the
    # queue objects that are visible at creation time.
    $grepqueue  = Thread::Queue->new();
    $printqueue = Thread::Queue->new();
    # Worker handles are kept in $grep1 .. $grepN; the join loop at the end
    # of the script looks them up under the same names.
    foreach my $n (1 .. $nrgrep) {
        ${"grep$n"} = threads->create('grep');
    }
    $print = threads->create('printout');
}
76
## single pattern => direct processing
if ($type eq 'single') {
    $patt = $opts{'f'};
    # $args is a space separated list of switches built from validated options.
    my @grepargs = defined($args) ? split(' ', $args) : ();
    open(my $result_fh, '>', "$tmpdir/result.txt")
        or die("Could not create $tmpdir/result.txt: $!");
    # List-form pipe open: no shell is involved, so quotes or blanks in the
    # pattern or the file name cannot break (or extend) the command.
    # -e keeps a pattern starting with '-' from being read as a switch,
    # -- does the same for the file name.
    open(my $grep_fh, '-|', 'grep', @grepargs, '-e', $patt, '--', $infile)
        or die("Could not run grep: $!");
    while (my $line = <$grep_fh>) {
        print {$result_fh} $line;
    }
    # grep exits with 1 when nothing matched, so a false close is not an error.
    close($grep_fh);
    close($result_fh) or die("Could not write $tmpdir/result.txt: $!");
}
elsif ($type eq 'file') {
    $pattfile = $opts{'f'};
    ## make sure patterns are unique
    open(my $pattern_fh, '<', $pattfile)
        or die("Could not read pattern file '$pattfile': $!");
    my %pats;
    while (my $pattern = <$pattern_fh>) {
        chomp($pattern);
        # An empty pattern matches every line of the input; a stray blank
        # line in the pattern file would turn the run into a plain copy.
        next if ($pattern eq '');
        $pats{$pattern} = 1;
    }
    close($pattern_fh);
    ## copy infile to local system for speed
    system('cp', $infile, "$tmpdir/infile") == 0
        or die("Could not copy '$infile' to $tmpdir/infile");
    mkdir("$tmpdir/pattfiles")
        or die("Could not create $tmpdir/pattfiles: $!");
    ## run in batches of 100 patterns.
    my @patterns = keys(%pats);
    my $batch = 0;
    # splice returns an empty list once @patterns is used up, ending the loop.
    while (my @subset = splice(@patterns, 0, 100)) {
        $batch++;
        open(my $batch_fh, '>', "$tmpdir/pattfiles/$batch")
            or die("Could not create $tmpdir/pattfiles/$batch: $!");
        print {$batch_fh} join("\n", @subset);
        close($batch_fh)
            or die("Could not write $tmpdir/pattfiles/$batch: $!");
        # The workers receive the batch number and build the file name from it.
        $grepqueue->enqueue($batch);
    }
    # One end marker per worker thread: each worker stops at the first undef.
    foreach (1 .. $nrgrep) {
        $grepqueue->enqueue(undef);
    }
}
else {
    # Nothing was started for an unknown type; the temporary directory is
    # still empty, so it can simply be removed before giving up.
    rmdir($tmpdir) if (defined($tmpdir) && -d $tmpdir);
    die('only "file" and "single" are supported as value of the -t flag');
}
116
# Worker threads, the printer thread and the queues only exist for file based
# patterns. Joining them unconditionally makes a single-pattern run die on an
# undefined thread handle before its result is ever copied.
if ($type eq 'file') {
    foreach my $n (1 .. $nrgrep) {
        ${"grep$n"}->join();
    }
    # All workers are done, nothing more will be queued: tell the printer.
    $printqueue->enqueue(undef);
    $print->join();
}

# No match at all can leave the result file missing; deliver an empty output
# file in that case, just like grep itself prints nothing.
if (!-e "$tmpdir/result.txt") {
    open(my $empty_fh, '>', "$tmpdir/result.txt")
        or die("Could not create $tmpdir/result.txt: $!");
    close($empty_fh);
}

# List form: no shell, so quotes or blanks in the output name are harmless.
my $copy_status = system('cp', "$tmpdir/result.txt", $outfile);

# Clean up first, report a failed copy afterwards, so the temporary
# directory never stays behind.
system('rm', '-Rf', $tmpdir);
if ($copy_status != 0) {
    die("Could not copy the results to '$outfile'");
}

##################
# PRINT RUN-TIME #
##################
$now = time - $now;
printf("\n\nRunning time:%02d:%02d:%02d\n", int($now / 3600), int(($now % 3600) / 60), int($now % 60));
132
# Worker thread body.
# Takes batch numbers from $grepqueue until an undef end marker arrives, runs
# the system grep with pattern file <tmpdir>/pattfiles/<number> against the
# local copy of the input, and puts the collected output on $printqueue.
# Takes no arguments and returns nothing useful.
sub grep {
    # local copies of the shared settings
    my $largs = defined($args) ? $args : '';
    my $ltmp  = $tmpdir;
    # grep only prints "--" group separators when context lines (-A) are
    # requested. Filtering them unconditionally would also drop genuine
    # matching lines that consist of "--".
    my $filter = ($largs =~ m/(?:^|\s)-A\s/) ? " | grep -v '^--\$'" : '';
    while (defined(my $idx = $grepqueue->dequeue())) {
        my $command = "grep $largs -f '$ltmp/pattfiles/$idx' '$ltmp/infile'$filter";
        my $out = `$command`;
        # Backticks give undef when the command could not be started; an
        # undef on the print queue would be read as the end-of-data marker.
        $out = '' if (!defined($out));
        $printqueue->enqueue($out);
    }
    return;
}
144
# Printer thread body.
# Appends every non-empty result taken from $printqueue to
# <tmpdir>/result.txt until an undef end marker arrives.
# Takes no arguments and returns nothing useful.
sub printout {
    my $resultfile = "$tmpdir/result.txt";
    # Opened before the first result arrives, so the file exists even when
    # no pattern matched; the main script copies it unconditionally.
    open(my $out_fh, '>>', $resultfile)
        or die("Could not open $resultfile: $!");
    while (defined(my $result = $printqueue->dequeue())) {
        # Batches without any match come back empty (or as a lone newline).
        next if ($result eq '' || $result eq "\n");
        print {$out_fh} $result;
    }
    # Buffered write errors only show up at close.
    close($out_fh) or die("Could not write $resultfile: $!");
    return;
}
167