Mercurial > repos > geert-vandeweyer > advanced_grep_from_file
diff GrepFile.pl @ 1:dba6ffec8e2e draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:37:30 -0500 |
parents | |
children | 606e24c6fda0 |
line wrap: on
line diff
#!/usr/bin/perl

# GrepFile.pl
#
# Run grep on an input file, either for one pattern given on the command
# line or for every pattern listed in a pattern file. In pattern-file mode
# the unique patterns are split into batches that are grepped in parallel by
# a pool of worker threads, while one extra thread writes the results.
#
# Notes on pattern-file mode (inherent to the batched design):
#   - output order follows batch completion, not input order;
#   - a line matching patterns from different batches is reported once per
#     batch.

use strict;
use warnings;

# load modules
use Getopt::Std;
use threads;
use Thread::Queue;
use threads::shared;
use File::Copy qw(copy);
use File::Path qw(remove_tree);
use File::Temp qw(tempdir);

my $start_time = time;

# opts
# i : infile
# f : pattern (type 'single') or pattern file (type 'file')
# o : output file
# t : type (file/single)
# I : Insensitive to case
# P : Perl-Based Grep (boolean)
# A : number of extra lines to fetch
my %opts;
getopts('i:f:o:t:PIA:', \%opts);

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;

## nr of patterns handed to a single grep call.
my $batch_size = 100;

## nr of grep invocations that failed; incremented by the worker threads too.
my $grep_failures :shared = 0;

## Validate every option before anything is created on disk, so that a usage
## error never leaves a stray temporary directory behind.
if (!defined($opts{'i'})) { die('Input file is mandatory'); }
my $infile = $opts{'i'};
if (!defined($opts{'o'})) { die('Output file is mandatory'); }
my $outfile = $opts{'o'};

## Options passed on to grep, kept as a list so no shell quoting is needed.
my @grep_opts;
push(@grep_opts, '-i') if $opts{'I'};
push(@grep_opts, '-P') if $opts{'P'};
if (defined($opts{'A'})) {
    if ($opts{'A'} =~ m/^\d+$/) {
        push(@grep_opts, '-A', $opts{'A'});
    }
    else {
        die("Invalid amount of lines to fetch provided (must be integer)");
    }
}

if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)'); }
my $type = $opts{'t'};
if ($type ne 'file' && $type ne 'single') {
    die('only "file" and "single" are supported as value of the -t flag');
}
if (!defined($opts{'f'})) { die('Pattern (or pattern file) is mandatory'); }

## create tmp location; tempdir() picks a unique name and creates it atomically.
my $tmpdir      = tempdir('GrepFile.XXXXXX', TMPDIR => 1);
my $result_file = "$tmpdir/result.txt";

## Run the search inside eval so the temporary directory is removed even
## when something fails half-way.
my $completed = eval {
    if ($type eq 'single') {
        ## single pattern => direct processing, no threads involved
        grep_single($opts{'f'}, $infile, $result_file, @grep_opts);
    }
    else {
        ## grep prints "--" between groups only when context lines are requested
        my $drop_separators = defined($opts{'A'}) ? 1 : 0;
        grep_from_file($opts{'f'}, $infile, $tmpdir, $result_file,
            $drop_separators, @grep_opts);
    }
    copy($result_file, $outfile)
        or die("Cannot copy results to '$outfile': $!\n");
    1;
};
my $failure = $@;
remove_tree($tmpdir);
if (!$completed) {
    die($failure);
}

##################
# PRINT RUN-TIME #
##################
my $elapsed = time - $start_time;
printf("\n\nRunning time:%02d:%02d:%02d\n",int($elapsed/3600),int(($elapsed % 3600)/60),int($elapsed % 60));

## The output file has been written, but signal that it may be incomplete.
exit(1) if $grep_failures > 0;
exit(0);

# Run grep with the given argument list (no shell involved) and hand every
# output line to the $sink code reference.
# Returns 1 when grep exited with status 0 (match) or 1 (no match), and 0
# when it was killed by a signal or exited with a higher status (error).
# Dies when grep cannot be started.
sub run_grep {
    my ($sink, @grep_args) = @_;
    open(my $pipe, '-|', 'grep', @grep_args)
        or die("Cannot start grep: $!\n");
    while (my $line = <$pipe>) {
        $sink->($line);
    }
    # close() returns false for every non-zero exit status, including the
    # harmless "no match" status 1, so inspect $? instead of its return value.
    close($pipe);
    my $wait_status = $?;
    if (($wait_status & 127) || (($wait_status >> 8) > 1)) {
        return 0;
    }
    return 1;
}

# Grep $infile for one $pattern and write the matching lines to $result_file.
# The result file is always created, even when nothing matches.
sub grep_single {
    my ($pattern, $infile, $result_file, @grep_opts) = @_;
    open(my $out, '>', $result_file)
        or die("Cannot write '$result_file': $!\n");
    # -e keeps a pattern starting with '-' from being read as an option,
    # -- does the same for the file name.
    my $ok = run_grep(sub { print {$out} $_[0]; },
        @grep_opts, '-e', $pattern, '--', $infile);
    close($out) or die("Cannot write '$result_file': $!\n");
    if (!$ok) {
        warn("grep failed for the provided pattern\n");
        lock($grep_failures);
        $grep_failures++;
    }
    return;
}

# Read the pattern file and return its unique, non-empty lines in order of
# first appearance. Blank lines are skipped because an empty pattern makes
# grep match every line of the input.
sub read_patterns {
    my ($pattfile) = @_;
    open(my $in, '<', $pattfile)
        or die("Cannot read pattern file '$pattfile': $!\n");
    my %seen;
    my @patterns;
    while (my $line = <$in>) {
        chomp($line);
        next if $line eq '';
        next if $seen{$line}++;
        push(@patterns, $line);
    }
    close($in);
    return @patterns;
}

# Grep $infile for all patterns listed in $pattfile, using $nrgrep worker
# threads plus one printing thread, and collect the output in $result_file.
# All preparation that can fail (copying, writing batch files) is done before
# any thread is started, so a failure never leaves threads waiting on a queue.
sub grep_from_file {
    my ($pattfile, $infile, $tmpdir, $result_file, $drop_separators, @grep_opts) = @_;

    my @patterns = read_patterns($pattfile);

    ## copy infile to local system for speed
    my $local_infile = "$tmpdir/infile";
    copy($infile, $local_infile)
        or die("Cannot copy '$infile' to '$local_infile': $!\n");

    ## write the patterns out in batches of $batch_size
    my $pattdir = "$tmpdir/pattfiles";
    mkdir($pattdir) or die("Cannot create '$pattdir': $!\n");
    my $nr_batches = 0;
    while (my @subset = splice(@patterns, 0, $batch_size)) {
        $nr_batches++;
        my $batch_file = "$pattdir/$nr_batches";
        open(my $out, '>', $batch_file)
            or die("Cannot write '$batch_file': $!\n");
        print {$out} map { "$_\n" } @subset;
        close($out) or die("Cannot write '$batch_file': $!\n");
    }

    ## start the threads, then feed them
    my $grepqueue  = Thread::Queue->new();
    my $printqueue = Thread::Queue->new();
    my @workers;
    for my $nr (1 .. $nrgrep) {
        my $worker = threads->create(\&grep_worker, $grepqueue, $printqueue,
            $pattdir, $local_infile, $drop_separators, @grep_opts);
        defined($worker) or die("Cannot create grep thread $nr\n");
        push(@workers, $worker);
    }
    my $printer = threads->create(\&printout, $printqueue, $result_file);
    defined($printer) or die("Cannot create printing thread\n");

    $grepqueue->enqueue($_) for (1 .. $nr_batches);
    ## one undef per worker tells it that no more batches will follow
    $grepqueue->enqueue(undef) for (1 .. $nrgrep);

    $_->join() for @workers;
    ## all results are queued by now, so the printer may stop as well
    $printqueue->enqueue(undef);
    my $printed = $printer->join();
    if (!$printed) {
        die("Writing results to '$result_file' failed\n");
    }
    return;
}

# Worker thread: take batch numbers from $grepqueue until undef arrives, grep
# $infile with the matching pattern file from $pattdir and queue the output
# on $printqueue. When $drop_separators is true, the "--" lines grep prints
# between groups of context lines are removed.
sub grep_worker {
    my ($grepqueue, $printqueue, $pattdir, $infile, $drop_separators, @grep_opts) = @_;
    while (defined(my $idx = $grepqueue->dequeue())) {
        my $out = '';
        my $ok = eval {
            run_grep(
                sub {
                    return if $drop_separators && $_[0] =~ m/\A--\n?\z/;
                    $out .= $_[0];
                },
                @grep_opts, '-f', "$pattdir/$idx", '--', $infile
            );
        };
        if (!$ok) {
            warn("grep failed for pattern batch $idx" . ($@ ? ": $@" : "\n"));
            lock($grep_failures);
            $grep_failures++;
        }
        $printqueue->enqueue($out) if $out ne '';
    }
    return;
}

# Printing thread: append every result taken from $printqueue to $result_file
# until undef arrives. The file is created up front, so it exists even when
# no pattern matched. Returns 1 on success; dies on I/O errors, which the
# caller sees as a false value from join().
sub printout {
    my ($printqueue, $result_file) = @_;
    open(my $out, '>', $result_file)
        or die("Cannot write '$result_file': $!\n");
    while (defined(my $result = $printqueue->dequeue())) {
        next if $result eq '';
        print {$out} $result
            or die("Cannot write '$result_file': $!\n");
    }
    close($out) or die("Cannot write '$result_file': $!\n");
    return 1;
}