view GrepFile.pl @ 2:36fa1f424923 draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:37:36 -0500
parents dba6ffec8e2e
children 606e24c6fda0
line wrap: on
line source

#!/usr/bin/perl

# load modules
use Getopt::Std;
use threads;
use Thread::Queue;
use threads::shared;

## start of the run; the elapsed time is reported at the end of the script
$now = time ;

# opts
# i : infile
# f : pattern (with -t single) or pattern file (with -t file)
# o : output file
# t : type (file/single)
# I : Insensitive to case
# P : Perl-Based Grep (boolean)
# A : number of extra lines to fetch
getopts('i:f:o:t:PIA:', \%opts) ;

## variables for threads.
my $infile :shared;
my $outfile :shared;
my $args :shared;
my $rand :shared;
my $tmpdir :shared;

## nr of grep threads (if created), one extra is created for printing.
my $nrgrep = 6;


## all options are validated before anything is created on disk,
## so a wrong command line never leaves a scratch directory behind.
if (!defined($opts{'i'})) { die('Input file is mandatory');}
$infile = $opts{'i'};
if (!defined($opts{'o'})) { die('Output file is mandatory');}
$outfile = $opts{'o'};

## type?
if (!defined($opts{'t'})) { die('Pattern source is mandatory (file/single)');}
$type = $opts{'t'};
if ($type ne 'file' && $type ne 'single') {
	die('only "file" and "single" are supported as value of the -t flag');
}

## extra arguments handed to every grep call
$args = '';
if (defined($opts{'I'})) {
	$args .= ' -i';
}
if (defined($opts{'P'})) {
	$args .= " -P";
}
if (defined($opts{'A'})) {
	if ($opts{'A'} =~ m/^\d+$/) {
		$args .= " -A $opts{'A'}";
	}
	else {
		die("Invalid amount of lines to fetch provided (must be integer)");
	}
}

## create tmp location.
## mkdir itself is the existence test: it fails with EEXIST when the name is
## taken, so two concurrent runs can never end up sharing one directory.
my $attempts = 0;
while (1) {
	$rand = int(rand(10000));
	$tmpdir = "/tmp/GrepFile.$rand";
	## private directory: it will hold a copy of the input file
	last if (mkdir($tmpdir, 0700));
	## any other error (or all 10000 names tried) will not go away by retrying
	if (!$!{EEXIST} || ++$attempts >= 10000) {
		die("Could not create temporary directory $tmpdir: $!");
	}
}

## option string for grep; empty when no extra flags were requested
my $grepargs = defined($args) ? $args : '';

## handles of the worker threads; both stay empty for single patterns
my @grepthreads;
my $printthread;

## quote a string so that /bin/sh sees it as exactly one word
my $shellquote = sub {
	my ($word) = @_;
	$word =~ s/'/'\\''/g;
	return "'$word'";
};

## send the end markers and wait for all workers that are still running.
## safe to call more than once, and when no worker was ever started.
my $stopworkers = sub {
	if (@grepthreads) {
		## one end marker per grep thread
		$grepqueue->enqueue(undef) for @grepthreads;
		$_->join() for @grepthreads;
		@grepthreads = ();
	}
	if (defined($printthread)) {
		## the grep threads are done, so nothing more will be queued for printing
		$printqueue->enqueue(undef);
		$printthread->join();
		$printthread = undef;
	}
};

## stop the workers, remove the scratch directory and die with the given message
my $abort = sub {
	my ($message) = @_;
	$stopworkers->();
	system('rm', '-Rf', $tmpdir);
	die($message);
};

## refuse unusable input before any work is started
if ($type ne 'file' && $type ne 'single') {
	$abort->('only "file" and "single" are supported as value of the -t flag');
}
if (!defined($opts{'f'})) {
	$abort->('Pattern (-t single) or pattern file (-t file) is mandatory');
}

if ($type eq 'single') {
	## single pattern => direct processing, no threads involved
	$patt = $opts{'f'};
	## -e keeps a pattern that starts with a dash from being read as an option
	my $command = "grep $grepargs -e " . $shellquote->($patt) . ' ' . $shellquote->($infile)
		. ' > ' . $shellquote->("$tmpdir/result.txt");
	my $status = system($command);
	## grep exits with 1 when nothing matched; only higher codes are errors
	if ($status == -1 || ($status & 127) || ($status >> 8) > 1) {
		$abort->("grep failed on input file '$infile'");
	}
}
else {
	$pattfile = $opts{'f'};
	## copy infile to local system for speed
	if (system('cp', $infile, "$tmpdir/infile") != 0) {
		$abort->("Could not copy input file '$infile' to $tmpdir");
	}
	if (!mkdir("$tmpdir/pattfiles")) {
		$abort->("Could not create $tmpdir/pattfiles: $!");
	}

	## only threads for file based patterns.
	## they are started before the patterns are read, so the pattern list
	## is not copied into every thread.
	$grepqueue = Thread::Queue->new();
	$printqueue = Thread::Queue->new();
	for my $n (1 .. $nrgrep) {
		my $worker = threads->create('grep');
		if (!defined($worker)) {
			$abort->("Could not start grep thread $n");
		}
		push(@grepthreads, $worker);
	}
	$printthread = threads->create('printout');
	if (!defined($printthread)) {
		$abort->('Could not start print thread');
	}

	## make sure patterns are unique
	my %pats;
	my $pattfh;
	if (!open($pattfh, '<', $pattfile)) {
		$abort->("Could not read pattern file '$pattfile': $!");
	}
	while (my $line = <$pattfh>) {
		chomp($line);
		$pats{$line} = 1;
	}
	close($pattfh);

	## run in batches of 100 patterns: every batch becomes one pattern file
	## whose number is queued for the grep threads.
	my @patterns = keys(%pats);
	$idx = 0;
	while (my @subset = splice(@patterns,0,100)) {
		$idx++;
		my $batchfh;
		if (!open($batchfh, '>', "$tmpdir/pattfiles/$idx")) {
			$abort->("Could not write $tmpdir/pattfiles/$idx: $!");
		}
		print $batchfh join("\n",@subset);
		if (!close($batchfh)) {
			$abort->("Could not write $tmpdir/pattfiles/$idx: $!");
		}
		$grepqueue->enqueue($idx);
	}

	## all batches are queued: let the workers finish and wait for them
	$stopworkers->();
}

## the print thread only writes result.txt when there is output,
## so without any match an empty result is delivered instead of no file at all.
if (!-e "$tmpdir/result.txt") {
	my $emptyfh;
	if (!open($emptyfh, '>', "$tmpdir/result.txt")) {
		$abort->("Could not create $tmpdir/result.txt: $!");
	}
	close($emptyfh);
}

if (system('cp', "$tmpdir/result.txt", $outfile) != 0) {
	$abort->("Could not write output file '$outfile'");
}

system('rm', '-Rf', $tmpdir);

##################
# PRINT RUN-TIME #
##################
## $now held the start time so far; from here on it is the elapsed number of seconds
$now = time - $now;
my $hours = int($now/3600);
my $minutes = int(($now % 3600)/60);
my $seconds = int($now % 60);
printf("\n\nRunning time:%02d:%02d:%02d\n", $hours, $minutes, $seconds);

## Worker run by every grep thread.
## Takes batch numbers from $grepqueue until the undef end marker arrives,
## greps the local copy of the input file with the pattern file of that batch
## and hands the resulting text to $printqueue.
sub grep {
	#local copies of the shared settings
	my $largs = defined($args) ? $args : '';
	my $ltmp = $tmpdir;
	## GNU grep writes '--' between groups only when context lines are requested.
	## Filtering without -A would drop genuine input lines that consist of '--'.
	my $filter = ($largs =~ m/(?:^|\s)-A\s/) ? " | grep -v '^--\$'" : '';
	while (defined(my $idx = $grepqueue->dequeue())) {
		my $command = "grep $largs -f '$ltmp/pattfiles/$idx' '$ltmp/infile'$filter";
		my $out = `$command`;
		## backticks give undef when the command could not be started.
		## undef is the end marker of the print queue, so queuing it would stop
		## the print thread while other batches are still being processed.
		if (!defined($out)) {
			warn("Could not run '$command': $!");
			next;
		}
		$printqueue->enqueue($out);
	}
}

## Body of the print thread.
## Collects the results taken from $printqueue and appends them to
## $tmpdir/result.txt, writing once more than 50 non-empty results are
## buffered and once more at the end for whatever is left.
## Stops when the undef end marker is dequeued.
sub printout {
	## text waiting to be written, and the number of results it holds
	my $pending = '';
	my $chunks = 0;

	## append the buffered text to the result file
	my $flush = sub {
		open(my $fh, ">>$tmpdir/result.txt");
		print $fh $pending;
		close($fh);
	};

	while (1) {
		my $result = $printqueue->dequeue();
		last if (!defined($result));
		## results without content are not counted
		unless ($result eq '' || $result eq "\n") {
			$pending .= $result;
			$chunks++;
		}
		next if ($chunks <= 50);
		$flush->();
		$pending = '';
		$chunks = 0;
	}

	## write what is left after the end marker
	$flush->() if ($pending ne '');
}