view cmpfastq @ 1:54ab39950f59 default tip

first commit
author nilesh
date Fri, 12 Jul 2013 14:36:19 -0500
parents
children
line wrap: on
line source

use strict;
use warnings;
use Getopt::Std;
use File::Basename;

## CONSTANTS ##
my $TRUE              = 1;
my $FALSE             = 0;
my $DEBUG             = $FALSE;
my $EXITSTATUS        = 0;

# Default umask
umask 027;

# Define variables

# Get the options passed at the command prompt
GetOptions();

##############################
#   M A I N   P R O G R A M  #
##############################

# Check to see we received two files in the arguments

my $fail = $FALSE;

# Check to see if the files exist
foreach my $file (@ARGV)
{
	if(!-e $file)
	{
		print STDERR "File $file didn't exist\n";
		$fail = $TRUE;
	}
}

# If any of the files didn't exist, let's kill it
if($fail)
{
	exit(1);
}

# Read the file names in from the command line
my $file1 = shift(@ARGV);
my $file2 = shift(@ARGV);


# Index the first file.
my %fastqIndex1 = %{IndexFastq($file1)};

# Compare the two files
CompareFastq($file1, $file2, \%fastqIndex1);

exit($EXITSTATUS);

# Subroutines
sub Usage
{
	my $base = basename($0);
	print "Usage: $base [dh] file1 file2\n";
	print "\td:\tDebug mode on (default off)\n";
	print "\th:\tPrint this usage\n";
}

sub GetOptions
{
	# Get the options passed at the command prompt
	my %options=();
	getopts("dh", \%options);

	if(defined($options{'d'}))
	{
		$DEBUG = $TRUE;
	}

	if(defined($options{'h'}))
	{
		Usage();
		exit($EXITSTATUS);
	}
}

sub IndexFastq
{
	my $file = shift;
	my %fastqIndex;

	open(IN, $file) or die("Could not open $file\n");
	my $pos = tell(IN);
	my $lineCounter = 1;
	while(my $line = <IN>)
	{
		chomp($line);

		# Each block is going to be of 4 lines
		# Let's get the seq ID from the sequence name
		if($line =~ m/^@(.*)#.*/)
		{
			$fastqIndex{$1} = $pos;
			# Skip the next 3 lines
			for(my $i=0; $i<3; $i++)
			{
				<IN>;
				$lineCounter++;
			}
		}
		elsif($line =~ m/^#/)
		{
			print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG);
		}
		elsif($line =~ m/^$/)
		{
			print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG);
		}
		else
		{
			print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG);
		}
		$pos = tell(IN);
		$lineCounter++;
	}
	close(IN);

	return \%fastqIndex;
}

sub CompareFastq
{
	my $file1          = shift;
	my $file2          = shift;
	my $fastqIndex1Ref = shift;
	my %fastqIndex1    = %{$fastqIndex1Ref};
	my %found1;

	# We don't want to have to open/close file handles each time, so let's open them here
	open(F1COUT, ">$ARGV[0]");
	open(F2COUT, ">$ARGV[1]");
	open(F1UOUT, ">$ARGV[2]");
	open(F2UOUT, ">$ARGV[3]");

	open(F1IN, $file1) or die("Could not open $file1\n");
	open(F2IN, $file2) or die("Could not open $file2\n");
	while(my $line = <F2IN>)
	{
		chomp($line);

		# Skip empty lines or comments
		if($line =~ m/^$/g or $line =~ m/^\s*#/)
		{
			next;
		}

		# Each block is going to be of 4 lines
		# Let's get the seq ID from the sequence name
		if($line =~ m/^@(.*)#.*/)
		{
			my $seqId = $1;

			if(defined($fastqIndex1{$seqId}))
			{
				$found1{$seqId} = $TRUE;

				# Print out from file1
				seek(F1IN, $fastqIndex1{$seqId}, 0);
				for(my $i=0;$i<4;$i++)
				{
					my $tmpLine = <F1IN>;
					print F1COUT $tmpLine;
				}
				
				# Print out from file 2
				print F2COUT $line . "\n";
				for(my $i=0; $i<3; $i++)
				{
					my $tmpLine = <F2IN>;
                               		print F2COUT $tmpLine;
				}
			}
			else
			{
				# Print out from file 2
				print F2UOUT $line . "\n";
				for(my $i=0; $i<3; $i++)
				{
					my $tmpLine = <F2IN>;
					print F2UOUT $tmpLine;
				}
			}
		}
		else
		{
			print STDERR "Could not match the sequence ID from the name: $line\n";;
			next;
		}
	}
	close(F1COUT);
	close(F2COUT);
	close(F2UOUT);
	close(F2IN);
	# Now let's worry about the sequences that weren't common in file 1

	# File 1
	if(keys(%fastqIndex1) != keys(%found1))
	{
		foreach my $seqId (keys %fastqIndex1)
		{
			if(!defined($found1{$seqId}))
			{
				seek(F1IN, $fastqIndex1{$seqId}, 0);
				for(my $i=0;$i<4;$i++)
				{
					my $tmpLine = <F1IN>;
					print F1UOUT $tmpLine;
				}
			}
		}
	}
	close(F1UOUT);
	close(F1IN)
}