Mercurial > repos > nilesh > cmpfastq
view cmpfastq @ 1:54ab39950f59 default tip
first commit
author | nilesh |
---|---|
date | Fri, 12 Jul 2013 14:36:19 -0500 |
parents | |
children |
line wrap: on
line source
use strict; use warnings; use Getopt::Std; use File::Basename; ## CONSTANTS ## my $TRUE = 1; my $FALSE = 0; my $DEBUG = $FALSE; my $EXITSTATUS = 0; # Default umask umask 027; # Define variables # Get the options passed at the command prompt GetOptions(); ############################## # M A I N P R O G R A M # ############################## # Check to see we received two files in the arguments my $fail = $FALSE; # Check to see if the files exist foreach my $file (@ARGV) { if(!-e $file) { print STDERR "File $file didn't exist\n"; $fail = $TRUE; } } # If any of the files didn't exist, let's kill it if($fail) { exit(1); } # Read the file names in from the command line my $file1 = shift(@ARGV); my $file2 = shift(@ARGV); # Index the first file. my %fastqIndex1 = %{IndexFastq($file1)}; # Compare the two files CompareFastq($file1, $file2, \%fastqIndex1); exit($EXITSTATUS); # Subroutines sub Usage { my $base = basename($0); print "Usage: $base [dh] file1 file2\n"; print "\td:\tDebug mode on (default off)\n"; print "\th:\tPrint this usage\n"; } sub GetOptions { # Get the options passed at the command prompt my %options=(); getopts("dh", \%options); if(defined($options{'d'})) { $DEBUG = $TRUE; } if(defined($options{'h'})) { Usage(); exit($EXITSTATUS); } } sub IndexFastq { my $file = shift; my %fastqIndex; open(IN, $file) or die("Could not open $file\n"); my $pos = tell(IN); my $lineCounter = 1; while(my $line = <IN>) { chomp($line); # Each block is going to be of 4 lines # Let's get the seq ID from the sequence name if($line =~ m/^@(.*)#.*/) { $fastqIndex{$1} = $pos; # Skip the next 3 lines for(my $i=0; $i<3; $i++) { <IN>; $lineCounter++; } } elsif($line =~ m/^#/) { print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG); } elsif($line =~ m/^$/) { print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG); } else { print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG); } $pos = tell(IN); $lineCounter++; } close(IN); return \%fastqIndex; } sub CompareFastq { my $file1 = shift; my $file2 = shift; my $fastqIndex1Ref = shift; my %fastqIndex1 = %{$fastqIndex1Ref}; my %found1; # We don't want to have to open/close file handles each time, so let's open them here open(F1COUT, ">$ARGV[0]"); open(F2COUT, ">$ARGV[1]"); open(F1UOUT, ">$ARGV[2]"); open(F2UOUT, ">$ARGV[3]"); open(F1IN, $file1) or die("Could not open $file1\n"); open(F2IN, $file2) or die("Could not open $file2\n"); while(my $line = <F2IN>) { chomp($line); # Skip empty lines or comments if($line =~ m/^$/g or $line =~ m/^\s*#/) { next; } # Each block is going to be of 4 lines # Let's get the seq ID from the sequence name if($line =~ m/^@(.*)#.*/) { my $seqId = $1; if(defined($fastqIndex1{$seqId})) { $found1{$seqId} = $TRUE; # Print out from file1 seek(F1IN, $fastqIndex1{$seqId}, 0); for(my $i=0;$i<4;$i++) { my $tmpLine = <F1IN>; print F1COUT $tmpLine; } # Print out from file 2 print F2COUT $line . "\n"; for(my $i=0; $i<3; $i++) { my $tmpLine = <F2IN>; print F2COUT $tmpLine; } } else { # Print out from file 2 print F2UOUT $line . "\n"; for(my $i=0; $i<3; $i++) { my $tmpLine = <F2IN>; print F2UOUT $tmpLine; } } } else { print STDERR "Could not match the sequence ID from the name: $line\n";; next; } } close(F1COUT); close(F2COUT); close(F2UOUT); close(F2IN); # Now let's worry about the sequences that weren't common in file 1 # File 1 if(keys(%fastqIndex1) != keys(%found1)) { foreach my $seqId (keys %fastqIndex1) { if(!defined($found1{$seqId})) { seek(F1IN, $fastqIndex1{$seqId}, 0); for(my $i=0;$i<4;$i++) { my $tmpLine = <F1IN>; print F1UOUT $tmpLine; } } } } close(F1UOUT); close(F1IN) }