changeset 1:54ab39950f59 default tip

first commit
author nilesh
date Fri, 12 Jul 2013 14:36:19 -0500 (2013-07-12)
parents 177470e28e5f
children
files cmpfastq cmpfastq.xml
diffstat 2 files changed, 254 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpfastq	Fri Jul 12 14:36:19 2013 -0500
@@ -0,0 +1,221 @@
+use strict;
+use warnings;
+use Getopt::Std;
+use File::Basename;
+
+## CONSTANTS ##
+my $TRUE              = 1;
+my $FALSE             = 0;
+my $DEBUG             = $FALSE;
+my $EXITSTATUS        = 0;
+
+# Default umask
+umask 027;
+
+# Define variables
+
+# Get the options passed at the command prompt
+GetOptions();
+
+##############################
+#   M A I N   P R O G R A M  #
+##############################
+
+# Check to see we received two files in the arguments
+
+my $fail = $FALSE;
+
+# Check to see if the files exist
+foreach my $file (@ARGV)
+{
+	if(!-e $file)
+	{
+		print STDERR "File $file didn't exist\n";
+		$fail = $TRUE;
+	}
+}
+
+# If any of the files didn't exist, let's kill it
+if($fail)
+{
+	exit(1);
+}
+
+# Read the file names in from the command line
+my $file1 = shift(@ARGV);
+my $file2 = shift(@ARGV);
+
+
+# Index the first file.
+my %fastqIndex1 = %{IndexFastq($file1)};
+
+# Compare the two files
+CompareFastq($file1, $file2, \%fastqIndex1);
+
+exit($EXITSTATUS);
+
+# Subroutines
+sub Usage
+{
+	my $base = basename($0);
+	print "Usage: $base [dh] file1 file2\n";
+	print "\td:\tDebug mode on (default off)\n";
+	print "\th:\tPrint this usage\n";
+}
+
+sub GetOptions
+{
+	# Get the options passed at the command prompt
+	my %options=();
+	getopts("dh", \%options);
+
+	if(defined($options{'d'}))
+	{
+		$DEBUG = $TRUE;
+	}
+
+	if(defined($options{'h'}))
+	{
+		Usage();
+		exit($EXITSTATUS);
+	}
+}
+
+sub IndexFastq
+{
+	my $file = shift;
+	my %fastqIndex;
+
+	open(IN, $file) or die("Could not open $file\n");
+	my $pos = tell(IN);
+	my $lineCounter = 1;
+	while(my $line = <IN>)
+	{
+		chomp($line);
+
+		# Each block is going to be of 4 lines
+		# Let's get the seq ID from the sequence name
+		if($line =~ m/^@(.*)#.*/)
+		{
+			$fastqIndex{$1} = $pos;
+			# Skip the next 3 lines
+			for(my $i=0; $i<3; $i++)
+			{
+				<IN>;
+				$lineCounter++;
+			}
+		}
+		elsif($line =~ m/^#/)
+		{
+			print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG);
+		}
+		elsif($line =~ m/^$/)
+		{
+			print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG);
+		}
+		else
+		{
+			print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG);
+		}
+		$pos = tell(IN);
+		$lineCounter++;
+	}
+	close(IN);
+
+	return \%fastqIndex;
+}
+
+sub CompareFastq
+{
+	my $file1          = shift;
+	my $file2          = shift;
+	my $fastqIndex1Ref = shift;
+	my %fastqIndex1    = %{$fastqIndex1Ref};
+	my %found1;
+
+	# We don't want to have to open/close file handles each time, so let's open them here
+	open(F1COUT, ">$ARGV[0]");
+	open(F2COUT, ">$ARGV[1]");
+	open(F1UOUT, ">$ARGV[2]");
+	open(F2UOUT, ">$ARGV[3]");
+
+	open(F1IN, $file1) or die("Could not open $file1\n");
+	open(F2IN, $file2) or die("Could not open $file2\n");
+	while(my $line = <F2IN>)
+	{
+		chomp($line);
+
+		# Skip empty lines or comments
+		if($line =~ m/^$/g or $line =~ m/^\s*#/)
+		{
+			next;
+		}
+
+		# Each block is going to be of 4 lines
+		# Let's get the seq ID from the sequence name
+		if($line =~ m/^@(.*)#.*/)
+		{
+			my $seqId = $1;
+
+			if(defined($fastqIndex1{$seqId}))
+			{
+				$found1{$seqId} = $TRUE;
+
+				# Print out from file1
+				seek(F1IN, $fastqIndex1{$seqId}, 0);
+				for(my $i=0;$i<4;$i++)
+				{
+					my $tmpLine = <F1IN>;
+					print F1COUT $tmpLine;
+				}
+				
+				# Print out from file 2
+				print F2COUT $line . "\n";
+				for(my $i=0; $i<3; $i++)
+				{
+					my $tmpLine = <F2IN>;
+                               		print F2COUT $tmpLine;
+				}
+			}
+			else
+			{
+				# Print out from file 2
+				print F2UOUT $line . "\n";
+				for(my $i=0; $i<3; $i++)
+				{
+					my $tmpLine = <F2IN>;
+					print F2UOUT $tmpLine;
+				}
+			}
+		}
+		else
+		{
+			print STDERR "Could not match the sequence ID from the name: $line\n";;
+			next;
+		}
+	}
+	close(F1COUT);
+	close(F2COUT);
+	close(F2UOUT);
+	close(F2IN);
+	# Now let's worry about the sequences that weren't common in file 1
+
+	# File 1
+	if(keys(%fastqIndex1) != keys(%found1))
+	{
+		foreach my $seqId (keys %fastqIndex1)
+		{
+			if(!defined($found1{$seqId}))
+			{
+				seek(F1IN, $fastqIndex1{$seqId}, 0);
+				for(my $i=0;$i<4;$i++)
+				{
+					my $tmpLine = <F1IN>;
+					print F1UOUT $tmpLine;
+				}
+			}
+		}
+	}
+	close(F1UOUT);
+	close(F1IN)
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cmpfastq.xml	Fri Jul 12 14:36:19 2013 -0500
@@ -0,0 +1,33 @@
+<tool id="cmpfastq" name="Compare two fast files">
+  <description>Comparing two fastq files</description>
+  <requirements>
+    <requirement type="set_environment">CFQ_SCRIPT_PATH</requirement>
+  </requirements>
+  <command>\$CFQ_SCRIPT_PATH/cmpfastq $input1 $input2 $output1 $output2 $output3 $output4</command>
+  <inputs>
+    <param name="input1" type="data"  format="fastq" label="Source file 1"/>
+    <param name="input2" type="data" format="fastq" label="Source file 2"/>
+    <!--<param name="input3" type="select" label="Option">
+        <option value="d">d</option>
+        <option value="h ">h</option>
+     </param> -->
+  </inputs>
+  <outputs>
+    <data format="txt" name="output1" label="Common file 1"/>
+    <data format="txt" name="output2" label="Common file 2"/>
+    <data format="txt" name="output3" label="Unique file 1"/>
+    <data format="txt" name="output4" label="Unique file 2"/>
+  </outputs>
+
+  <tests>
+    <test>
+     <!-- <param name="input" value="fa_gc_content_input.fa"/>
+      <output name="out_file1" file="fa_gc_content_output.txt"/> -->
+    </test> 
+  </tests>
+
+  <help>
+This tool computes GC content from a FASTA file.
+  </help>
+
+</tool>
\ No newline at end of file