Mercurial > repos > nilesh > cmpfastq
changeset 1:54ab39950f59 default tip
first commit
author | nilesh |
---|---|
date | Fri, 12 Jul 2013 14:36:19 -0500 (2013-07-12) |
parents | 177470e28e5f |
children | |
files | cmpfastq cmpfastq.xml |
diffstat | 2 files changed, 254 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpfastq Fri Jul 12 14:36:19 2013 -0500 @@ -0,0 +1,221 @@ +use strict; +use warnings; +use Getopt::Std; +use File::Basename; + +## CONSTANTS ## +my $TRUE = 1; +my $FALSE = 0; +my $DEBUG = $FALSE; +my $EXITSTATUS = 0; + +# Default umask +umask 027; + +# Define variables + +# Get the options passed at the command prompt +GetOptions(); + +############################## +# M A I N P R O G R A M # +############################## + +# Check to see we received two files in the arguments + +my $fail = $FALSE; + +# Check to see if the files exist +foreach my $file (@ARGV) +{ + if(!-e $file) + { + print STDERR "File $file didn't exist\n"; + $fail = $TRUE; + } +} + +# If any of the files didn't exist, let's kill it +if($fail) +{ + exit(1); +} + +# Read the file names in from the command line +my $file1 = shift(@ARGV); +my $file2 = shift(@ARGV); + + +# Index the first file. +my %fastqIndex1 = %{IndexFastq($file1)}; + +# Compare the two files +CompareFastq($file1, $file2, \%fastqIndex1); + +exit($EXITSTATUS); + +# Subroutines +sub Usage +{ + my $base = basename($0); + print "Usage: $base [dh] file1 file2\n"; + print "\td:\tDebug mode on (default off)\n"; + print "\th:\tPrint this usage\n"; +} + +sub GetOptions +{ + # Get the options passed at the command prompt + my %options=(); + getopts("dh", \%options); + + if(defined($options{'d'})) + { + $DEBUG = $TRUE; + } + + if(defined($options{'h'})) + { + Usage(); + exit($EXITSTATUS); + } +} + +sub IndexFastq +{ + my $file = shift; + my %fastqIndex; + + open(IN, $file) or die("Could not open $file\n"); + my $pos = tell(IN); + my $lineCounter = 1; + while(my $line = <IN>) + { + chomp($line); + + # Each block is going to be of 4 lines + # Let's get the seq ID from the sequence name + if($line =~ m/^@(.*)#.*/) + { + $fastqIndex{$1} = $pos; + # Skip the next 3 lines + for(my $i=0; $i<3; $i++) + { + <IN>; + $lineCounter++; + } + } + elsif($line =~ m/^#/) + { + print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG); + } + elsif($line =~ m/^$/) + { + print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG); + } + else + { + print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG); + } + $pos = tell(IN); + $lineCounter++; + } + close(IN); + + return \%fastqIndex; +} + +sub CompareFastq +{ + my $file1 = shift; + my $file2 = shift; + my $fastqIndex1Ref = shift; + my %fastqIndex1 = %{$fastqIndex1Ref}; + my %found1; + + # We don't want to have to open/close file handles each time, so let's open them here + open(F1COUT, ">$ARGV[0]"); + open(F2COUT, ">$ARGV[1]"); + open(F1UOUT, ">$ARGV[2]"); + open(F2UOUT, ">$ARGV[3]"); + + open(F1IN, $file1) or die("Could not open $file1\n"); + open(F2IN, $file2) or die("Could not open $file2\n"); + while(my $line = <F2IN>) + { + chomp($line); + + # Skip empty lines or comments + if($line =~ m/^$/g or $line =~ m/^\s*#/) + { + next; + } + + # Each block is going to be of 4 lines + # Let's get the seq ID from the sequence name + if($line =~ m/^@(.*)#.*/) + { + my $seqId = $1; + + if(defined($fastqIndex1{$seqId})) + { + $found1{$seqId} = $TRUE; + + # Print out from file1 + seek(F1IN, $fastqIndex1{$seqId}, 0); + for(my $i=0;$i<4;$i++) + { + my $tmpLine = <F1IN>; + print F1COUT $tmpLine; + } + + # Print out from file 2 + print F2COUT $line . "\n"; + for(my $i=0; $i<3; $i++) + { + my $tmpLine = <F2IN>; + print F2COUT $tmpLine; + } + } + else + { + # Print out from file 2 + print F2UOUT $line . "\n"; + for(my $i=0; $i<3; $i++) + { + my $tmpLine = <F2IN>; + print F2UOUT $tmpLine; + } + } + } + else + { + print STDERR "Could not match the sequence ID from the name: $line\n";; + next; + } + } + close(F1COUT); + close(F2COUT); + close(F2UOUT); + close(F2IN); + # Now let's worry about the sequences that weren't common in file 1 + + # File 1 + if(keys(%fastqIndex1) != keys(%found1)) + { + foreach my $seqId (keys %fastqIndex1) + { + if(!defined($found1{$seqId})) + { + seek(F1IN, $fastqIndex1{$seqId}, 0); + for(my $i=0;$i<4;$i++) + { + my $tmpLine = <F1IN>; + print F1UOUT $tmpLine; + } + } + } + } + close(F1UOUT); + close(F1IN) +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpfastq.xml Fri Jul 12 14:36:19 2013 -0500 @@ -0,0 +1,33 @@ +<tool id="cmpfastq" name="Compare two fast files"> + <description>Comparing two fastq files</description> + <requirements> + <requirement type="set_environment">CFQ_SCRIPT_PATH</requirement> + </requirements> + <command>\$CFQ_SCRIPT_PATH/cmpfastq $input1 $input2 $output1 $output2 $output3 $output4</command> + <inputs> + <param name="input1" type="data" format="fastq" label="Source file 1"/> + <param name="input2" type="data" format="fastq" label="Source file 2"/> + <!--<param name="input3" type="select" label="Option"> + <option value="d">d</option> + <option value="h ">h</option> + </param> --> + </inputs> + <outputs> + <data format="txt" name="output1" label="Common file 1"/> + <data format="txt" name="output2" label="Common file 2"/> + <data format="txt" name="output3" label="Unique file 1"/> + <data format="txt" name="output4" label="Unique file 2"/> + </outputs> + + <tests> + <test> + <!-- <param name="input" value="fa_gc_content_input.fa"/> + <output name="out_file1" file="fa_gc_content_output.txt"/> --> + </test> + </tests> + + <help> +This tool computes GC content from a FASTA file. + </help> + +</tool> \ No newline at end of file