1
|
1 use strict;
|
|
2 use warnings;
|
|
3 use Getopt::Std;
|
|
4 use File::Basename;
|
|
5
|
|
6 ## CONSTANTS ##
|
|
7 my $TRUE = 1;
|
|
8 my $FALSE = 0;
|
|
9 my $DEBUG = $FALSE;
|
|
10 my $EXITSTATUS = 0;
|
|
11
|
|
12 # Default umask
|
|
13 umask 027;
|
|
14
|
|
15 # Define variables
|
|
16
|
|
17 # Get the options passed at the command prompt
|
|
18 GetOptions();
|
|
19
|
|
20 ##############################
|
|
21 # M A I N P R O G R A M #
|
|
22 ##############################
|
|
23
|
|
24 # Check to see we received two files in the arguments
|
|
25
|
|
26 my $fail = $FALSE;
|
|
27
|
|
28 # Check to see if the files exist
|
|
29 foreach my $file (@ARGV)
|
|
30 {
|
|
31 if(!-e $file)
|
|
32 {
|
|
33 print STDERR "File $file didn't exist\n";
|
|
34 $fail = $TRUE;
|
|
35 }
|
|
36 }
|
|
37
|
|
38 # If any of the files didn't exist, let's kill it
|
|
39 if($fail)
|
|
40 {
|
|
41 exit(1);
|
|
42 }
|
|
43
|
|
44 # Read the file names in from the command line
|
|
45 my $file1 = shift(@ARGV);
|
|
46 my $file2 = shift(@ARGV);
|
|
47
|
|
48
|
|
49 # Index the first file.
|
|
50 my %fastqIndex1 = %{IndexFastq($file1)};
|
|
51
|
|
52 # Compare the two files
|
|
53 CompareFastq($file1, $file2, \%fastqIndex1);
|
|
54
|
|
55 exit($EXITSTATUS);
|
|
56
|
|
57 # Subroutines
|
|
58 sub Usage
|
|
59 {
|
|
60 my $base = basename($0);
|
|
61 print "Usage: $base [dh] file1 file2\n";
|
|
62 print "\td:\tDebug mode on (default off)\n";
|
|
63 print "\th:\tPrint this usage\n";
|
|
64 }
|
|
65
|
|
66 sub GetOptions
|
|
67 {
|
|
68 # Get the options passed at the command prompt
|
|
69 my %options=();
|
|
70 getopts("dh", \%options);
|
|
71
|
|
72 if(defined($options{'d'}))
|
|
73 {
|
|
74 $DEBUG = $TRUE;
|
|
75 }
|
|
76
|
|
77 if(defined($options{'h'}))
|
|
78 {
|
|
79 Usage();
|
|
80 exit($EXITSTATUS);
|
|
81 }
|
|
82 }
|
|
83
|
|
84 sub IndexFastq
|
|
85 {
|
|
86 my $file = shift;
|
|
87 my %fastqIndex;
|
|
88
|
|
89 open(IN, $file) or die("Could not open $file\n");
|
|
90 my $pos = tell(IN);
|
|
91 my $lineCounter = 1;
|
|
92 while(my $line = <IN>)
|
|
93 {
|
|
94 chomp($line);
|
|
95
|
|
96 # Each block is going to be of 4 lines
|
|
97 # Let's get the seq ID from the sequence name
|
|
98 if($line =~ m/^@(.*)#.*/)
|
|
99 {
|
|
100 $fastqIndex{$1} = $pos;
|
|
101 # Skip the next 3 lines
|
|
102 for(my $i=0; $i<3; $i++)
|
|
103 {
|
|
104 <IN>;
|
|
105 $lineCounter++;
|
|
106 }
|
|
107 }
|
|
108 elsif($line =~ m/^#/)
|
|
109 {
|
|
110 print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG);
|
|
111 }
|
|
112 elsif($line =~ m/^$/)
|
|
113 {
|
|
114 print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG);
|
|
115 }
|
|
116 else
|
|
117 {
|
|
118 print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG);
|
|
119 }
|
|
120 $pos = tell(IN);
|
|
121 $lineCounter++;
|
|
122 }
|
|
123 close(IN);
|
|
124
|
|
125 return \%fastqIndex;
|
|
126 }
|
|
127
|
|
128 sub CompareFastq
|
|
129 {
|
|
130 my $file1 = shift;
|
|
131 my $file2 = shift;
|
|
132 my $fastqIndex1Ref = shift;
|
|
133 my %fastqIndex1 = %{$fastqIndex1Ref};
|
|
134 my %found1;
|
|
135
|
|
136 # We don't want to have to open/close file handles each time, so let's open them here
|
|
137 open(F1COUT, ">$ARGV[0]");
|
|
138 open(F2COUT, ">$ARGV[1]");
|
|
139 open(F1UOUT, ">$ARGV[2]");
|
|
140 open(F2UOUT, ">$ARGV[3]");
|
|
141
|
|
142 open(F1IN, $file1) or die("Could not open $file1\n");
|
|
143 open(F2IN, $file2) or die("Could not open $file2\n");
|
|
144 while(my $line = <F2IN>)
|
|
145 {
|
|
146 chomp($line);
|
|
147
|
|
148 # Skip empty lines or comments
|
|
149 if($line =~ m/^$/g or $line =~ m/^\s*#/)
|
|
150 {
|
|
151 next;
|
|
152 }
|
|
153
|
|
154 # Each block is going to be of 4 lines
|
|
155 # Let's get the seq ID from the sequence name
|
|
156 if($line =~ m/^@(.*)#.*/)
|
|
157 {
|
|
158 my $seqId = $1;
|
|
159
|
|
160 if(defined($fastqIndex1{$seqId}))
|
|
161 {
|
|
162 $found1{$seqId} = $TRUE;
|
|
163
|
|
164 # Print out from file1
|
|
165 seek(F1IN, $fastqIndex1{$seqId}, 0);
|
|
166 for(my $i=0;$i<4;$i++)
|
|
167 {
|
|
168 my $tmpLine = <F1IN>;
|
|
169 print F1COUT $tmpLine;
|
|
170 }
|
|
171
|
|
172 # Print out from file 2
|
|
173 print F2COUT $line . "\n";
|
|
174 for(my $i=0; $i<3; $i++)
|
|
175 {
|
|
176 my $tmpLine = <F2IN>;
|
|
177 print F2COUT $tmpLine;
|
|
178 }
|
|
179 }
|
|
180 else
|
|
181 {
|
|
182 # Print out from file 2
|
|
183 print F2UOUT $line . "\n";
|
|
184 for(my $i=0; $i<3; $i++)
|
|
185 {
|
|
186 my $tmpLine = <F2IN>;
|
|
187 print F2UOUT $tmpLine;
|
|
188 }
|
|
189 }
|
|
190 }
|
|
191 else
|
|
192 {
|
|
193 print STDERR "Could not match the sequence ID from the name: $line\n";;
|
|
194 next;
|
|
195 }
|
|
196 }
|
|
197 close(F1COUT);
|
|
198 close(F2COUT);
|
|
199 close(F2UOUT);
|
|
200 close(F2IN);
|
|
201 # Now let's worry about the sequences that weren't common in file 1
|
|
202
|
|
203 # File 1
|
|
204 if(keys(%fastqIndex1) != keys(%found1))
|
|
205 {
|
|
206 foreach my $seqId (keys %fastqIndex1)
|
|
207 {
|
|
208 if(!defined($found1{$seqId}))
|
|
209 {
|
|
210 seek(F1IN, $fastqIndex1{$seqId}, 0);
|
|
211 for(my $i=0;$i<4;$i++)
|
|
212 {
|
|
213 my $tmpLine = <F1IN>;
|
|
214 print F1UOUT $tmpLine;
|
|
215 }
|
|
216 }
|
|
217 }
|
|
218 }
|
|
219 close(F1UOUT);
|
|
220 close(F1IN)
|
|
221 } |