annotate cmpfastq @ 1:54ab39950f59 default tip

first commit
author nilesh
date Fri, 12 Jul 2013 14:36:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
54ab39950f59 first commit
nilesh
parents:
diff changeset
1 use strict;
54ab39950f59 first commit
nilesh
parents:
diff changeset
2 use warnings;
54ab39950f59 first commit
nilesh
parents:
diff changeset
3 use Getopt::Std;
54ab39950f59 first commit
nilesh
parents:
diff changeset
4 use File::Basename;
54ab39950f59 first commit
nilesh
parents:
diff changeset
5
54ab39950f59 first commit
nilesh
parents:
diff changeset
6 ## CONSTANTS ##
54ab39950f59 first commit
nilesh
parents:
diff changeset
7 my $TRUE = 1;
54ab39950f59 first commit
nilesh
parents:
diff changeset
8 my $FALSE = 0;
54ab39950f59 first commit
nilesh
parents:
diff changeset
9 my $DEBUG = $FALSE;
54ab39950f59 first commit
nilesh
parents:
diff changeset
10 my $EXITSTATUS = 0;
54ab39950f59 first commit
nilesh
parents:
diff changeset
11
54ab39950f59 first commit
nilesh
parents:
diff changeset
12 # Default umask
54ab39950f59 first commit
nilesh
parents:
diff changeset
13 umask 027;
54ab39950f59 first commit
nilesh
parents:
diff changeset
14
54ab39950f59 first commit
nilesh
parents:
diff changeset
15 # Define variables
54ab39950f59 first commit
nilesh
parents:
diff changeset
16
54ab39950f59 first commit
nilesh
parents:
diff changeset
17 # Get the options passed at the command prompt
54ab39950f59 first commit
nilesh
parents:
diff changeset
18 GetOptions();
54ab39950f59 first commit
nilesh
parents:
diff changeset
19
54ab39950f59 first commit
nilesh
parents:
diff changeset
20 ##############################
54ab39950f59 first commit
nilesh
parents:
diff changeset
21 # M A I N P R O G R A M #
54ab39950f59 first commit
nilesh
parents:
diff changeset
22 ##############################
54ab39950f59 first commit
nilesh
parents:
diff changeset
23
54ab39950f59 first commit
nilesh
parents:
diff changeset
24 # Check to see we received two files in the arguments
54ab39950f59 first commit
nilesh
parents:
diff changeset
25
54ab39950f59 first commit
nilesh
parents:
diff changeset
26 my $fail = $FALSE;
54ab39950f59 first commit
nilesh
parents:
diff changeset
27
54ab39950f59 first commit
nilesh
parents:
diff changeset
28 # Check to see if the files exist
54ab39950f59 first commit
nilesh
parents:
diff changeset
29 foreach my $file (@ARGV)
54ab39950f59 first commit
nilesh
parents:
diff changeset
30 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
31 if(!-e $file)
54ab39950f59 first commit
nilesh
parents:
diff changeset
32 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
33 print STDERR "File $file didn't exist\n";
54ab39950f59 first commit
nilesh
parents:
diff changeset
34 $fail = $TRUE;
54ab39950f59 first commit
nilesh
parents:
diff changeset
35 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
36 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
37
54ab39950f59 first commit
nilesh
parents:
diff changeset
38 # If any of the files didn't exist, let's kill it
54ab39950f59 first commit
nilesh
parents:
diff changeset
39 if($fail)
54ab39950f59 first commit
nilesh
parents:
diff changeset
40 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
41 exit(1);
54ab39950f59 first commit
nilesh
parents:
diff changeset
42 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
43
54ab39950f59 first commit
nilesh
parents:
diff changeset
44 # Read the file names in from the command line
54ab39950f59 first commit
nilesh
parents:
diff changeset
45 my $file1 = shift(@ARGV);
54ab39950f59 first commit
nilesh
parents:
diff changeset
46 my $file2 = shift(@ARGV);
54ab39950f59 first commit
nilesh
parents:
diff changeset
47
54ab39950f59 first commit
nilesh
parents:
diff changeset
48
54ab39950f59 first commit
nilesh
parents:
diff changeset
49 # Index the first file.
54ab39950f59 first commit
nilesh
parents:
diff changeset
50 my %fastqIndex1 = %{IndexFastq($file1)};
54ab39950f59 first commit
nilesh
parents:
diff changeset
51
54ab39950f59 first commit
nilesh
parents:
diff changeset
52 # Compare the two files
54ab39950f59 first commit
nilesh
parents:
diff changeset
53 CompareFastq($file1, $file2, \%fastqIndex1);
54ab39950f59 first commit
nilesh
parents:
diff changeset
54
54ab39950f59 first commit
nilesh
parents:
diff changeset
55 exit($EXITSTATUS);
54ab39950f59 first commit
nilesh
parents:
diff changeset
56
54ab39950f59 first commit
nilesh
parents:
diff changeset
57 # Subroutines
54ab39950f59 first commit
nilesh
parents:
diff changeset
58 sub Usage
54ab39950f59 first commit
nilesh
parents:
diff changeset
59 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
60 my $base = basename($0);
54ab39950f59 first commit
nilesh
parents:
diff changeset
61 print "Usage: $base [dh] file1 file2\n";
54ab39950f59 first commit
nilesh
parents:
diff changeset
62 print "\td:\tDebug mode on (default off)\n";
54ab39950f59 first commit
nilesh
parents:
diff changeset
63 print "\th:\tPrint this usage\n";
54ab39950f59 first commit
nilesh
parents:
diff changeset
64 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
65
54ab39950f59 first commit
nilesh
parents:
diff changeset
66 sub GetOptions
54ab39950f59 first commit
nilesh
parents:
diff changeset
67 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
68 # Get the options passed at the command prompt
54ab39950f59 first commit
nilesh
parents:
diff changeset
69 my %options=();
54ab39950f59 first commit
nilesh
parents:
diff changeset
70 getopts("dh", \%options);
54ab39950f59 first commit
nilesh
parents:
diff changeset
71
54ab39950f59 first commit
nilesh
parents:
diff changeset
72 if(defined($options{'d'}))
54ab39950f59 first commit
nilesh
parents:
diff changeset
73 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
74 $DEBUG = $TRUE;
54ab39950f59 first commit
nilesh
parents:
diff changeset
75 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
76
54ab39950f59 first commit
nilesh
parents:
diff changeset
77 if(defined($options{'h'}))
54ab39950f59 first commit
nilesh
parents:
diff changeset
78 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
79 Usage();
54ab39950f59 first commit
nilesh
parents:
diff changeset
80 exit($EXITSTATUS);
54ab39950f59 first commit
nilesh
parents:
diff changeset
81 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
82 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
83
54ab39950f59 first commit
nilesh
parents:
diff changeset
84 sub IndexFastq
54ab39950f59 first commit
nilesh
parents:
diff changeset
85 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
86 my $file = shift;
54ab39950f59 first commit
nilesh
parents:
diff changeset
87 my %fastqIndex;
54ab39950f59 first commit
nilesh
parents:
diff changeset
88
54ab39950f59 first commit
nilesh
parents:
diff changeset
89 open(IN, $file) or die("Could not open $file\n");
54ab39950f59 first commit
nilesh
parents:
diff changeset
90 my $pos = tell(IN);
54ab39950f59 first commit
nilesh
parents:
diff changeset
91 my $lineCounter = 1;
54ab39950f59 first commit
nilesh
parents:
diff changeset
92 while(my $line = <IN>)
54ab39950f59 first commit
nilesh
parents:
diff changeset
93 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
94 chomp($line);
54ab39950f59 first commit
nilesh
parents:
diff changeset
95
54ab39950f59 first commit
nilesh
parents:
diff changeset
96 # Each block is going to be of 4 lines
54ab39950f59 first commit
nilesh
parents:
diff changeset
97 # Let's get the seq ID from the sequence name
54ab39950f59 first commit
nilesh
parents:
diff changeset
98 if($line =~ m/^@(.*)#.*/)
54ab39950f59 first commit
nilesh
parents:
diff changeset
99 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
100 $fastqIndex{$1} = $pos;
54ab39950f59 first commit
nilesh
parents:
diff changeset
101 # Skip the next 3 lines
54ab39950f59 first commit
nilesh
parents:
diff changeset
102 for(my $i=0; $i<3; $i++)
54ab39950f59 first commit
nilesh
parents:
diff changeset
103 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
104 <IN>;
54ab39950f59 first commit
nilesh
parents:
diff changeset
105 $lineCounter++;
54ab39950f59 first commit
nilesh
parents:
diff changeset
106 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
107 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
108 elsif($line =~ m/^#/)
54ab39950f59 first commit
nilesh
parents:
diff changeset
109 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
110 print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG);
54ab39950f59 first commit
nilesh
parents:
diff changeset
111 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
112 elsif($line =~ m/^$/)
54ab39950f59 first commit
nilesh
parents:
diff changeset
113 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
114 print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG);
54ab39950f59 first commit
nilesh
parents:
diff changeset
115 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
116 else
54ab39950f59 first commit
nilesh
parents:
diff changeset
117 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
118 print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG);
54ab39950f59 first commit
nilesh
parents:
diff changeset
119 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
120 $pos = tell(IN);
54ab39950f59 first commit
nilesh
parents:
diff changeset
121 $lineCounter++;
54ab39950f59 first commit
nilesh
parents:
diff changeset
122 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
123 close(IN);
54ab39950f59 first commit
nilesh
parents:
diff changeset
124
54ab39950f59 first commit
nilesh
parents:
diff changeset
125 return \%fastqIndex;
54ab39950f59 first commit
nilesh
parents:
diff changeset
126 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
127
54ab39950f59 first commit
nilesh
parents:
diff changeset
128 sub CompareFastq
54ab39950f59 first commit
nilesh
parents:
diff changeset
129 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
130 my $file1 = shift;
54ab39950f59 first commit
nilesh
parents:
diff changeset
131 my $file2 = shift;
54ab39950f59 first commit
nilesh
parents:
diff changeset
132 my $fastqIndex1Ref = shift;
54ab39950f59 first commit
nilesh
parents:
diff changeset
133 my %fastqIndex1 = %{$fastqIndex1Ref};
54ab39950f59 first commit
nilesh
parents:
diff changeset
134 my %found1;
54ab39950f59 first commit
nilesh
parents:
diff changeset
135
54ab39950f59 first commit
nilesh
parents:
diff changeset
136 # We don't want to have to open/close file handles each time, so let's open them here
54ab39950f59 first commit
nilesh
parents:
diff changeset
137 open(F1COUT, ">$ARGV[0]");
54ab39950f59 first commit
nilesh
parents:
diff changeset
138 open(F2COUT, ">$ARGV[1]");
54ab39950f59 first commit
nilesh
parents:
diff changeset
139 open(F1UOUT, ">$ARGV[2]");
54ab39950f59 first commit
nilesh
parents:
diff changeset
140 open(F2UOUT, ">$ARGV[3]");
54ab39950f59 first commit
nilesh
parents:
diff changeset
141
54ab39950f59 first commit
nilesh
parents:
diff changeset
142 open(F1IN, $file1) or die("Could not open $file1\n");
54ab39950f59 first commit
nilesh
parents:
diff changeset
143 open(F2IN, $file2) or die("Could not open $file2\n");
54ab39950f59 first commit
nilesh
parents:
diff changeset
144 while(my $line = <F2IN>)
54ab39950f59 first commit
nilesh
parents:
diff changeset
145 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
146 chomp($line);
54ab39950f59 first commit
nilesh
parents:
diff changeset
147
54ab39950f59 first commit
nilesh
parents:
diff changeset
148 # Skip empty lines or comments
54ab39950f59 first commit
nilesh
parents:
diff changeset
149 if($line =~ m/^$/g or $line =~ m/^\s*#/)
54ab39950f59 first commit
nilesh
parents:
diff changeset
150 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
151 next;
54ab39950f59 first commit
nilesh
parents:
diff changeset
152 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
153
54ab39950f59 first commit
nilesh
parents:
diff changeset
154 # Each block is going to be of 4 lines
54ab39950f59 first commit
nilesh
parents:
diff changeset
155 # Let's get the seq ID from the sequence name
54ab39950f59 first commit
nilesh
parents:
diff changeset
156 if($line =~ m/^@(.*)#.*/)
54ab39950f59 first commit
nilesh
parents:
diff changeset
157 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
158 my $seqId = $1;
54ab39950f59 first commit
nilesh
parents:
diff changeset
159
54ab39950f59 first commit
nilesh
parents:
diff changeset
160 if(defined($fastqIndex1{$seqId}))
54ab39950f59 first commit
nilesh
parents:
diff changeset
161 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
162 $found1{$seqId} = $TRUE;
54ab39950f59 first commit
nilesh
parents:
diff changeset
163
54ab39950f59 first commit
nilesh
parents:
diff changeset
164 # Print out from file1
54ab39950f59 first commit
nilesh
parents:
diff changeset
165 seek(F1IN, $fastqIndex1{$seqId}, 0);
54ab39950f59 first commit
nilesh
parents:
diff changeset
166 for(my $i=0;$i<4;$i++)
54ab39950f59 first commit
nilesh
parents:
diff changeset
167 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
168 my $tmpLine = <F1IN>;
54ab39950f59 first commit
nilesh
parents:
diff changeset
169 print F1COUT $tmpLine;
54ab39950f59 first commit
nilesh
parents:
diff changeset
170 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
171
54ab39950f59 first commit
nilesh
parents:
diff changeset
172 # Print out from file 2
54ab39950f59 first commit
nilesh
parents:
diff changeset
173 print F2COUT $line . "\n";
54ab39950f59 first commit
nilesh
parents:
diff changeset
174 for(my $i=0; $i<3; $i++)
54ab39950f59 first commit
nilesh
parents:
diff changeset
175 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
176 my $tmpLine = <F2IN>;
54ab39950f59 first commit
nilesh
parents:
diff changeset
177 print F2COUT $tmpLine;
54ab39950f59 first commit
nilesh
parents:
diff changeset
178 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
179 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
180 else
54ab39950f59 first commit
nilesh
parents:
diff changeset
181 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
182 # Print out from file 2
54ab39950f59 first commit
nilesh
parents:
diff changeset
183 print F2UOUT $line . "\n";
54ab39950f59 first commit
nilesh
parents:
diff changeset
184 for(my $i=0; $i<3; $i++)
54ab39950f59 first commit
nilesh
parents:
diff changeset
185 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
186 my $tmpLine = <F2IN>;
54ab39950f59 first commit
nilesh
parents:
diff changeset
187 print F2UOUT $tmpLine;
54ab39950f59 first commit
nilesh
parents:
diff changeset
188 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
189 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
190 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
191 else
54ab39950f59 first commit
nilesh
parents:
diff changeset
192 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
193 print STDERR "Could not match the sequence ID from the name: $line\n";;
54ab39950f59 first commit
nilesh
parents:
diff changeset
194 next;
54ab39950f59 first commit
nilesh
parents:
diff changeset
195 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
196 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
197 close(F1COUT);
54ab39950f59 first commit
nilesh
parents:
diff changeset
198 close(F2COUT);
54ab39950f59 first commit
nilesh
parents:
diff changeset
199 close(F2UOUT);
54ab39950f59 first commit
nilesh
parents:
diff changeset
200 close(F2IN);
54ab39950f59 first commit
nilesh
parents:
diff changeset
201 # Now let's worry about the sequences that weren't common in file 1
54ab39950f59 first commit
nilesh
parents:
diff changeset
202
54ab39950f59 first commit
nilesh
parents:
diff changeset
203 # File 1
54ab39950f59 first commit
nilesh
parents:
diff changeset
204 if(keys(%fastqIndex1) != keys(%found1))
54ab39950f59 first commit
nilesh
parents:
diff changeset
205 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
206 foreach my $seqId (keys %fastqIndex1)
54ab39950f59 first commit
nilesh
parents:
diff changeset
207 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
208 if(!defined($found1{$seqId}))
54ab39950f59 first commit
nilesh
parents:
diff changeset
209 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
210 seek(F1IN, $fastqIndex1{$seqId}, 0);
54ab39950f59 first commit
nilesh
parents:
diff changeset
211 for(my $i=0;$i<4;$i++)
54ab39950f59 first commit
nilesh
parents:
diff changeset
212 {
54ab39950f59 first commit
nilesh
parents:
diff changeset
213 my $tmpLine = <F1IN>;
54ab39950f59 first commit
nilesh
parents:
diff changeset
214 print F1UOUT $tmpLine;
54ab39950f59 first commit
nilesh
parents:
diff changeset
215 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
216 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
217 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
218 }
54ab39950f59 first commit
nilesh
parents:
diff changeset
219 close(F1UOUT);
54ab39950f59 first commit
nilesh
parents:
diff changeset
220 close(F1IN)
54ab39950f59 first commit
nilesh
parents:
diff changeset
221 }