1
|
1 package Tools::Fastq;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use Logger::Logger;
|
|
6 use Storable;
|
|
7
|
|
8
|
|
9 =head1 INDEXED FASTQ RELATED METHODS
|
|
10
|
|
11 =head2
|
|
12
|
|
13 =head2 new
|
|
14
|
|
15 =head2
|
|
16
|
|
17 =head3 Description
|
|
18
|
|
19 Create a new Tools::Fastq object and index the FASTQ file
|
|
20
|
|
21 =head3 Arguments
|
|
22
|
|
23 =over 4
|
|
24
|
|
25 =item
|
|
26
|
|
27 A hash of parameters.
|
|
28
|
|
29 Currently accepted keys are :
|
|
30
|
|
31 'file' => FASTQ file path
|
|
32
|
|
33 =back
|
|
34
|
|
35 =head3 Returns
|
|
36
|
|
37 =over 4
|
|
38
|
|
39 =item
|
|
40
|
|
41 A Tools::Fastq object
|
|
42
|
|
43 =back
|
|
44
|
|
45 =cut
|
|
46
|
|
# Constructor: create a Tools::Fastq object and, when a 'file' parameter is
# given, open that FASTQ file for reading and index it right away.
#
# Arguments : a hash of parameters; only the 'file' key (FASTQ path) is used.
# Returns   : the blessed object. Without 'file' the object carries no file
#             handle and no index.
# Dies      : via $logger->logdie when the file cannot be opened.
# NOTE(review): $logger is not declared in this sub; it is presumably
# exported by Logger::Logger -- confirm.
sub new {
    my ($class, %attrs) = @_;

    # Two-argument bless: the object belongs to the invoking class (so
    # subclasses work) instead of always to the package this sub lives in.
    # Falls back to the current package if no usable class name is passed.
    my $self = bless {}, (ref($class) || $class || __PACKAGE__);

    if (defined $attrs{file}) {
        $self->{file} = $attrs{file};

        # Three-argument open with an explicit read mode: the path can never
        # be interpreted as an open mode or as a command to pipe from.
        open($self->{file_handle}, '<', $self->{file})
            or $logger->logdie('Error opening file : ' . $self->{file} . ' : ' . $! . "\n");

        $self->indexFastqFile;
    }

    return $self;
}
|
|
58
|
|
59 =head2 indexFastqFile
|
|
60
|
|
61 =head2
|
|
62
|
|
63 =head3 Description
|
|
64
|
|
65 Index a FASTQ file creating a hash reference with the following structure :
|
|
66
|
|
67 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer}
|
|
68
|
|
69 For each sequence id, the "@" symbol and all the text after space will be removed.
|
|
70
|
|
71 This cleaned id will be used as key for the index.
|
|
72
|
|
73 =head3 Arguments
|
|
74
|
|
75 =over 4
|
|
76
|
|
77 =item
|
|
78
|
|
79 None
|
|
80
|
|
81 =back
|
|
82
|
|
83 =head3 Returns
|
|
84
|
|
85 =over 4
|
|
86
|
|
87 =item
|
|
88
|
|
89 None
|
|
90
|
|
91 =back
|
|
92
|
|
93 =cut
|
|
94
|
|
# Build the in-memory index of the FASTQ file attached to this object.
#
# For every header line (a line starting with '@'), the id is the text
# between the '@' and the first whitespace. The index maps that id to
#   { id_begin_position => byte offset of the header line,
#     id_length         => length of the header line, newline included }.
# After a header, the next three lines are skipped without being inspected,
# so a quality line that happens to start with '@' is not mistaken for a
# header. This assumes four-line FASTQ records.
#
# Arguments : none (reads $self->{file_handle} and $self->{file}).
# Returns   : the index hash reference, which is also stored in $self->{index}.
sub indexFastqFile {
    my ($self) = @_;

    $logger->info('Indexing file : ' . $self->{file} . "\n");

    my $fh    = $self->{file_handle};
    my $index = {};

    # Offsets are counted from the start of the file, so always start reading
    # there. Without this, calling the method a second time would read from
    # end-of-file and replace the index with an empty one.
    seek($fh, 0, 0);
    my $id_begin_position = 0;

    while (my $line = <$fh>) {
        if ($line =~ /^@(\S+)/) {
            # \S+ cannot capture the line terminator, so no chomp is needed.
            my $id = $1;

            $index->{$id} = {
                'id_begin_position' => $id_begin_position,
                'id_length'         => length($line),
            };
            $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n");

            # Skip the sequence, separator and quality lines of this record.
            scalar(<$fh>) for 1 .. 3;
        }

        # Offset of the next line to be read, i.e. of the next candidate header.
        $id_begin_position = tell($fh);
    }

    $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n");

    $self->{index} = $index;
    return $index;
}
|
|
120
|
|
121 =head2 loadFastqIndexFile
|
|
122
|
|
123 =head2
|
|
124
|
|
125 =head3 Description
|
|
126
|
|
127 Retrieve index from file using Storable module
|
|
128
|
|
129 =head3 Arguments
|
|
130
|
|
131 =over 4
|
|
132
|
|
133 An index file
|
|
134
|
|
135 =back
|
|
136
|
|
137 =head3 Returns
|
|
138
|
|
139 =over 4
|
|
140
|
|
141 =item
|
|
142
|
|
143 A hash reference corresponding to the index of the input FASTQ file :
|
|
144
|
|
145 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer}
|
|
146
|
|
147 =back
|
|
148
|
|
149 =cut
|
|
150
|
|
# Replace this object's index with one previously saved through Storable.
#
# Arguments : path of the Storable file to read.
# Returns   : whatever the logger's info() call returns (the sub has no
#             dedicated return value of its own).
# NOTE(review): Storable can execute code while deserialising; only load
# index files that come from a trusted source.
sub loadFastqIndexFile {
    my $self       = shift;
    my $index_path = shift;

    $self->{index} = retrieve($index_path);

    return $logger->info('File ' . $index_path . " is now loaded\n");
}
|
|
157
|
|
158 =head2 writeFastqIndexFile
|
|
159
|
|
160 =head2
|
|
161
|
|
162 =head3 Description
|
|
163
|
|
164 Write index to file using Storable module
|
|
165
|
|
166 =head3 Arguments
|
|
167
|
|
168 =over 4
|
|
169
|
|
170 =item
|
|
171
|
|
172 A hash reference corresponding to FASTQ index.
|
|
173
|
|
174 =item
|
|
175
|
|
176 An output file path where to store the index.
|
|
177
|
|
178 =back
|
|
179
|
|
180 =head3 Returns
|
|
181
|
|
182 =over 4
|
|
183
|
|
184 =item
|
|
185
|
|
186 The output file path containing index
|
|
187
|
|
188 =back
|
|
189
|
|
190 =cut
|
|
191
|
|
# Serialise this object's index to disk with Storable.
#
# Arguments : path of the file in which to store the index.
# Returns   : that same path, once the index has been written.
# Dies      : via $logger->logdie when Storable reports an I/O error
#             (Storable itself dies on more serious errors).
sub writeFastqIndexFile {
    my ($self, $file) = @_;

    $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n");

    # store() returns undef on I/O errors; do not report success in that case.
    store($self->{index}, $file)
        or $logger->logdie('Error writing index file : ' . $file . ' : ' . $! . "\n");

    $logger->info('File '.$file." is now created\n");

    return $file;
}
|
|
198
|
|
199 =head2 retrieveFastqSequence
|
|
200
|
|
201 =head2
|
|
202
|
|
203 =head3 Description
|
|
204
|
|
205 Retrieve FASTQ sequences using a list of ids
|
|
206
|
|
207 =head3 Arguments
|
|
208
|
|
209 =over 4
|
|
210
|
|
211 =item
|
|
212
|
|
213 A sequence id OR an array reference containing the list of sequences id to retrieve.
|
|
214
|
|
215 =back
|
|
216
|
|
217 =head3 Returns
|
|
218
|
|
219 =over 4
|
|
220
|
|
221 =item
|
|
222
|
|
223 A hash reference containing sequences id as keys and sequences as values
|
|
224
|
|
225 $data -> {seq_id} = sequence_corresponding_to_seq_id
|
|
226
|
|
227 =back
|
|
228
|
|
229 =cut
|
|
230
|
|
# Fetch the sequence line of one or several records through the index.
#
# Arguments : a single id, or an array reference of ids. An id may be given
#             with or without its leading '@'; when the '@' is present, only
#             the text up to the first whitespace is used for the lookup.
# Returns   : a hash reference mapping each requested id that was found (as
#             it was passed in, not the cleaned form) to the line following
#             its header, line terminator included. Ids absent from the
#             index are left out.
sub retrieveFastqSequence {
    my ($self, $ids) = @_;
    my $data        = {};
    my $nbSequences = 0;

    # Accept a bare id as well as a list of ids.
    $ids = [$ids] if !ref $ids;

    $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");

    my $fh = $self->{file_handle};

    foreach my $id (@$ids) {
        # Strip a leading '@' only. The pattern is anchored like the one used
        # to build the index, so an id with an embedded '@' (e.g. 'read@1')
        # is no longer cut down to the part after that '@'.
        my $cleanedId = ($id =~ /^\s*@(\S+)/) ? $1 : $id;

        $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n");

        if (exists $self->{index}->{$cleanedId}) {
            my $entry = $self->{index}->{$cleanedId};

            $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $entry->{'id_begin_position'}. ', id_length : '. $entry->{'id_length'}.")\n");

            # Jump to the header, discard it, then read the sequence line.
            seek($fh, $entry->{'id_begin_position'}, 0);
            scalar(<$fh>);
            my $sequence = <$fh>;

            $data->{$id} = $sequence;
            $nbSequences++;

            $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n");
        }
        else {
            $logger->trace('id ' . $cleanedId. " not found in index\n");
        }
    }

    $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n");

    return $data;
}
|
|
258
|
|
259 =head2 retrieveFastqQuality
|
|
260
|
|
261 =head2
|
|
262
|
|
263 =head3 Description
|
|
264
|
|
265 Retrieve FASTQ sequences quality using a list of ids
|
|
266
|
|
267 =head3 Arguments
|
|
268
|
|
269 =over 4
|
|
270
|
|
271 =item
|
|
272
|
|
273 A sequence id OR an array reference containing the list of sequences id to retrieve quality.
|
|
274
|
|
275 =back
|
|
276
|
|
277 =head3 Returns
|
|
278
|
|
279 =over 4
|
|
280
|
|
281 =item
|
|
282
|
|
283 A hash reference containing sequences id as keys and sequences quality as values
|
|
284
|
|
285 $data -> {seq_id} = sequence_quality_corresponding_to_seq_id
|
|
286
|
|
287 =back
|
|
288
|
|
289 =cut
|
|
290
|
|
# Fetch the quality line of one or several records through the index.
#
# Arguments : a single id, or an array reference of ids. An id may be given
#             with or without its leading '@'; when the '@' is present, only
#             the text up to the first whitespace is used for the lookup.
# Returns   : a scalar holding the quality lines of the ids that were found,
#             concatenated in request order (line terminators included), or
#             undef when none was found.
# NOTE(review): the POD for this method describes a hash reference keyed by
# id, but the code has always returned a concatenated string. The string is
# kept here so existing callers keep working -- confirm the intended contract.
sub retrieveFastqQuality {
    my ($self, $ids) = @_;
    my $data;
    my $nbSequences = 0;

    # Accept a bare id as well as a list of ids.
    $ids = [$ids] if !ref $ids;

    $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");

    my $fh = $self->{file_handle};

    foreach my $id (@$ids) {
        # Strip a leading '@' only. The pattern is anchored like the one used
        # to build the index, so an id with an embedded '@' (e.g. 'read@1')
        # is no longer cut down to the part after that '@'.
        my $cleanedId = ($id =~ /^\s*@(\S+)/) ? $1 : $id;

        $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n");

        if (exists $self->{index}->{$cleanedId}) {
            my $entry = $self->{index}->{$cleanedId};

            $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $entry->{'id_begin_position'}. ', id_length : '. $entry->{'id_length'}.")\n");

            seek($fh, $entry->{'id_begin_position'}, 0);

            # Skip the header, sequence and separator lines; the quality line
            # is the fourth line of the record.
            scalar(<$fh>) for 1 .. 3;
            my $quality = <$fh>;

            $data .= $quality;
            $nbSequences++;

            $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n");
        }
        else {
            $logger->trace('id ' . $cleanedId. " not found in index\n");
        }
    }

    $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n");

    return $data;
}
|
|
320
|
|
321 =head2 retrieveFastqBlock
|
|
322
|
|
323 =head2
|
|
324
|
|
325 =head3 Description
|
|
326
|
|
327 Retrieve FASTQ formatted sequences using a list of ids
|
|
328
|
|
329 =head3 Arguments
|
|
330
|
|
331 =over 4
|
|
332
|
|
333 =item
|
|
334
|
|
335 A sequence id OR an array reference containing the list of sequences id to retrieve.
|
|
336
|
|
337 =back
|
|
338
|
|
339 =head3 Returns
|
|
340
|
|
341 =over 4
|
|
342
|
|
343 =item
|
|
344
|
|
345 A scalar containing the sequences corresponding to ids in FASTQ format
|
|
346
|
|
347 =back
|
|
348
|
|
349 =cut
|
|
350
|
|
# Fetch complete FASTQ records (header plus the three following lines) of one
# or several ids through the index.
#
# Arguments : a single id, or an array reference of ids. An id may be given
#             with or without its leading '@'; when the '@' is present, only
#             the text up to the first whitespace is used for the lookup.
# Returns   : a scalar holding the records of the ids that were found,
#             concatenated in request order, or undef when none was found.
sub retrieveFastqBlock {
    my ($self, $ids) = @_;
    my $data;
    my $nbSequences = 0;

    # Accept a bare id as well as a list of ids.
    $ids = [$ids] if !ref $ids;

    $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");

    my $fh = $self->{file_handle};

    foreach my $id (@$ids) {
        # Strip a leading '@' only. The pattern is anchored like the one used
        # to build the index, so an id with an embedded '@' (e.g. 'read@1')
        # is no longer cut down to the part after that '@'.
        my $cleanedId = ($id =~ /^\s*@(\S+)/) ? $1 : $id;

        $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n");

        if (exists $self->{index}->{$cleanedId}) {
            my $entry = $self->{index}->{$cleanedId};

            $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $entry->{'id_begin_position'}. ', id_length : '. $entry->{'id_length'}.")\n");

            seek($fh, $entry->{'id_begin_position'}, 0);

            # The header is read by its recorded byte length, then the three
            # remaining lines of the record are appended.
            read($fh, my $block, $entry->{'id_length'});
            $block .= <$fh> . <$fh> . <$fh>;

            $data .= $block;
            $nbSequences++;

            $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n");
        }
        else {
            $logger->trace('id ' . $cleanedId. " not found in index\n");
        }
    }

    $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n");

    return $data;
}
|
|
382 1;
|