comparison perl/lib/Fastq.pm @ 1:3203097d0a70 draft

Uploaded
author stheil
date Thu, 15 Oct 2015 09:53:48 -0400
parents
children
comparison
equal deleted inserted replaced
0:9730db7c9ad3 1:3203097d0a70
1 package Tools::Fastq;
2
3 use strict;
4 use warnings;
5 use Logger::Logger;
6 use Storable;
7
8
9 =head1 INDEXED FASTQ RELATED METHODS
10
11 =head2
12
13 =head2 new
14
15 =head2
16
17 =head3 Description
18
19 Create a new Tools::Fastq object and index the FASTQ file
20
21 =head3 Arguments
22
23 =over 4
24
25 =item
26
27 A hash of parameters.
28
29 Currently accepted keys are :
30
31 'file' => FASTQ file path
32
33 =back
34
35 =head3 Returns
36
37 =over 4
38
39 =item
40
41 A Tools::Fastq object
42
43 =back
44
45 =cut
46
47 sub new {
48 my ($class, %attrs) = @_;
49 my $self = {};
50 bless $self;
51 if(defined($attrs{file})){
52 $self->{file} = $attrs{file};
53 open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n");
54 $self->indexFastqFile;
55 }
56 return $self;
57 }
58
59 =head2 indexFastqFile
60
61 =head2
62
63 =head3 Description
64
65 Index a FASTQ file creating a hash reference with the following structure :
66
67 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer}
68
69 For each sequence id, the "@" symbol and all the text after space will be removed.
70
71 This cleaned id will be used as key for the index.
72
73 =head3 Arguments
74
75 =over 4
76
77 =item
78
79 None
80
81 =back
82
83 =head3 Returns
84
85 =over 4
86
87 =item
88
89 None
90
91 =back
92
93 =cut
94
95 sub indexFastqFile{
96
97 my ($self) = @_;
98 $logger->info('Indexing file : '.$self->{file}."\n");
99 my $index;
100 my $id;
101 my $id_begin_position = 0;
102 my $fh = $self->{file_handle};
103 while(my $line = <$fh>){
104
105 if($line =~ /^@(\S+)/){
106
107 $id = $1;
108 chomp $id;
109 $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line};
110 $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n");
111 <$fh>; <$fh>; <$fh>;
112 }
113
114 $id_begin_position = tell($fh);
115 }
116
117 $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n");
118 $self->{index} = $index;
119 }
120
121 =head2 loadFastqIndexFile
122
123 =head2
124
125 =head3 Description
126
127 Retrieve index from file using Storable module
128
129 =head3 Arguments
130
131 =over 4
132
133 An index file
134
135 =back
136
137 =head3 Returns
138
139 =over 4
140
141 =item
142
143 A hash reference corresponding to the index of the input FASTQ file :
144
145 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer}
146
147 =back
148
149 =cut
150
151 sub loadFastqIndexFile{
152
153 my ($self, $file) = @_;
154 $self->{index} = retrieve($file);
155 $logger->info('File '.$file." is now loaded\n");
156 }
157
158 =he=head2 writeFastaIndexFile
159
160 =head2
161
162 =head3 Description
163
164 Write index to file using Storable module
165
166 =head3 Arguments
167
168 =over 4
169
170 =item
171
172 A hash reference corresponding to FASTQ index.
173
174 =item
175
176 An output file path where to store the index.
177
178 =back
179
180 =head3 Returns
181
182 =over 4
183
184 =item
185
186 The output file path containing index
187
188 =back
189
190 =cut
191
192 sub writeFastqIndexFile{
193 my ($self, $file) = @_;
194 $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n");
195 store $self->{index}, $file;
196 $logger->info('File '.$file." is now created\n");
197 }
198
199 =head2 retrieveFastqSequence
200
201 =head2
202
203 =head3 Description
204
205 Retrieve FASTQ sequences using a list of ids
206
207 =head3 Arguments
208
209 =over 4
210
211 =item
212
213 A sequence id OR an array reference containing the list of sequences id to retrieve.
214
215 =back
216
217 =head3 Returns
218
219 =over 4
220
221 =item
222
223 A hash reference containing sequences id as keys and sequences as values
224
225 $data -> {seq_id} = sequence_corresponding_to_seq_id
226
227 =back
228
229 =cut
230
231 sub retrieveFastqSequence{
232 my ($self, $ids) = @_;
233 my $data={};
234 my $nbSequences = 0;
235 if(! ref $ids){$ids = [$ids]}
236 $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");
237 my $fh = $self->{file_handle};
238 foreach my $id (@$ids){
239 my $cleanedId = $id;
240 if($id =~ /@(\S+)/){$cleanedId = $1}
241 $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n");
242 if(exists $self->{index} -> {$cleanedId}){
243 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n");
244 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0);
245 <$fh>;
246 my $sequence = <$fh>;
247 $data->{$id} = $sequence;
248 $nbSequences ++;
249 $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n")
250 }
251 else{
252 $logger->trace('id ' . $cleanedId. " not found in index\n")
253 }
254 }
255 $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n");
256 return $data;
257 }
258
259 =head2 retrieveFastqQuality
260
261 =head2
262
263 =head3 Description
264
265 Retrieve FASTQ sequences quality using a list of ids
266
267 =head3 Arguments
268
269 =over 4
270
271 =item
272
273 A sequence id OR an array reference containing the list of sequences id to retrieve quality.
274
275 =back
276
277 =head3 Returns
278
279 =over 4
280
281 =item
282
283 A hash reference containing sequences id as keys and sequences quality as values
284
285 $data -> {seq_id} = sequence_quality_corresponding_to_seq_id
286
287 =back
288
289 =cut
290
291 sub retrieveFastqQuality{
292 my ($self, $ids) = @_;
293 my $data;
294 my $nbSequences = 0;
295 if(! ref $ids){$ids = [$ids]}
296 $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");
297 my $fh = $self->{file_handle};
298 foreach my $id (@$ids){
299 my $cleanedId = $id;
300 if($id =~ /@(\S+)/){
301 $cleanedId = $1;
302 }
303 $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n");
304 if(exists $self->{index} -> {$cleanedId}){
305 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n");
306 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0);
307 my $quality .= <$fh>.<$fh>.<$fh>;
308 $quality = <$fh>;
309 $data .= $quality;
310 $nbSequences ++;
311 $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n")
312 }
313 else{
314 $logger->trace('id ' . $cleanedId. " not found in index\n");
315 }
316 }
317 $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n");
318 return $data;
319 }
320
321 =head2 retrieveFastqBlock
322
323 =head2
324
325 =head3 Description
326
327 Retrieve FASTQ formatted sequences using a list of ids
328
329 =head3 Arguments
330
331 =over 4
332
333 =item
334
335 A sequence id OR an array reference containing the list of sequences id to retrieve.
336
337 =back
338
339 =head3 Returns
340
341 =over 4
342
343 =item
344
345 A scalar containing the sequences corresponding to ids in FASTQ format
346
347 =back
348
349 =cut
350
351 sub retrieveFastqBlock{
352 my ($self, $ids) = @_;
353 my $data;
354 my $nbSequences = 0;
355
356 if(! ref $ids){$ids = [$ids]}
357
358 $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");
359 my $fh = $self->{file_handle};
360 foreach my $id (@$ids){
361 my $cleanedId = $id;
362 if($id =~ /@(\S+)/){
363 $cleanedId = $1;
364 }
365 $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n");
366 if(exists $self->{index} -> {$cleanedId}){
367 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n");
368 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0);
369 read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'});
370 $block .= <$fh>.<$fh>.<$fh>;
371 $data .= $block;
372 $nbSequences++;
373 $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n")
374 }
375 else{
376 $logger->trace('id ' . $cleanedId. " not found in index\n");
377 }
378 }
379 $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n");
380 return $data;
381 }
382 1;