Mercurial > repos > stheil > readpercontig_blat
comparison perl/lib/Fastq.pm @ 1:3203097d0a70 draft
Uploaded
author | stheil |
---|---|
date | Thu, 15 Oct 2015 09:53:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:9730db7c9ad3 | 1:3203097d0a70 |
---|---|
1 package Tools::Fastq; | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Logger::Logger; | |
6 use Storable; | |
7 | |
8 | |
9 =head1 INDEXED FASTQ RELATED METHODS | |
10 | |
11 =head2 | |
12 | |
13 =head2 new | |
14 | |
15 =head2 | |
16 | |
17 =head3 Description | |
18 | |
19 Create a new Tools::Fastq object and index the FASTQ file | |
20 | |
21 =head3 Arguments | |
22 | |
23 =over 4 | |
24 | |
25 =item | |
26 | |
27 A hash of parameters. | |
28 | |
29 Currently accepted keys are : | |
30 | |
31 'file' => FASTQ file path | |
32 | |
33 =back | |
34 | |
35 =head3 Returns | |
36 | |
37 =over 4 | |
38 | |
39 =item | |
40 | |
41 A Tools::Fastq object | |
42 | |
43 =back | |
44 | |
45 =cut | |
46 | |
47 sub new { | |
48 my ($class, %attrs) = @_; | |
49 my $self = {}; | |
50 bless $self; | |
51 if(defined($attrs{file})){ | |
52 $self->{file} = $attrs{file}; | |
53 open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); | |
54 $self->indexFastqFile; | |
55 } | |
56 return $self; | |
57 } | |
58 | |
59 =head2 indexFastqFile | |
60 | |
61 =head2 | |
62 | |
63 =head3 Description | |
64 | |
65 Index a FASTQ file creating a hash reference with the following structure : | |
66 | |
67 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} | |
68 | |
69 For each sequence id, the "@" symbol and all the text after space will be removed. | |
70 | |
71 This cleaned id will be used as key for the index. | |
72 | |
73 =head3 Arguments | |
74 | |
75 =over 4 | |
76 | |
77 =item | |
78 | |
79 None | |
80 | |
81 =back | |
82 | |
83 =head3 Returns | |
84 | |
85 =over 4 | |
86 | |
87 =item | |
88 | |
89 None | |
90 | |
91 =back | |
92 | |
93 =cut | |
94 | |
95 sub indexFastqFile{ | |
96 | |
97 my ($self) = @_; | |
98 $logger->info('Indexing file : '.$self->{file}."\n"); | |
99 my $index; | |
100 my $id; | |
101 my $id_begin_position = 0; | |
102 my $fh = $self->{file_handle}; | |
103 while(my $line = <$fh>){ | |
104 | |
105 if($line =~ /^@(\S+)/){ | |
106 | |
107 $id = $1; | |
108 chomp $id; | |
109 $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line}; | |
110 $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n"); | |
111 <$fh>; <$fh>; <$fh>; | |
112 } | |
113 | |
114 $id_begin_position = tell($fh); | |
115 } | |
116 | |
117 $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n"); | |
118 $self->{index} = $index; | |
119 } | |
120 | |
121 =head2 loadFastqIndexFile | |
122 | |
123 =head2 | |
124 | |
125 =head3 Description | |
126 | |
127 Retrieve index from file using Storable module | |
128 | |
129 =head3 Arguments | |
130 | |
131 =over 4 | |
132 | |
133 An index file | |
134 | |
135 =back | |
136 | |
137 =head3 Returns | |
138 | |
139 =over 4 | |
140 | |
141 =item | |
142 | |
143 A hash reference corresponding to the index of the input FASTQ file : | |
144 | |
145 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} | |
146 | |
147 =back | |
148 | |
149 =cut | |
150 | |
151 sub loadFastqIndexFile{ | |
152 | |
153 my ($self, $file) = @_; | |
154 $self->{index} = retrieve($file); | |
155 $logger->info('File '.$file." is now loaded\n"); | |
156 } | |
157 | |
158 =he=head2 writeFastaIndexFile | |
159 | |
160 =head2 | |
161 | |
162 =head3 Description | |
163 | |
164 Write index to file using Storable module | |
165 | |
166 =head3 Arguments | |
167 | |
168 =over 4 | |
169 | |
170 =item | |
171 | |
172 A hash reference corresponding to FASTQ index. | |
173 | |
174 =item | |
175 | |
176 An output file path where to store the index. | |
177 | |
178 =back | |
179 | |
180 =head3 Returns | |
181 | |
182 =over 4 | |
183 | |
184 =item | |
185 | |
186 The output file path containing index | |
187 | |
188 =back | |
189 | |
190 =cut | |
191 | |
192 sub writeFastqIndexFile{ | |
193 my ($self, $file) = @_; | |
194 $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); | |
195 store $self->{index}, $file; | |
196 $logger->info('File '.$file." is now created\n"); | |
197 } | |
198 | |
199 =head2 retrieveFastqSequence | |
200 | |
201 =head2 | |
202 | |
203 =head3 Description | |
204 | |
205 Retrieve FASTQ sequences using a list of ids | |
206 | |
207 =head3 Arguments | |
208 | |
209 =over 4 | |
210 | |
211 =item | |
212 | |
213 A sequence id OR an array reference containing the list of sequences id to retrieve. | |
214 | |
215 =back | |
216 | |
217 =head3 Returns | |
218 | |
219 =over 4 | |
220 | |
221 =item | |
222 | |
223 A hash reference containing sequences id as keys and sequences as values | |
224 | |
225 $data -> {seq_id} = sequence_corresponding_to_seq_id | |
226 | |
227 =back | |
228 | |
229 =cut | |
230 | |
231 sub retrieveFastqSequence{ | |
232 my ($self, $ids) = @_; | |
233 my $data={}; | |
234 my $nbSequences = 0; | |
235 if(! ref $ids){$ids = [$ids]} | |
236 $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); | |
237 my $fh = $self->{file_handle}; | |
238 foreach my $id (@$ids){ | |
239 my $cleanedId = $id; | |
240 if($id =~ /@(\S+)/){$cleanedId = $1} | |
241 $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); | |
242 if(exists $self->{index} -> {$cleanedId}){ | |
243 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); | |
244 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); | |
245 <$fh>; | |
246 my $sequence = <$fh>; | |
247 $data->{$id} = $sequence; | |
248 $nbSequences ++; | |
249 $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n") | |
250 } | |
251 else{ | |
252 $logger->trace('id ' . $cleanedId. " not found in index\n") | |
253 } | |
254 } | |
255 $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); | |
256 return $data; | |
257 } | |
258 | |
259 =head2 retrieveFastqQuality | |
260 | |
261 =head2 | |
262 | |
263 =head3 Description | |
264 | |
265 Retrieve FASTQ sequences quality using a list of ids | |
266 | |
267 =head3 Arguments | |
268 | |
269 =over 4 | |
270 | |
271 =item | |
272 | |
273 A sequence id OR an array reference containing the list of sequences id to retrieve quality. | |
274 | |
275 =back | |
276 | |
277 =head3 Returns | |
278 | |
279 =over 4 | |
280 | |
281 =item | |
282 | |
283 A hash reference containing sequences id as keys and sequences quality as values | |
284 | |
285 $data -> {seq_id} = sequence_quality_corresponding_to_seq_id | |
286 | |
287 =back | |
288 | |
289 =cut | |
290 | |
291 sub retrieveFastqQuality{ | |
292 my ($self, $ids) = @_; | |
293 my $data; | |
294 my $nbSequences = 0; | |
295 if(! ref $ids){$ids = [$ids]} | |
296 $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); | |
297 my $fh = $self->{file_handle}; | |
298 foreach my $id (@$ids){ | |
299 my $cleanedId = $id; | |
300 if($id =~ /@(\S+)/){ | |
301 $cleanedId = $1; | |
302 } | |
303 $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n"); | |
304 if(exists $self->{index} -> {$cleanedId}){ | |
305 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); | |
306 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); | |
307 my $quality .= <$fh>.<$fh>.<$fh>; | |
308 $quality = <$fh>; | |
309 $data .= $quality; | |
310 $nbSequences ++; | |
311 $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n") | |
312 } | |
313 else{ | |
314 $logger->trace('id ' . $cleanedId. " not found in index\n"); | |
315 } | |
316 } | |
317 $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n"); | |
318 return $data; | |
319 } | |
320 | |
321 =head2 retrieveFastqBlock | |
322 | |
323 =head2 | |
324 | |
325 =head3 Description | |
326 | |
327 Retrieve FASTQ formatted sequences using a list of ids | |
328 | |
329 =head3 Arguments | |
330 | |
331 =over 4 | |
332 | |
333 =item | |
334 | |
335 A sequence id OR an array reference containing the list of sequences id to retrieve. | |
336 | |
337 =back | |
338 | |
339 =head3 Returns | |
340 | |
341 =over 4 | |
342 | |
343 =item | |
344 | |
345 A scalar containing the sequences corresponding to ids in FASTQ format | |
346 | |
347 =back | |
348 | |
349 =cut | |
350 | |
351 sub retrieveFastqBlock{ | |
352 my ($self, $ids) = @_; | |
353 my $data; | |
354 my $nbSequences = 0; | |
355 | |
356 if(! ref $ids){$ids = [$ids]} | |
357 | |
358 $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); | |
359 my $fh = $self->{file_handle}; | |
360 foreach my $id (@$ids){ | |
361 my $cleanedId = $id; | |
362 if($id =~ /@(\S+)/){ | |
363 $cleanedId = $1; | |
364 } | |
365 $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); | |
366 if(exists $self->{index} -> {$cleanedId}){ | |
367 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); | |
368 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); | |
369 read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'}); | |
370 $block .= <$fh>.<$fh>.<$fh>; | |
371 $data .= $block; | |
372 $nbSequences++; | |
373 $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n") | |
374 } | |
375 else{ | |
376 $logger->trace('id ' . $cleanedId. " not found in index\n"); | |
377 } | |
378 } | |
379 $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n"); | |
380 return $data; | |
381 } | |
382 1; |