Mercurial > repos > stheil > readpercontig_blat
comparison perl/lib/Fastq.pm @ 1:3203097d0a70 draft
Uploaded
| author | stheil | 
|---|---|
| date | Thu, 15 Oct 2015 09:53:48 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:9730db7c9ad3 | 1:3203097d0a70 | 
|---|---|
| 1 package Tools::Fastq; | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use Logger::Logger; | |
| 6 use Storable; | |
| 7 | |
| 8 | |
| 9 =head1 INDEXED FASTQ RELATED METHODS | |
| 10 | |
| 11 =head2 | |
| 12 | |
| 13 =head2 new | |
| 14 | |
| 15 =head2 | |
| 16 | |
| 17 =head3 Description | |
| 18 | |
| 19 Create a new Tools::Fastq object and index the FASTQ file | |
| 20 | |
| 21 =head3 Arguments | |
| 22 | |
| 23 =over 4 | |
| 24 | |
| 25 =item | |
| 26 | |
| 27 A hash of parameters. | |
| 28 | |
| 29 Currently accepted keys are : | |
| 30 | |
| 31 'file' => FASTQ file path | |
| 32 | |
| 33 =back | |
| 34 | |
| 35 =head3 Returns | |
| 36 | |
| 37 =over 4 | |
| 38 | |
| 39 =item | |
| 40 | |
| 41 A Tools::Fastq object | |
| 42 | |
| 43 =back | |
| 44 | |
| 45 =cut | |
| 46 | |
| 47 sub new { | |
| 48 my ($class, %attrs) = @_; | |
| 49 my $self = {}; | |
| 50 bless $self; | |
| 51 if(defined($attrs{file})){ | |
| 52 $self->{file} = $attrs{file}; | |
| 53 open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); | |
| 54 $self->indexFastqFile; | |
| 55 } | |
| 56 return $self; | |
| 57 } | |
| 58 | |
| 59 =head2 indexFastqFile | |
| 60 | |
| 61 =head2 | |
| 62 | |
| 63 =head3 Description | |
| 64 | |
| 65 Index a FASTQ file creating a hash reference with the following structure : | |
| 66 | |
| 67 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} | |
| 68 | |
| 69 For each sequence id, the "@" symbol and all the text after space will be removed. | |
| 70 | |
| 71 This cleaned id will be used as key for the index. | |
| 72 | |
| 73 =head3 Arguments | |
| 74 | |
| 75 =over 4 | |
| 76 | |
| 77 =item | |
| 78 | |
| 79 None | |
| 80 | |
| 81 =back | |
| 82 | |
| 83 =head3 Returns | |
| 84 | |
| 85 =over 4 | |
| 86 | |
| 87 =item | |
| 88 | |
| 89 None | |
| 90 | |
| 91 =back | |
| 92 | |
| 93 =cut | |
| 94 | |
| 95 sub indexFastqFile{ | |
| 96 | |
| 97 my ($self) = @_; | |
| 98 $logger->info('Indexing file : '.$self->{file}."\n"); | |
| 99 my $index; | |
| 100 my $id; | |
| 101 my $id_begin_position = 0; | |
| 102 my $fh = $self->{file_handle}; | |
| 103 while(my $line = <$fh>){ | |
| 104 | |
| 105 if($line =~ /^@(\S+)/){ | |
| 106 | |
| 107 $id = $1; | |
| 108 chomp $id; | |
| 109 $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line}; | |
| 110 $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n"); | |
| 111 <$fh>; <$fh>; <$fh>; | |
| 112 } | |
| 113 | |
| 114 $id_begin_position = tell($fh); | |
| 115 } | |
| 116 | |
| 117 $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n"); | |
| 118 $self->{index} = $index; | |
| 119 } | |
| 120 | |
| 121 =head2 loadFastqIndexFile | |
| 122 | |
| 123 =head2 | |
| 124 | |
| 125 =head3 Description | |
| 126 | |
| 127 Retrieve index from file using Storable module | |
| 128 | |
| 129 =head3 Arguments | |
| 130 | |
| 131 =over 4 | |
| 132 | |
| 133 An index file | |
| 134 | |
| 135 =back | |
| 136 | |
| 137 =head3 Returns | |
| 138 | |
| 139 =over 4 | |
| 140 | |
| 141 =item | |
| 142 | |
| 143 A hash reference corresponding to the index of the input FASTQ file : | |
| 144 | |
| 145 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} | |
| 146 | |
| 147 =back | |
| 148 | |
| 149 =cut | |
| 150 | |
| 151 sub loadFastqIndexFile{ | |
| 152 | |
| 153 my ($self, $file) = @_; | |
| 154 $self->{index} = retrieve($file); | |
| 155 $logger->info('File '.$file." is now loaded\n"); | |
| 156 } | |
| 157 | |
| 158 =he=head2 writeFastaIndexFile | |
| 159 | |
| 160 =head2 | |
| 161 | |
| 162 =head3 Description | |
| 163 | |
| 164 Write index to file using Storable module | |
| 165 | |
| 166 =head3 Arguments | |
| 167 | |
| 168 =over 4 | |
| 169 | |
| 170 =item | |
| 171 | |
| 172 A hash reference corresponding to FASTQ index. | |
| 173 | |
| 174 =item | |
| 175 | |
| 176 An output file path where to store the index. | |
| 177 | |
| 178 =back | |
| 179 | |
| 180 =head3 Returns | |
| 181 | |
| 182 =over 4 | |
| 183 | |
| 184 =item | |
| 185 | |
| 186 The output file path containing index | |
| 187 | |
| 188 =back | |
| 189 | |
| 190 =cut | |
| 191 | |
| 192 sub writeFastqIndexFile{ | |
| 193 my ($self, $file) = @_; | |
| 194 $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); | |
| 195 store $self->{index}, $file; | |
| 196 $logger->info('File '.$file." is now created\n"); | |
| 197 } | |
| 198 | |
| 199 =head2 retrieveFastqSequence | |
| 200 | |
| 201 =head2 | |
| 202 | |
| 203 =head3 Description | |
| 204 | |
| 205 Retrieve FASTQ sequences using a list of ids | |
| 206 | |
| 207 =head3 Arguments | |
| 208 | |
| 209 =over 4 | |
| 210 | |
| 211 =item | |
| 212 | |
| 213 A sequence id OR an array reference containing the list of sequences id to retrieve. | |
| 214 | |
| 215 =back | |
| 216 | |
| 217 =head3 Returns | |
| 218 | |
| 219 =over 4 | |
| 220 | |
| 221 =item | |
| 222 | |
| 223 A hash reference containing sequences id as keys and sequences as values | |
| 224 | |
| 225 $data -> {seq_id} = sequence_corresponding_to_seq_id | |
| 226 | |
| 227 =back | |
| 228 | |
| 229 =cut | |
| 230 | |
| 231 sub retrieveFastqSequence{ | |
| 232 my ($self, $ids) = @_; | |
| 233 my $data={}; | |
| 234 my $nbSequences = 0; | |
| 235 if(! ref $ids){$ids = [$ids]} | |
| 236 $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); | |
| 237 my $fh = $self->{file_handle}; | |
| 238 foreach my $id (@$ids){ | |
| 239 my $cleanedId = $id; | |
| 240 if($id =~ /@(\S+)/){$cleanedId = $1} | |
| 241 $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); | |
| 242 if(exists $self->{index} -> {$cleanedId}){ | |
| 243 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); | |
| 244 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); | |
| 245 <$fh>; | |
| 246 my $sequence = <$fh>; | |
| 247 $data->{$id} = $sequence; | |
| 248 $nbSequences ++; | |
| 249 $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n") | |
| 250 } | |
| 251 else{ | |
| 252 $logger->trace('id ' . $cleanedId. " not found in index\n") | |
| 253 } | |
| 254 } | |
| 255 $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); | |
| 256 return $data; | |
| 257 } | |
| 258 | |
| 259 =head2 retrieveFastqQuality | |
| 260 | |
| 261 =head2 | |
| 262 | |
| 263 =head3 Description | |
| 264 | |
| 265 Retrieve FASTQ sequences quality using a list of ids | |
| 266 | |
| 267 =head3 Arguments | |
| 268 | |
| 269 =over 4 | |
| 270 | |
| 271 =item | |
| 272 | |
| 273 A sequence id OR an array reference containing the list of sequences id to retrieve quality. | |
| 274 | |
| 275 =back | |
| 276 | |
| 277 =head3 Returns | |
| 278 | |
| 279 =over 4 | |
| 280 | |
| 281 =item | |
| 282 | |
| 283 A hash reference containing sequences id as keys and sequences quality as values | |
| 284 | |
| 285 $data -> {seq_id} = sequence_quality_corresponding_to_seq_id | |
| 286 | |
| 287 =back | |
| 288 | |
| 289 =cut | |
| 290 | |
| 291 sub retrieveFastqQuality{ | |
| 292 my ($self, $ids) = @_; | |
| 293 my $data; | |
| 294 my $nbSequences = 0; | |
| 295 if(! ref $ids){$ids = [$ids]} | |
| 296 $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); | |
| 297 my $fh = $self->{file_handle}; | |
| 298 foreach my $id (@$ids){ | |
| 299 my $cleanedId = $id; | |
| 300 if($id =~ /@(\S+)/){ | |
| 301 $cleanedId = $1; | |
| 302 } | |
| 303 $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n"); | |
| 304 if(exists $self->{index} -> {$cleanedId}){ | |
| 305 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); | |
| 306 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); | |
| 307 my $quality .= <$fh>.<$fh>.<$fh>; | |
| 308 $quality = <$fh>; | |
| 309 $data .= $quality; | |
| 310 $nbSequences ++; | |
| 311 $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n") | |
| 312 } | |
| 313 else{ | |
| 314 $logger->trace('id ' . $cleanedId. " not found in index\n"); | |
| 315 } | |
| 316 } | |
| 317 $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n"); | |
| 318 return $data; | |
| 319 } | |
| 320 | |
| 321 =head2 retrieveFastqBlock | |
| 322 | |
| 323 =head2 | |
| 324 | |
| 325 =head3 Description | |
| 326 | |
| 327 Retrieve FASTQ formatted sequences using a list of ids | |
| 328 | |
| 329 =head3 Arguments | |
| 330 | |
| 331 =over 4 | |
| 332 | |
| 333 =item | |
| 334 | |
| 335 A sequence id OR an array reference containing the list of sequences id to retrieve. | |
| 336 | |
| 337 =back | |
| 338 | |
| 339 =head3 Returns | |
| 340 | |
| 341 =over 4 | |
| 342 | |
| 343 =item | |
| 344 | |
| 345 A scalar containing the sequences corresponding to ids in FASTQ format | |
| 346 | |
| 347 =back | |
| 348 | |
| 349 =cut | |
| 350 | |
| 351 sub retrieveFastqBlock{ | |
| 352 my ($self, $ids) = @_; | |
| 353 my $data; | |
| 354 my $nbSequences = 0; | |
| 355 | |
| 356 if(! ref $ids){$ids = [$ids]} | |
| 357 | |
| 358 $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); | |
| 359 my $fh = $self->{file_handle}; | |
| 360 foreach my $id (@$ids){ | |
| 361 my $cleanedId = $id; | |
| 362 if($id =~ /@(\S+)/){ | |
| 363 $cleanedId = $1; | |
| 364 } | |
| 365 $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); | |
| 366 if(exists $self->{index} -> {$cleanedId}){ | |
| 367 $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); | |
| 368 seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); | |
| 369 read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'}); | |
| 370 $block .= <$fh>.<$fh>.<$fh>; | |
| 371 $data .= $block; | |
| 372 $nbSequences++; | |
| 373 $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n") | |
| 374 } | |
| 375 else{ | |
| 376 $logger->trace('id ' . $cleanedId. " not found in index\n"); | |
| 377 } | |
| 378 } | |
| 379 $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n"); | |
| 380 return $data; | |
| 381 } | |
| 382 1; | 
