Mercurial > repos > stheil > readpercontig_blat
view perl/lib/Fastq.pm @ 2:ea81b455dbf6 draft default tip
Uploaded
author | stheil |
---|---|
date | Thu, 15 Oct 2015 10:12:03 -0400 |
parents | 3203097d0a70 |
children |
line wrap: on
line source
package Tools::Fastq; use strict; use warnings; use Logger::Logger; use Storable; =head1 INDEXED FASTQ RELATED METHODS =head2 =head2 new =head2 =head3 Description Create a new Tools::Fastq object and index the FASTQ file =head3 Arguments =over 4 =item A hash of parameters. Currently accepted keys are : 'file' => FASTQ file path =back =head3 Returns =over 4 =item A Tools::Fastq object =back =cut sub new { my ($class, %attrs) = @_; my $self = {}; bless $self; if(defined($attrs{file})){ $self->{file} = $attrs{file}; open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); $self->indexFastqFile; } return $self; } =head2 indexFastqFile =head2 =head3 Description Index a FASTQ file creating a hash reference with the following structure : $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} For each sequence id, the "@" symbol and all the text after space will be removed. This cleaned id will be used as key for the index. =head3 Arguments =over 4 =item None =back =head3 Returns =over 4 =item None =back =cut sub indexFastqFile{ my ($self) = @_; $logger->info('Indexing file : '.$self->{file}."\n"); my $index; my $id; my $id_begin_position = 0; my $fh = $self->{file_handle}; while(my $line = <$fh>){ if($line =~ /^@(\S+)/){ $id = $1; chomp $id; $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line}; $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n"); <$fh>; <$fh>; <$fh>; } $id_begin_position = tell($fh); } $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n"); $self->{index} = $index; } =head2 loadFastqIndexFile =head2 =head3 Description Retrieve index from file using Storable module =head3 Arguments =over 4 An index file =back =head3 Returns =over 4 =item A hash reference corresponding to the index of the input FASTQ file : $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} =back =cut sub loadFastqIndexFile{ my ($self, $file) = @_; $self->{index} = retrieve($file); $logger->info('File '.$file." is now loaded\n"); } =he=head2 writeFastaIndexFile =head2 =head3 Description Write index to file using Storable module =head3 Arguments =over 4 =item A hash reference corresponding to FASTQ index. =item An output file path where to store the index. =back =head3 Returns =over 4 =item The output file path containing index =back =cut sub writeFastqIndexFile{ my ($self, $file) = @_; $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); store $self->{index}, $file; $logger->info('File '.$file." is now created\n"); } =head2 retrieveFastqSequence =head2 =head3 Description Retrieve FASTQ sequences using a list of ids =head3 Arguments =over 4 =item A sequence id OR an array reference containing the list of sequences id to retrieve. =back =head3 Returns =over 4 =item A hash reference containing sequences id as keys and sequences as values $data -> {seq_id} = sequence_corresponding_to_seq_id =back =cut sub retrieveFastqSequence{ my ($self, $ids) = @_; my $data={}; my $nbSequences = 0; if(! ref $ids){$ids = [$ids]} $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); my $fh = $self->{file_handle}; foreach my $id (@$ids){ my $cleanedId = $id; if($id =~ /@(\S+)/){$cleanedId = $1} $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); if(exists $self->{index} -> {$cleanedId}){ $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); <$fh>; my $sequence = <$fh>; $data->{$id} = $sequence; $nbSequences ++; $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n") } else{ $logger->trace('id ' . $cleanedId. " not found in index\n") } } $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); return $data; } =head2 retrieveFastqQuality =head2 =head3 Description Retrieve FASTQ sequences quality using a list of ids =head3 Arguments =over 4 =item A sequence id OR an array reference containing the list of sequences id to retrieve quality. =back =head3 Returns =over 4 =item A hash reference containing sequences id as keys and sequences quality as values $data -> {seq_id} = sequence_quality_corresponding_to_seq_id =back =cut sub retrieveFastqQuality{ my ($self, $ids) = @_; my $data; my $nbSequences = 0; if(! ref $ids){$ids = [$ids]} $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); my $fh = $self->{file_handle}; foreach my $id (@$ids){ my $cleanedId = $id; if($id =~ /@(\S+)/){ $cleanedId = $1; } $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n"); if(exists $self->{index} -> {$cleanedId}){ $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); my $quality .= <$fh>.<$fh>.<$fh>; $quality = <$fh>; $data .= $quality; $nbSequences ++; $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n") } else{ $logger->trace('id ' . $cleanedId. " not found in index\n"); } } $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n"); return $data; } =head2 retrieveFastqBlock =head2 =head3 Description Retrieve FASTQ formatted sequences using a list of ids =head3 Arguments =over 4 =item A sequence id OR an array reference containing the list of sequences id to retrieve. =back =head3 Returns =over 4 =item A scalar containing the sequences corresponding to ids in FASTQ format =back =cut sub retrieveFastqBlock{ my ($self, $ids) = @_; my $data; my $nbSequences = 0; if(! ref $ids){$ids = [$ids]} $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); my $fh = $self->{file_handle}; foreach my $id (@$ids){ my $cleanedId = $id; if($id =~ /@(\S+)/){ $cleanedId = $1; } $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); if(exists $self->{index} -> {$cleanedId}){ $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'}); $block .= <$fh>.<$fh>.<$fh>; $data .= $block; $nbSequences++; $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n") } else{ $logger->trace('id ' . $cleanedId. " not found in index\n"); } } $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n"); return $data; } 1;