Mercurial > repos > stheil > readpercontig_blat
diff perl/lib/Fastq.pm @ 1:3203097d0a70 draft
Uploaded
author | stheil |
---|---|
date | Thu, 15 Oct 2015 09:53:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/lib/Fastq.pm Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,382 @@ +package Tools::Fastq; + +use strict; +use warnings; +use Logger::Logger; +use Storable; + + +=head1 INDEXED FASTQ RELATED METHODS + +=head2 + +=head2 new + +=head2 + +=head3 Description + +Create a new Tools::Fastq object and index the FASTQ file + +=head3 Arguments + +=over 4 + +=item + +A hash of parameters. + +Currently accepted keys are : + +'file' => FASTQ file path + +=back + +=head3 Returns + +=over 4 + +=item + +A Tools::Fastq object + +=back + +=cut + +sub new { + my ($class, %attrs) = @_; + my $self = {}; + bless $self; + if(defined($attrs{file})){ + $self->{file} = $attrs{file}; + open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); + $self->indexFastqFile; + } + return $self; +} + +=head2 indexFastqFile + +=head2 + +=head3 Description + +Index a FASTQ file creating a hash reference with the following structure : + +$index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} + +For each sequence id, the "@" symbol and all the text after space will be removed. + +This cleaned id will be used as key for the index. + +=head3 Arguments + +=over 4 + +=item + +None + +=back + +=head3 Returns + +=over 4 + +=item + +None + +=back + +=cut + +sub indexFastqFile{ + + my ($self) = @_; + $logger->info('Indexing file : '.$self->{file}."\n"); + my $index; + my $id; + my $id_begin_position = 0; + my $fh = $self->{file_handle}; + while(my $line = <$fh>){ + + if($line =~ /^@(\S+)/){ + + $id = $1; + chomp $id; + $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line}; + $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n"); + <$fh>; <$fh>; <$fh>; + } + + $id_begin_position = tell($fh); + } + + $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n"); + $self->{index} = $index; +} + +=head2 loadFastqIndexFile + +=head2 + +=head3 Description + +Retrieve index from file using Storable module + +=head3 Arguments + +=over 4 + +An index file + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference corresponding to the index of the input FASTQ file : + +$index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} + +=back + +=cut + +sub loadFastqIndexFile{ + + my ($self, $file) = @_; + $self->{index} = retrieve($file); + $logger->info('File '.$file." is now loaded\n"); +} + +=he=head2 writeFastaIndexFile + +=head2 + +=head3 Description + +Write index to file using Storable module + +=head3 Arguments + +=over 4 + +=item + +A hash reference corresponding to FASTQ index. + +=item + +An output file path where to store the index. + +=back + +=head3 Returns + +=over 4 + +=item + +The output file path containing index + +=back + +=cut + +sub writeFastqIndexFile{ + my ($self, $file) = @_; + $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); + store $self->{index}, $file; + $logger->info('File '.$file." is now created\n"); +} + +=head2 retrieveFastqSequence + +=head2 + +=head3 Description + +Retrieve FASTQ sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences as values + +$data -> {seq_id} = sequence_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastqSequence{ + my ($self, $ids) = @_; + my $data={}; + my $nbSequences = 0; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){$cleanedId = $1} + $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + <$fh>; + my $sequence = <$fh>; + $data->{$id} = $sequence; + $nbSequences ++; + $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n") + } + } + $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastqQuality + +=head2 + +=head3 Description + +Retrieve FASTQ sequences quality using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve quality. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences quality as values + +$data -> {seq_id} = sequence_quality_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastqQuality{ + my ($self, $ids) = @_; + my $data; + my $nbSequences = 0; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){ + $cleanedId = $1; + } + $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + my $quality .= <$fh>.<$fh>.<$fh>; + $quality = <$fh>; + $data .= $quality; + $nbSequences ++; + $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n"); + } + } + $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastqBlock + +=head2 + +=head3 Description + +Retrieve FASTQ formatted sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A scalar containing the sequences corresponding to ids in FASTQ format + +=back + +=cut + +sub retrieveFastqBlock{ + my ($self, $ids) = @_; + my $data; + my $nbSequences = 0; + + if(! ref $ids){$ids = [$ids]} + + $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){ + $cleanedId = $1; + } + $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'}); + $block .= <$fh>.<$fh>.<$fh>; + $data .= $block; + $nbSequences++; + $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n"); + } + } + $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} +1;