Mercurial > repos > stheil > readpercontig_blat
view perl/lib/Fasta.pm @ 1:3203097d0a70 draft
Uploaded
author | stheil |
---|---|
date | Thu, 15 Oct 2015 09:53:48 -0400 |
parents | |
children |
line wrap: on
line source
package Tools::Fasta; use strict; use warnings; use Logger::Logger; use Data::Dumper; use Storable; =head1 INDEXED FASTA RELATED METHODS =head2 =head2 new =head2 =head3 Description Create a new Tools::Fasta object and index the FASTA file =head3 Arguments =over 4 =item A hash of parameters. Currently accepted keys are : 'file' => FASTA file path =back =head3 Returns =over 4 =item A Tools::Fasta object =back =cut sub new { my ($class, %attrs) = @_; my $self = {}; bless $self; if(defined($attrs{file})){ $self->{file} = $attrs{file}; open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); $self->indexFastaFile; } return $self; } =head2 indexFastaFile =head2 =head3 Description Index a FASTA file creating a hash reference with the following structure : $index -> {seq_id} = {'id_begin_position' => integer, 'sequence_end_position' => integer, 'description' => string, 'position' => integer} For each sequence id, the ">" symbol and all the text after space will be removed. This cleaned id will be used as key for the index. =head3 Arguments =over 4 =item None =back =head3 Returns =over 4 =item None =back =cut sub indexFastaFile { my ($self) = @_; $logger->info('Indexing file : '.$self->{file}."\n"); my $id; my $id_begin_position = 0; my $description; my $pos=0; my $fh = $self->{file_handle}; while(my $line = <$fh>){ if($line =~ /^>(\S+)\s?(.*)$/){ $id = $1; $description = $2; $pos++; $self->{index}->{$id} = {'id_begin_position' => $id_begin_position, 'sequence_end_position' => 0, 'description' => $description, 'position' => $pos}; } else{ $self->{index}->{$id}{'sequence_end_position'} += length $line; $logger->trace('TRACE: Indexing sequence' . $id . ' (id_begin_position : '. $self->{index}->{$id}{'id_begin_position'} . ', sequence_end_position : '. $self->{index}->{$id}{'sequence_end_position'} .') from ' . $self->{file} . "\n"); } $id_begin_position = tell($fh); } $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %{$self->{index}})." sequences)\n"); } =head2 loadFastaIndexFile =head2 =head3 Description Retrieve index from file using Storable module =head3 Arguments =over 4 An index file =back =head3 Returns =over 4 =item A hash reference corresponding to the index of the input FASTA file : $index -> {seq_id} = {'id_begin_position' => integer, 'sequence_end_position' => integer, 'description' => string, 'position' => integer} =back =cut sub loadFastaIndexFile { my ($self,$file) = @_; $self->{index} = retrieve($file); return $self->{index}; } =head2 writeFastaIndexFile =head2 =head3 Description Write index to file using Storable module =head3 Arguments =over 4 =item A hash reference corresponding to FASTA index. =item An output file path where to store the index. =back =head3 Returns =over 4 =item The output file path containing index =back =cut sub writeFastaIndexFile { my ($self,$file) = @_; $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); store $self->{index}, $file; $logger->info('File '.$file." is now created\n"); return $file; } =head2 retrieveFastaSequence =head2 =head3 Description Retrieve FASTA sequences using a list of ids =head3 Arguments =over 4 =item A sequence id OR an array reference containing the list of sequences id to retrieve. =back =head3 Returns =over 4 =item A hash reference containing sequences id as keys and sequences as values $data -> {seq_id} = sequence_corresponding_to_seq_id =back =cut sub retrieveFastaSequence { my ($self,$ids) = @_; my $data; my $nbSequences = 0; my $fh = $self->{file_handle}; if(! ref $ids){$ids = [$ids]} $logger->debug('Retrieving sequences of ids ['.join(', ', @$ids).'] from indexed file : '.$self->{file}."\n"); foreach my $id (@$ids){ my $cleanedId = $id; if($id =~ /^>?(\S*)/){$cleanedId = $1} $logger->trace('DEBUG: retrieving informations of id ' . $cleanedId. " from index\n"); if(exists $self->{index}->{$cleanedId}){ $logger->trace('DEBUG: id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index}->{$cleanedId}{'id_begin_position'}. ', sequence_end_position : '. $self->{index}->{$cleanedId}{'sequence_end_position'}.")\n"); seek($fh, $self->{index}->{$cleanedId}{'id_begin_position'}, 0); my $sequence = <$fh>; read($fh, $sequence, $self->{index}->{$cleanedId}{'sequence_end_position'}); $sequence =~ s/\n//g; $data->{$id} = $sequence; $nbSequences++; } else{ $logger->trace('DEBUG: id ' . $cleanedId. " not found in index\n"); } } $logger->trace($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); return $data; } =head2 retrieveFastaBlock =head2 =head3 Description Retrieve FASTA formatted sequences using a list of ids =head3 Arguments =over 4 =item A sequence id OR an array reference containing the list of sequences id to retrieve. =back =head3 Returns =over 4 =item A scalar containing the sequences corresponding to ids in FASTA format =back =cut sub retrieveFastaBlock { my ($self,$ids) = @_; my $data; my $nbSequences = 0; my $fh = $self->{file_handle}; if(! ref $ids){$ids = [$ids]} $logger->debug('Retrieving fasta block of ' . scalar(@$ids) . ' ids from indexed file : '.$self->{file}."\n"); foreach my $id (@$ids){ my $cleanedId = $id; if($id =~ /^>?(\S*)/){$cleanedId = $1} $logger->trace('TRACE: retrieving informations of id ' . $cleanedId. " from index\n"); if(exists $self->{index}->{$cleanedId}){ $logger->trace('TRACE: id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index}->{$cleanedId}{'id_begin_position'}. ', sequence_end_position : '. $self->{index}->{$cleanedId}{'sequence_end_position'}.")\n"); seek($fh, $self->{index}->{$cleanedId}{'id_begin_position'}, 0); my $blockId = <$fh>; read($fh, my $sequence, $self->{index}->{$cleanedId}{'sequence_end_position'}); $data .= $blockId . $sequence . "\n"; $nbSequences++; $logger->trace('TRACE: fasta block of id ' . $cleanedId . ' is : ' . "\n" . $blockId.$sequence . "\n") } else{ $logger->warn('WARN: id ' . $cleanedId. " not found in index\n"); } } $logger->debug($nbSequences . '/' . scalar(@$ids) . ' fasta block has been retrieved from indexed file ' . $self->{file} . "\n"); return $data; } 1;