| 
1
 | 
     1 package Tools::Fastq;
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use Logger::Logger;
 | 
| 
 | 
     6 use Storable;
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 =head1 INDEXED FASTQ RELATED METHODS
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 =head2
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 =head2 new
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 =head2
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 =head3 Description
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 Create a new Tools::Fastq object and index the FASTQ file
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 =head3 Arguments
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 =over 4
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 =item
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 A hash of parameters.
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 Currently accepted keys are :
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 'file' => FASTQ file path
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 =back
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 =head3 Returns
 | 
| 
 | 
    36 
 | 
| 
 | 
    37 =over 4
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 =item
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 A Tools::Fastq object
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 =back
 | 
| 
 | 
    44 
 | 
| 
 | 
    45 =cut
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 sub new {
 | 
| 
 | 
    48 	my ($class, %attrs) = @_;
 | 
| 
 | 
    49 	my $self = {};
 | 
| 
 | 
    50 	bless $self;
 | 
| 
 | 
    51 	if(defined($attrs{file})){
 | 
| 
 | 
    52 		$self->{file} = $attrs{file};
 | 
| 
 | 
    53 		open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n");
 | 
| 
 | 
    54 		$self->indexFastqFile;
 | 
| 
 | 
    55 	}
 | 
| 
 | 
    56 	return $self;
 | 
| 
 | 
    57 }
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 =head2 indexFastqFile
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 =head2
 | 
| 
 | 
    62 
 | 
| 
 | 
    63 =head3 Description
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 Index a FASTQ file creating a hash reference with the following structure :
 | 
| 
 | 
    66 
 | 
| 
 | 
    67 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer}
 | 
| 
 | 
    68 
 | 
| 
 | 
    69 For each sequence id, the "@" symbol and all the text after space will be removed.
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 This cleaned id will be used as key for the index.
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 =head3 Arguments
 | 
| 
 | 
    74 
 | 
| 
 | 
    75 =over 4
 | 
| 
 | 
    76 
 | 
| 
 | 
    77 =item
 | 
| 
 | 
    78 
 | 
| 
 | 
    79 None
 | 
| 
 | 
    80 
 | 
| 
 | 
    81 =back
 | 
| 
 | 
    82 
 | 
| 
 | 
    83 =head3 Returns
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 =over 4
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 =item
 | 
| 
 | 
    88 
 | 
| 
 | 
    89 None
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 =back
 | 
| 
 | 
    92 
 | 
| 
 | 
    93 =cut
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 sub indexFastqFile{
 | 
| 
 | 
    96 
 | 
| 
 | 
    97 	  my ($self) = @_;
 | 
| 
 | 
    98     $logger->info('Indexing file : '.$self->{file}."\n");
 | 
| 
 | 
    99     my $index;
 | 
| 
 | 
   100     my $id;
 | 
| 
 | 
   101     my $id_begin_position = 0;
 | 
| 
 | 
   102 		my $fh = $self->{file_handle};
 | 
| 
 | 
   103     while(my $line = <$fh>){
 | 
| 
 | 
   104 
 | 
| 
 | 
   105         if($line =~ /^@(\S+)/){
 | 
| 
 | 
   106 
 | 
| 
 | 
   107             $id = $1;
 | 
| 
 | 
   108             chomp $id;
 | 
| 
 | 
   109             $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line};
 | 
| 
 | 
   110             $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n");
 | 
| 
 | 
   111             <$fh>; <$fh>; <$fh>;
 | 
| 
 | 
   112         }
 | 
| 
 | 
   113 
 | 
| 
 | 
   114         $id_begin_position = tell($fh);
 | 
| 
 | 
   115     }
 | 
| 
 | 
   116 
 | 
| 
 | 
   117     $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n");
 | 
| 
 | 
   118     $self->{index} = $index;
 | 
| 
 | 
   119 }
 | 
| 
 | 
   120 
 | 
| 
 | 
   121 =head2 loadFastqIndexFile
 | 
| 
 | 
   122 
 | 
| 
 | 
   123 =head2
 | 
| 
 | 
   124 
 | 
| 
 | 
   125 =head3 Description
 | 
| 
 | 
   126 
 | 
| 
 | 
   127 Retrieve index from file using Storable module
 | 
| 
 | 
   128 
 | 
| 
 | 
   129 =head3 Arguments
 | 
| 
 | 
   130 
 | 
| 
 | 
   131 =over 4
 | 
| 
 | 
   132 
 | 
| 
 | 
   133 An index file
 | 
| 
 | 
   134 
 | 
| 
 | 
   135 =back
 | 
| 
 | 
   136 
 | 
| 
 | 
   137 =head3 Returns
 | 
| 
 | 
   138 
 | 
| 
 | 
   139 =over 4
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 =item
 | 
| 
 | 
   142 
 | 
| 
 | 
   143 A hash reference corresponding to the index of the input FASTQ file :
 | 
| 
 | 
   144 
 | 
| 
 | 
   145 $index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer}
 | 
| 
 | 
   146 
 | 
| 
 | 
   147 =back
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 =cut
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 sub loadFastqIndexFile{
 | 
| 
 | 
   152 
 | 
| 
 | 
   153 	my ($self, $file) = @_;
 | 
| 
 | 
   154   $self->{index} = retrieve($file);
 | 
| 
 | 
   155   $logger->info('File '.$file." is now loaded\n");
 | 
| 
 | 
   156 }
 | 
| 
 | 
   157 
 | 
| 
 | 
    158 =head2 writeFastqIndexFile
 | 
| 
 | 
   159 
 | 
| 
 | 
   160 =head2
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 =head3 Description
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 Write index to file using Storable module
 | 
| 
 | 
   165 
 | 
| 
 | 
   166 =head3 Arguments
 | 
| 
 | 
   167 
 | 
| 
 | 
   168 =over 4
 | 
| 
 | 
   169 
 | 
| 
 | 
   170 =item
 | 
| 
 | 
   171 
 | 
| 
 | 
   172 A hash reference corresponding to FASTQ index.
 | 
| 
 | 
   173 
 | 
| 
 | 
   174 =item
 | 
| 
 | 
   175 
 | 
| 
 | 
   176 An output file path where to store the index.
 | 
| 
 | 
   177 
 | 
| 
 | 
   178 =back
 | 
| 
 | 
   179 
 | 
| 
 | 
   180 =head3 Returns
 | 
| 
 | 
   181 
 | 
| 
 | 
   182 =over 4
 | 
| 
 | 
   183 
 | 
| 
 | 
   184 =item
 | 
| 
 | 
   185 
 | 
| 
 | 
   186 The output file path containing index
 | 
| 
 | 
   187 
 | 
| 
 | 
   188 =back
 | 
| 
 | 
   189 
 | 
| 
 | 
   190 =cut
 | 
| 
 | 
   191 
 | 
| 
 | 
   192 sub writeFastqIndexFile{
 | 
| 
 | 
   193 	my ($self, $file) = @_;
 | 
| 
 | 
   194   $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n");
 | 
| 
 | 
   195   store $self->{index}, $file;
 | 
| 
 | 
   196   $logger->info('File '.$file." is now created\n");
 | 
| 
 | 
   197 }
 | 
| 
 | 
   198 
 | 
| 
 | 
   199 =head2 retrieveFastqSequence
 | 
| 
 | 
   200 
 | 
| 
 | 
   201 =head2
 | 
| 
 | 
   202 
 | 
| 
 | 
   203 =head3 Description
 | 
| 
 | 
   204 
 | 
| 
 | 
   205 Retrieve FASTQ sequences using a list of ids
 | 
| 
 | 
   206 
 | 
| 
 | 
   207 =head3 Arguments
 | 
| 
 | 
   208 
 | 
| 
 | 
   209 =over 4
 | 
| 
 | 
   210 
 | 
| 
 | 
   211 =item
 | 
| 
 | 
   212 
 | 
| 
 | 
   213 A sequence id OR an array reference containing the list of sequences id to retrieve.
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 =back
 | 
| 
 | 
   216 
 | 
| 
 | 
   217 =head3 Returns
 | 
| 
 | 
   218 
 | 
| 
 | 
   219 =over 4
 | 
| 
 | 
   220 
 | 
| 
 | 
   221 =item
 | 
| 
 | 
   222 
 | 
| 
 | 
   223 A hash reference containing sequences id as keys and sequences as values
 | 
| 
 | 
   224 
 | 
| 
 | 
   225 $data -> {seq_id} = sequence_corresponding_to_seq_id
 | 
| 
 | 
   226 
 | 
| 
 | 
   227 =back
 | 
| 
 | 
   228 
 | 
| 
 | 
   229 =cut
 | 
| 
 | 
   230 
 | 
| 
 | 
   231 sub retrieveFastqSequence{
 | 
| 
 | 
   232 	my ($self, $ids) = @_;
 | 
| 
 | 
   233 	my $data={};
 | 
| 
 | 
   234   my $nbSequences = 0;
 | 
| 
 | 
   235   if(! ref $ids){$ids = [$ids]}
 | 
| 
 | 
   236   $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");
 | 
| 
 | 
   237 	my $fh = $self->{file_handle};
 | 
| 
 | 
   238 	foreach my $id (@$ids){
 | 
| 
 | 
   239 		my $cleanedId = $id;
 | 
| 
 | 
   240       if($id =~ /@(\S+)/){$cleanedId = $1}
 | 
| 
 | 
   241       $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n");
 | 
| 
 | 
   242       if(exists $self->{index} -> {$cleanedId}){
 | 
| 
 | 
   243         $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n");
 | 
| 
 | 
   244         seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0);
 | 
| 
 | 
   245         <$fh>;
 | 
| 
 | 
   246         my $sequence = <$fh>;
 | 
| 
 | 
   247         $data->{$id} = $sequence;
 | 
| 
 | 
   248         $nbSequences ++;
 | 
| 
 | 
   249       	$logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n")
 | 
| 
 | 
   250     	}
 | 
| 
 | 
   251     	else{
 | 
| 
 | 
   252 				$logger->trace('id ' . $cleanedId. " not found in index\n")
 | 
| 
 | 
   253 			}
 | 
| 
 | 
   254 	}
 | 
| 
 | 
   255   $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n");
 | 
| 
 | 
   256 	return $data;
 | 
| 
 | 
   257 }
 | 
| 
 | 
   258 
 | 
| 
 | 
   259 =head2 retrieveFastqQuality
 | 
| 
 | 
   260 
 | 
| 
 | 
   261 =head2
 | 
| 
 | 
   262 
 | 
| 
 | 
   263 =head3 Description
 | 
| 
 | 
   264 
 | 
| 
 | 
   265 Retrieve FASTQ sequences quality using a list of ids
 | 
| 
 | 
   266 
 | 
| 
 | 
   267 =head3 Arguments
 | 
| 
 | 
   268 
 | 
| 
 | 
   269 =over 4
 | 
| 
 | 
   270 
 | 
| 
 | 
   271 =item
 | 
| 
 | 
   272 
 | 
| 
 | 
   273 A sequence id OR an array reference containing the list of sequences id to retrieve quality.
 | 
| 
 | 
   274 
 | 
| 
 | 
   275 =back
 | 
| 
 | 
   276 
 | 
| 
 | 
   277 =head3 Returns
 | 
| 
 | 
   278 
 | 
| 
 | 
   279 =over 4
 | 
| 
 | 
   280 
 | 
| 
 | 
   281 =item
 | 
| 
 | 
   282 
 | 
| 
 | 
   283 A hash reference containing sequences id as keys and sequences quality as values
 | 
| 
 | 
   284 
 | 
| 
 | 
   285 $data -> {seq_id} = sequence_quality_corresponding_to_seq_id
 | 
| 
 | 
   286 
 | 
| 
 | 
   287 =back
 | 
| 
 | 
   288 
 | 
| 
 | 
   289 =cut
 | 
| 
 | 
   290 
 | 
| 
 | 
   291 sub retrieveFastqQuality{
 | 
| 
 | 
   292 	my ($self, $ids) = @_;
 | 
| 
 | 
   293 	my $data;
 | 
| 
 | 
   294   my $nbSequences = 0;
 | 
| 
 | 
   295   if(! ref $ids){$ids = [$ids]}
 | 
| 
 | 
   296   $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");
 | 
| 
 | 
   297 	my $fh = $self->{file_handle};
 | 
| 
 | 
   298 	foreach my $id (@$ids){
 | 
| 
 | 
   299 		my $cleanedId = $id;
 | 
| 
 | 
   300 		if($id =~ /@(\S+)/){
 | 
| 
 | 
   301 			$cleanedId = $1;
 | 
| 
 | 
   302 		}
 | 
| 
 | 
   303     $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n");
 | 
| 
 | 
   304     if(exists $self->{index} -> {$cleanedId}){
 | 
| 
 | 
   305       $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n");
 | 
| 
 | 
   306       seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0);
 | 
| 
 | 
   307       my $quality .= <$fh>.<$fh>.<$fh>;
 | 
| 
 | 
   308       $quality = <$fh>;
 | 
| 
 | 
   309       $data .= $quality;
 | 
| 
 | 
   310       $nbSequences ++;
 | 
| 
 | 
   311       $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n")
 | 
| 
 | 
   312     }
 | 
| 
 | 
   313     else{
 | 
| 
 | 
   314 			$logger->trace('id ' . $cleanedId. " not found in index\n");
 | 
| 
 | 
   315 		}
 | 
| 
 | 
   316 	}
 | 
| 
 | 
   317   $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n");
 | 
| 
 | 
   318 	return $data;
 | 
| 
 | 
   319 }
 | 
| 
 | 
   320 
 | 
| 
 | 
   321 =head2 retrieveFastqBlock
 | 
| 
 | 
   322 
 | 
| 
 | 
   323 =head2
 | 
| 
 | 
   324 
 | 
| 
 | 
   325 =head3 Description
 | 
| 
 | 
   326 
 | 
| 
 | 
   327 Retrieve FASTQ formatted sequences using a list of ids
 | 
| 
 | 
   328 
 | 
| 
 | 
   329 =head3 Arguments
 | 
| 
 | 
   330 
 | 
| 
 | 
   331 =over 4
 | 
| 
 | 
   332 
 | 
| 
 | 
   333 =item
 | 
| 
 | 
   334 
 | 
| 
 | 
   335 A sequence id OR an array reference containing the list of sequences id to retrieve.
 | 
| 
 | 
   336 
 | 
| 
 | 
   337 =back
 | 
| 
 | 
   338 
 | 
| 
 | 
   339 =head3 Returns
 | 
| 
 | 
   340 
 | 
| 
 | 
   341 =over 4
 | 
| 
 | 
   342 
 | 
| 
 | 
   343 =item
 | 
| 
 | 
   344 
 | 
| 
 | 
   345 A scalar containing the sequences corresponding to ids in FASTQ format
 | 
| 
 | 
   346 
 | 
| 
 | 
   347 =back
 | 
| 
 | 
   348 
 | 
| 
 | 
   349 =cut
 | 
| 
 | 
   350 
 | 
| 
 | 
   351 sub retrieveFastqBlock{
 | 
| 
 | 
   352 	my ($self, $ids) = @_;
 | 
| 
 | 
   353 	my $data;
 | 
| 
 | 
   354   my $nbSequences = 0;
 | 
| 
 | 
   355 
 | 
| 
 | 
   356   if(! ref $ids){$ids = [$ids]}
 | 
| 
 | 
   357 
 | 
| 
 | 
   358   $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n");
 | 
| 
 | 
   359 	my $fh = $self->{file_handle};
 | 
| 
 | 
   360 	foreach my $id (@$ids){
 | 
| 
 | 
   361 		my $cleanedId = $id;
 | 
| 
 | 
   362 		if($id =~ /@(\S+)/){
 | 
| 
 | 
   363 			$cleanedId = $1;
 | 
| 
 | 
   364 		}
 | 
| 
 | 
   365     $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n");
 | 
| 
 | 
   366     if(exists $self->{index} -> {$cleanedId}){
 | 
| 
 | 
   367       $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n");
 | 
| 
 | 
   368       seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0);
 | 
| 
 | 
   369     	read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'});
 | 
| 
 | 
   370       $block .= <$fh>.<$fh>.<$fh>;
 | 
| 
 | 
   371       $data .= $block;
 | 
| 
 | 
   372       $nbSequences++;
 | 
| 
 | 
   373       $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n")
 | 
| 
 | 
   374     }
 | 
| 
 | 
   375     else{
 | 
| 
 | 
   376 			$logger->trace('id ' . $cleanedId. " not found in index\n");
 | 
| 
 | 
   377 		}
 | 
| 
 | 
   378 	}
 | 
| 
 | 
   379   $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n");
 | 
| 
 | 
   380 	return $data;
 | 
| 
 | 
   381 }
 | 
| 
 | 
   382 1;
 |