# HG changeset patch # User stheil # Date 1444917228 14400 # Node ID 3203097d0a70a7d4af31e9cbf447f5d1368de913 # Parent 9730db7c9ad3a48129ba23d6f66e220b0cbfecc9 Uploaded diff -r 9730db7c9ad3 -r 3203097d0a70 perl/lib/Fasta.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/lib/Fasta.pm Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,314 @@ +package Tools::Fasta; + +use strict; +use warnings; +use Logger::Logger; +use Data::Dumper; +use Storable; + +=head1 INDEXED FASTA RELATED METHODS + +=head2 + +=head2 new + +=head2 + +=head3 Description + +Create a new Tools::Fasta object and index the FASTA file + +=head3 Arguments + +=over 4 + +=item + +A hash of parameters. + +Currently accepted keys are : + +'file' => FASTA file path + +=back + +=head3 Returns + +=over 4 + +=item + +A Tools::Fasta object + +=back + +=cut + +sub new { + my ($class, %attrs) = @_; + my $self = {}; + bless $self; + if(defined($attrs{file})){ + $self->{file} = $attrs{file}; + open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); + $self->indexFastaFile; + } + return $self; +} + +=head2 indexFastaFile + +=head2 + +=head3 Description + +Index a FASTA file creating a hash reference with the following structure : + +$index -> {seq_id} = {'id_begin_position' => integer, 'sequence_end_position' => integer, 'description' => string, 'position' => integer} + +For each sequence id, the ">" symbol and all the text after space will be removed. + +This cleaned id will be used as key for the index. + +=head3 Arguments + +=over 4 + +=item + +None + +=back + +=head3 Returns + +=over 4 + +=item + +None + +=back + +=cut + +sub indexFastaFile { + my ($self) = @_; + $logger->info('Indexing file : '.$self->{file}."\n"); + my $id; + my $id_begin_position = 0; + my $description; + my $pos=0; + my $fh = $self->{file_handle}; + while(my $line = <$fh>){ + if($line =~ /^>(\S+)\s?(.*)$/){ + $id = $1; + $description = $2; + $pos++; + $self->{index}->{$id} = {'id_begin_position' => $id_begin_position, 'sequence_end_position' => 0, 'description' => $description, 'position' => $pos}; + } + else{ + $self->{index}->{$id}{'sequence_end_position'} += length $line; + $logger->trace('TRACE: Indexing sequence' . $id . ' (id_begin_position : '. $self->{index}->{$id}{'id_begin_position'} . ', sequence_end_position : '. $self->{index}->{$id}{'sequence_end_position'} .') from ' . $self->{file} . "\n"); + } + $id_begin_position = tell($fh); + } + $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %{$self->{index}})." sequences)\n"); +} + +=head2 loadFastaIndexFile + +=head2 + +=head3 Description + +Retrieve index from file using Storable module + +=head3 Arguments + +=over 4 + +An index file + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference corresponding to the index of the input FASTA file : + +$index -> {seq_id} = {'id_begin_position' => integer, 'sequence_end_position' => integer, 'description' => string, 'position' => integer} + +=back + +=cut + +sub loadFastaIndexFile { + my ($self,$file) = @_; + $self->{index} = retrieve($file); + return $self->{index}; +} + +=head2 writeFastaIndexFile + +=head2 + +=head3 Description + +Write index to file using Storable module + +=head3 Arguments + +=over 4 + +=item + +A hash reference corresponding to FASTA index. + +=item + +An output file path where to store the index. + +=back + +=head3 Returns + +=over 4 + +=item + +The output file path containing index + +=back + +=cut + +sub writeFastaIndexFile { + my ($self,$file) = @_; + $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); + store $self->{index}, $file; + $logger->info('File '.$file." is now created\n"); + return $file; +} + +=head2 retrieveFastaSequence + +=head2 + +=head3 Description + +Retrieve FASTA sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences as values + +$data -> {seq_id} = sequence_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastaSequence { + my ($self,$ids) = @_; + my $data; + my $nbSequences = 0; + my $fh = $self->{file_handle}; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequences of ids ['.join(', ', @$ids).'] from indexed file : '.$self->{file}."\n"); + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /^>?(\S*)/){$cleanedId = $1} + $logger->trace('DEBUG: retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index}->{$cleanedId}){ + $logger->trace('DEBUG: id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index}->{$cleanedId}{'id_begin_position'}. ', sequence_end_position : '. $self->{index}->{$cleanedId}{'sequence_end_position'}.")\n"); + seek($fh, $self->{index}->{$cleanedId}{'id_begin_position'}, 0); + my $sequence = <$fh>; + read($fh, $sequence, $self->{index}->{$cleanedId}{'sequence_end_position'}); + $sequence =~ s/\n//g; + $data->{$id} = $sequence; + $nbSequences++; + } + else{ + $logger->trace('DEBUG: id ' . $cleanedId. " not found in index\n"); + } + } + $logger->trace($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastaBlock + +=head2 + +=head3 Description + +Retrieve FASTA formatted sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A scalar containing the sequences corresponding to ids in FASTA format + +=back + +=cut + +sub retrieveFastaBlock { + my ($self,$ids) = @_; + my $data; + my $nbSequences = 0; + my $fh = $self->{file_handle}; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving fasta block of ' . scalar(@$ids) . ' ids from indexed file : '.$self->{file}."\n"); + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /^>?(\S*)/){$cleanedId = $1} + $logger->trace('TRACE: retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index}->{$cleanedId}){ + $logger->trace('TRACE: id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index}->{$cleanedId}{'id_begin_position'}. ', sequence_end_position : '. $self->{index}->{$cleanedId}{'sequence_end_position'}.")\n"); + seek($fh, $self->{index}->{$cleanedId}{'id_begin_position'}, 0); + my $blockId = <$fh>; + read($fh, my $sequence, $self->{index}->{$cleanedId}{'sequence_end_position'}); + $data .= $blockId . $sequence . "\n"; + $nbSequences++; + $logger->trace('TRACE: fasta block of id ' . $cleanedId . ' is : ' . "\n" . $blockId.$sequence . "\n") + } + else{ + $logger->warn('WARN: id ' . $cleanedId. " not found in index\n"); + } + } + $logger->debug($nbSequences . '/' . scalar(@$ids) . ' fasta block has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} +1; diff -r 9730db7c9ad3 -r 3203097d0a70 perl/lib/Fastq.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/lib/Fastq.pm Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,382 @@ +package Tools::Fastq; + +use strict; +use warnings; +use Logger::Logger; +use Storable; + + +=head1 INDEXED FASTQ RELATED METHODS + +=head2 + +=head2 new + +=head2 + +=head3 Description + +Create a new Tools::Fastq object and index the FASTQ file + +=head3 Arguments + +=over 4 + +=item + +A hash of parameters. + +Currently accepted keys are : + +'file' => FASTQ file path + +=back + +=head3 Returns + +=over 4 + +=item + +A Tools::Fastq object + +=back + +=cut + +sub new { + my ($class, %attrs) = @_; + my $self = {}; + bless $self; + if(defined($attrs{file})){ + $self->{file} = $attrs{file}; + open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); + $self->indexFastqFile; + } + return $self; +} + +=head2 indexFastqFile + +=head2 + +=head3 Description + +Index a FASTQ file creating a hash reference with the following structure : + +$index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} + +For each sequence id, the "@" symbol and all the text after space will be removed. + +This cleaned id will be used as key for the index. + +=head3 Arguments + +=over 4 + +=item + +None + +=back + +=head3 Returns + +=over 4 + +=item + +None + +=back + +=cut + +sub indexFastqFile{ + + my ($self) = @_; + $logger->info('Indexing file : '.$self->{file}."\n"); + my $index; + my $id; + my $id_begin_position = 0; + my $fh = $self->{file_handle}; + while(my $line = <$fh>){ + + if($line =~ /^@(\S+)/){ + + $id = $1; + chomp $id; + $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line}; + $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n"); + <$fh>; <$fh>; <$fh>; + } + + $id_begin_position = tell($fh); + } + + $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n"); + $self->{index} = $index; +} + +=head2 loadFastqIndexFile + +=head2 + +=head3 Description + +Retrieve index from file using Storable module + +=head3 Arguments + +=over 4 + +An index file + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference corresponding to the index of the input FASTQ file : + +$index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} + +=back + +=cut + +sub loadFastqIndexFile{ + + my ($self, $file) = @_; + $self->{index} = retrieve($file); + $logger->info('File '.$file." is now loaded\n"); +} + +=he=head2 writeFastaIndexFile + +=head2 + +=head3 Description + +Write index to file using Storable module + +=head3 Arguments + +=over 4 + +=item + +A hash reference corresponding to FASTQ index. + +=item + +An output file path where to store the index. + +=back + +=head3 Returns + +=over 4 + +=item + +The output file path containing index + +=back + +=cut + +sub writeFastqIndexFile{ + my ($self, $file) = @_; + $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); + store $self->{index}, $file; + $logger->info('File '.$file." is now created\n"); +} + +=head2 retrieveFastqSequence + +=head2 + +=head3 Description + +Retrieve FASTQ sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences as values + +$data -> {seq_id} = sequence_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastqSequence{ + my ($self, $ids) = @_; + my $data={}; + my $nbSequences = 0; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){$cleanedId = $1} + $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + <$fh>; + my $sequence = <$fh>; + $data->{$id} = $sequence; + $nbSequences ++; + $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n") + } + } + $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastqQuality + +=head2 + +=head3 Description + +Retrieve FASTQ sequences quality using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve quality. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences quality as values + +$data -> {seq_id} = sequence_quality_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastqQuality{ + my ($self, $ids) = @_; + my $data; + my $nbSequences = 0; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){ + $cleanedId = $1; + } + $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + my $quality .= <$fh>.<$fh>.<$fh>; + $quality = <$fh>; + $data .= $quality; + $nbSequences ++; + $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n"); + } + } + $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastqBlock + +=head2 + +=head3 Description + +Retrieve FASTQ formatted sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A scalar containing the sequences corresponding to ids in FASTQ format + +=back + +=cut + +sub retrieveFastqBlock{ + my ($self, $ids) = @_; + my $data; + my $nbSequences = 0; + + if(! ref $ids){$ids = [$ids]} + + $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){ + $cleanedId = $1; + } + $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'}); + $block .= <$fh>.<$fh>.<$fh>; + $data .= $block; + $nbSequences++; + $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n"); + } + } + $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} +1; diff -r 9730db7c9ad3 -r 3203097d0a70 perl/scripts/readPerContig.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/scripts/readPerContig.pl Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,179 @@ +#!/usr/bin/perl +use strict; +use Data::Dumper; +use Logger::Logger; +use File::Basename; +use Getopt::Long; +use Cwd 'abs_path' ; +use Tools::Fasta ; +use Tools::Fastq ; + + +my $ref=''; +my @p1=''; +my @p2=''; +my @read=''; +my $identity = 95; +my $coverage = 60; +my $verbosity=1; +my $format; +my $stat; +my $blatOutput=''; +my $statOutput=''; + +GetOptions( + "r=s" => \$ref, + "id=i" => \$identity, + "cov=i" => \$coverage, + "b|blat" => \$blatOutput, + "o|output=s" => \$statOutput, + "f=s" => \$format, + "s=s" => \$stat, + "v|verbosity=i" => \$verbosity +); + +Logger::Logger->changeMode($verbosity); + +&main; + +sub main { + my $self = {}; + bless $self; + $self->_setOptions(); + + $self->_checkFormat($format); + + $self->{ref_tool} = Tools::Fasta->new(file => $self->{reference}); + + $self->_parseBlat(); + + $self->_print($self->{_statistics}, $self->{statOutput}); +} + + +sub _parseBlat { + my ($self) = @_; + open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput}); + while(){ + chomp; + my @line = split(/\t/,$_); + my $tSize = $line[14]; + my $tStart = $line[15]; + my $tEnd = $line[16]; + my $coverage = (($tEnd-$tStart)/$tSize) * 100; + + if($coverage >= $self->{_covThreshold}){ + $self->{readNumber}->{$line[9]}++; + $self->{totalReadNumber}++; + $self->{size}->{$line[9]} = $line[10]; + $self->{reads_size_sum}->{$line[9]} += $tSize; + } + else{ + next; + } + } + close PSL; +} + + +sub _print { + my ($self, $mode, $statOutput) = @_; + $logger->info('Printing results ...'); + + if(!defined $mode){ + $mode = 'nb_reads'; + } + + if(!defined $statOutput){ + $statOutput = $self->{reference} . '.rn'; + } + + open(OUT,">$statOutput"); + foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){ + my $value = 0; + if(defined($self->{readNumber}->{$ref})){ + if($mode eq 'nb_reads'){ + $value = $self->{readNumber}->{$ref}; + } + elsif($mode eq 'rpkm'){ + $value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000); + } + elsif($mode eq 'mean_coverage'){ + $value = $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref}; + } + } + print OUT $ref . "\t" . $value . "\n"; + } + close OUT; +} + + +sub _setOptions { + my ($self) = @_; + if(-e $ref){ + $self->{reference} = abs_path($ref); + } + else{ + $logger->error('You must provide a reference.'); + &help; + } + if($identity >= 0 && $identity <= 100){ + $self->{_idThreshold} = $identity; + } + else{ + $logger->logdie('ERROR: identity must be between 0 and 100.'); + } + if($coverage >=0 && $coverage <= 100){ + $self->{_covThreshold} = $coverage; + } + else{ + $logger->logdie('ERROR: coverage must be between 0 and 100.'); + } + if($stat =~ /nb_reads|rpkm|mean_coverage/){ + $self->{_statistics} = $stat; + } + else{ + $logdie(''); + } + if($blatOutput eq ''){ + $self->{blatOutput} = 'blat.psl'; + } + else{ + $self->{blatOutput} = $blatOutput; + } + if($statOutput eq ''){ + my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/); + $statOutput = $filename . '.psl'; + } + else{ + $self->{statOutput} = $statOutput; + } +} + + +sub help { +my $prog = basename($0) ; +print STDERR < Rererence sequences file. + -id Identity threshold. (Default: $identity) + -cov Coverage threshold. (Default: $coverage) + -b Blat output file path + -o Output file path + -s Statistics to print : nb_reads, rpkm or mean_coverage. + -v Verbosity level. (0 -> 4). +EOF +exit(1); +} diff -r 9730db7c9ad3 -r 3203097d0a70 readPerContig.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readPerContig.xml Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,21 @@ + + + This script counts the number of read per contig using a Blat output. + Output is a 2 column tabular file. + + readPerContig.pl-r $reference -id $identity -cov $coverage -b blat_result -v 0 -s $statistics -o $output + + + + + + + + + + + + + + + diff -r 9730db7c9ad3 -r 3203097d0a70 tool_dependencies.xml --- a/tool_dependencies.xml Thu Oct 15 05:52:00 2015 -0400 +++ b/tool_dependencies.xml Thu Oct 15 09:53:48 2015 -0400 @@ -11,12 +11,19 @@ + + perl + $INSTALL_DIR + Data::Dumper File::Basename Getopt::Long Cwd - $REPOSITORY_INSTALL_DIR/perl/lib + $INSTALL_DIR/perl/lib + + + $INSTALL_DIR/perl/scripts