Mercurial > repos > stheil > readpercontig_blat
changeset 1:3203097d0a70 draft
Uploaded
author | stheil |
---|---|
date | Thu, 15 Oct 2015 09:53:48 -0400 |
parents | 9730db7c9ad3 |
children | ea81b455dbf6 |
files | perl/lib/Fasta.pm perl/lib/Fastq.pm perl/scripts/readPerContig.pl readPerContig.xml tool_dependencies.xml |
diffstat | 5 files changed, 904 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/lib/Fasta.pm Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,314 @@ +package Tools::Fasta; + +use strict; +use warnings; +use Logger::Logger; +use Data::Dumper; +use Storable; + +=head1 INDEXED FASTA RELATED METHODS + +=head2 + +=head2 new + +=head2 + +=head3 Description + +Create a new Tools::Fasta object and index the FASTA file + +=head3 Arguments + +=over 4 + +=item + +A hash of parameters. + +Currently accepted keys are : + +'file' => FASTA file path + +=back + +=head3 Returns + +=over 4 + +=item + +A Tools::Fasta object + +=back + +=cut + +sub new { + my ($class, %attrs) = @_; + my $self = {}; + bless $self; + if(defined($attrs{file})){ + $self->{file} = $attrs{file}; + open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); + $self->indexFastaFile; + } + return $self; +} + +=head2 indexFastaFile + +=head2 + +=head3 Description + +Index a FASTA file creating a hash reference with the following structure : + +$index -> {seq_id} = {'id_begin_position' => integer, 'sequence_end_position' => integer, 'description' => string, 'position' => integer} + +For each sequence id, the ">" symbol and all the text after space will be removed. + +This cleaned id will be used as key for the index. + +=head3 Arguments + +=over 4 + +=item + +None + +=back + +=head3 Returns + +=over 4 + +=item + +None + +=back + +=cut + +sub indexFastaFile { + my ($self) = @_; + $logger->info('Indexing file : '.$self->{file}."\n"); + my $id; + my $id_begin_position = 0; + my $description; + my $pos=0; + my $fh = $self->{file_handle}; + while(my $line = <$fh>){ + if($line =~ /^>(\S+)\s?(.*)$/){ + $id = $1; + $description = $2; + $pos++; + $self->{index}->{$id} = {'id_begin_position' => $id_begin_position, 'sequence_end_position' => 0, 'description' => $description, 'position' => $pos}; + } + else{ + $self->{index}->{$id}{'sequence_end_position'} += length $line; + $logger->trace('TRACE: Indexing sequence' . $id . ' (id_begin_position : '. $self->{index}->{$id}{'id_begin_position'} . ', sequence_end_position : '. $self->{index}->{$id}{'sequence_end_position'} .') from ' . $self->{file} . "\n"); + } + $id_begin_position = tell($fh); + } + $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %{$self->{index}})." sequences)\n"); +} + +=head2 loadFastaIndexFile + +=head2 + +=head3 Description + +Retrieve index from file using Storable module + +=head3 Arguments + +=over 4 + +An index file + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference corresponding to the index of the input FASTA file : + +$index -> {seq_id} = {'id_begin_position' => integer, 'sequence_end_position' => integer, 'description' => string, 'position' => integer} + +=back + +=cut + +sub loadFastaIndexFile { + my ($self,$file) = @_; + $self->{index} = retrieve($file); + return $self->{index}; +} + +=head2 writeFastaIndexFile + +=head2 + +=head3 Description + +Write index to file using Storable module + +=head3 Arguments + +=over 4 + +=item + +A hash reference corresponding to FASTA index. + +=item + +An output file path where to store the index. + +=back + +=head3 Returns + +=over 4 + +=item + +The output file path containing index + +=back + +=cut + +sub writeFastaIndexFile { + my ($self,$file) = @_; + $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); + store $self->{index}, $file; + $logger->info('File '.$file." is now created\n"); + return $file; +} + +=head2 retrieveFastaSequence + +=head2 + +=head3 Description + +Retrieve FASTA sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences as values + +$data -> {seq_id} = sequence_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastaSequence { + my ($self,$ids) = @_; + my $data; + my $nbSequences = 0; + my $fh = $self->{file_handle}; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequences of ids ['.join(', ', @$ids).'] from indexed file : '.$self->{file}."\n"); + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /^>?(\S*)/){$cleanedId = $1} + $logger->trace('DEBUG: retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index}->{$cleanedId}){ + $logger->trace('DEBUG: id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index}->{$cleanedId}{'id_begin_position'}. ', sequence_end_position : '. $self->{index}->{$cleanedId}{'sequence_end_position'}.")\n"); + seek($fh, $self->{index}->{$cleanedId}{'id_begin_position'}, 0); + my $sequence = <$fh>; + read($fh, $sequence, $self->{index}->{$cleanedId}{'sequence_end_position'}); + $sequence =~ s/\n//g; + $data->{$id} = $sequence; + $nbSequences++; + } + else{ + $logger->trace('DEBUG: id ' . $cleanedId. " not found in index\n"); + } + } + $logger->trace($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastaBlock + +=head2 + +=head3 Description + +Retrieve FASTA formatted sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A scalar containing the sequences corresponding to ids in FASTA format + +=back + +=cut + +sub retrieveFastaBlock { + my ($self,$ids) = @_; + my $data; + my $nbSequences = 0; + my $fh = $self->{file_handle}; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving fasta block of ' . scalar(@$ids) . ' ids from indexed file : '.$self->{file}."\n"); + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /^>?(\S*)/){$cleanedId = $1} + $logger->trace('TRACE: retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index}->{$cleanedId}){ + $logger->trace('TRACE: id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index}->{$cleanedId}{'id_begin_position'}. ', sequence_end_position : '. $self->{index}->{$cleanedId}{'sequence_end_position'}.")\n"); + seek($fh, $self->{index}->{$cleanedId}{'id_begin_position'}, 0); + my $blockId = <$fh>; + read($fh, my $sequence, $self->{index}->{$cleanedId}{'sequence_end_position'}); + $data .= $blockId . $sequence . "\n"; + $nbSequences++; + $logger->trace('TRACE: fasta block of id ' . $cleanedId . ' is : ' . "\n" . $blockId.$sequence . "\n") + } + else{ + $logger->warn('WARN: id ' . $cleanedId. " not found in index\n"); + } + } + $logger->debug($nbSequences . '/' . scalar(@$ids) . ' fasta block has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} +1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/lib/Fastq.pm Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,382 @@ +package Tools::Fastq; + +use strict; +use warnings; +use Logger::Logger; +use Storable; + + +=head1 INDEXED FASTQ RELATED METHODS + +=head2 + +=head2 new + +=head2 + +=head3 Description + +Create a new Tools::Fastq object and index the FASTQ file + +=head3 Arguments + +=over 4 + +=item + +A hash of parameters. + +Currently accepted keys are : + +'file' => FASTQ file path + +=back + +=head3 Returns + +=over 4 + +=item + +A Tools::Fastq object + +=back + +=cut + +sub new { + my ($class, %attrs) = @_; + my $self = {}; + bless $self; + if(defined($attrs{file})){ + $self->{file} = $attrs{file}; + open($self->{file_handle},$self->{file}) || $logger->logdie('Error opening file : '. $self->{file}.' : '.$!."\n"); + $self->indexFastqFile; + } + return $self; +} + +=head2 indexFastqFile + +=head2 + +=head3 Description + +Index a FASTQ file creating a hash reference with the following structure : + +$index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} + +For each sequence id, the "@" symbol and all the text after space will be removed. + +This cleaned id will be used as key for the index. + +=head3 Arguments + +=over 4 + +=item + +None + +=back + +=head3 Returns + +=over 4 + +=item + +None + +=back + +=cut + +sub indexFastqFile{ + + my ($self) = @_; + $logger->info('Indexing file : '.$self->{file}."\n"); + my $index; + my $id; + my $id_begin_position = 0; + my $fh = $self->{file_handle}; + while(my $line = <$fh>){ + + if($line =~ /^@(\S+)/){ + + $id = $1; + chomp $id; + $index -> {$id} = {'id_begin_position' => $id_begin_position, 'id_length' => length $line}; + $logger->trace('Indexing sequence' . $id . ' (position_begin_id : '. $index -> {$id}{'id_begin_position'} . ', id_length : '. $index -> {$id}{'id_length'} .') from ' . $self->{file} . "\n"); + <$fh>; <$fh>; <$fh>; + } + + $id_begin_position = tell($fh); + } + + $logger->info('File '.$self->{file}.' is now indexed (index contains '.(scalar keys %$index)." sequences)\n"); + $self->{index} = $index; +} + +=head2 loadFastqIndexFile + +=head2 + +=head3 Description + +Retrieve index from file using Storable module + +=head3 Arguments + +=over 4 + +An index file + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference corresponding to the index of the input FASTQ file : + +$index -> {seq_id} = {'id_begin_position' => integer, 'id_length' => integer} + +=back + +=cut + +sub loadFastqIndexFile{ + + my ($self, $file) = @_; + $self->{index} = retrieve($file); + $logger->info('File '.$file." is now loaded\n"); +} + +=he=head2 writeFastaIndexFile + +=head2 + +=head3 Description + +Write index to file using Storable module + +=head3 Arguments + +=over 4 + +=item + +A hash reference corresponding to FASTQ index. + +=item + +An output file path where to store the index. + +=back + +=head3 Returns + +=over 4 + +=item + +The output file path containing index + +=back + +=cut + +sub writeFastqIndexFile{ + my ($self, $file) = @_; + $logger->info('Writing index ('.(scalar keys %{$self->{index}}).' sequences) in file : '.$file."\n"); + store $self->{index}, $file; + $logger->info('File '.$file." is now created\n"); +} + +=head2 retrieveFastqSequence + +=head2 + +=head3 Description + +Retrieve FASTQ sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences as values + +$data -> {seq_id} = sequence_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastqSequence{ + my ($self, $ids) = @_; + my $data={}; + my $nbSequences = 0; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequences of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){$cleanedId = $1} + $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + <$fh>; + my $sequence = <$fh>; + $data->{$id} = $sequence; + $nbSequences ++; + $logger->trace('Sequence of id '.$cleanedId.' is : ' . $sequence . "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n") + } + } + $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastqQuality + +=head2 + +=head3 Description + +Retrieve FASTQ sequences quality using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve quality. + +=back + +=head3 Returns + +=over 4 + +=item + +A hash reference containing sequences id as keys and sequences quality as values + +$data -> {seq_id} = sequence_quality_corresponding_to_seq_id + +=back + +=cut + +sub retrieveFastqQuality{ + my ($self, $ids) = @_; + my $data; + my $nbSequences = 0; + if(! ref $ids){$ids = [$ids]} + $logger->debug('Retrieving sequence quality of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){ + $cleanedId = $1; + } + $logger->trace('retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + my $quality .= <$fh>.<$fh>.<$fh>; + $quality = <$fh>; + $data .= $quality; + $nbSequences ++; + $logger->trace('Sequence quality of id '.$cleanedId.' is : ' . $quality. "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n"); + } + } + $logger->debug($nbSequences.'/'.scalar(@$ids).' sequences qualities has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} + +=head2 retrieveFastqBlock + +=head2 + +=head3 Description + +Retrieve FASTQ formatted sequences using a list of ids + +=head3 Arguments + +=over 4 + +=item + +A sequence id OR an array reference containing the list of sequences id to retrieve. + +=back + +=head3 Returns + +=over 4 + +=item + +A scalar containing the sequences corresponding to ids in FASTQ format + +=back + +=cut + +sub retrieveFastqBlock{ + my ($self, $ids) = @_; + my $data; + my $nbSequences = 0; + + if(! ref $ids){$ids = [$ids]} + + $logger->trace('Retrieving fastq block of '.scalar(@$ids).' ids from indexed file : '.$self->{file}."\n"); + my $fh = $self->{file_handle}; + foreach my $id (@$ids){ + my $cleanedId = $id; + if($id =~ /@(\S+)/){ + $cleanedId = $1; + } + $logger->trace('Retrieving informations of id ' . $cleanedId. " from index\n"); + if(exists $self->{index} -> {$cleanedId}){ + $logger->trace('id ' . $cleanedId . ' is present in index (id_begin_position : '. $self->{index} -> {$cleanedId}{'id_begin_position'}. ', id_length : '. $self->{index} -> {$cleanedId}{'id_length'}.")\n"); + seek($fh, $self->{index} -> {$cleanedId}{'id_begin_position'}, 0); + read($fh, my $block, $self->{index} -> {$cleanedId}{'id_length'}); + $block .= <$fh>.<$fh>.<$fh>; + $data .= $block; + $nbSequences++; + $logger->trace('fastq block of id '.$cleanedId.' is : ' ."\n". $block. "\n") + } + else{ + $logger->trace('id ' . $cleanedId. " not found in index\n"); + } + } + $logger->trace($nbSequences.'/'.scalar(@$ids).' fastq block has been retrieved from indexed file ' . $self->{file} . "\n"); + return $data; +} +1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/scripts/readPerContig.pl Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,179 @@ +#!/usr/bin/perl +use strict; +use Data::Dumper; +use Logger::Logger; +use File::Basename; +use Getopt::Long; +use Cwd 'abs_path' ; +use Tools::Fasta ; +use Tools::Fastq ; + + +my $ref=''; +my @p1=''; +my @p2=''; +my @read=''; +my $identity = 95; +my $coverage = 60; +my $verbosity=1; +my $format; +my $stat; +my $blatOutput=''; +my $statOutput=''; + +GetOptions( + "r=s" => \$ref, + "id=i" => \$identity, + "cov=i" => \$coverage, + "b|blat" => \$blatOutput, + "o|output=s" => \$statOutput, + "f=s" => \$format, + "s=s" => \$stat, + "v|verbosity=i" => \$verbosity +); + +Logger::Logger->changeMode($verbosity); + +&main; + +sub main { + my $self = {}; + bless $self; + $self->_setOptions(); + + $self->_checkFormat($format); + + $self->{ref_tool} = Tools::Fasta->new(file => $self->{reference}); + + $self->_parseBlat(); + + $self->_print($self->{_statistics}, $self->{statOutput}); +} + + +sub _parseBlat { + my ($self) = @_; + open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput}); + while(<PSL>){ + chomp; + my @line = split(/\t/,$_); + my $tSize = $line[14]; + my $tStart = $line[15]; + my $tEnd = $line[16]; + my $coverage = (($tEnd-$tStart)/$tSize) * 100; + + if($coverage >= $self->{_covThreshold}){ + $self->{readNumber}->{$line[9]}++; + $self->{totalReadNumber}++; + $self->{size}->{$line[9]} = $line[10]; + $self->{reads_size_sum}->{$line[9]} += $tSize; + } + else{ + next; + } + } + close PSL; +} + + +sub _print { + my ($self, $mode, $statOutput) = @_; + $logger->info('Printing results ...'); + + if(!defined $mode){ + $mode = 'nb_reads'; + } + + if(!defined $statOutput){ + $statOutput = $self->{reference} . '.rn'; + } + + open(OUT,">$statOutput"); + foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){ + my $value = 0; + if(defined($self->{readNumber}->{$ref})){ + if($mode eq 'nb_reads'){ + $value = $self->{readNumber}->{$ref}; + } + elsif($mode eq 'rpkm'){ + $value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000); + } + elsif($mode eq 'mean_coverage'){ + $value = $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref}; + } + } + print OUT $ref . "\t" . $value . "\n"; + } + close OUT; +} + + +sub _setOptions { + my ($self) = @_; + if(-e $ref){ + $self->{reference} = abs_path($ref); + } + else{ + $logger->error('You must provide a reference.'); + &help; + } + if($identity >= 0 && $identity <= 100){ + $self->{_idThreshold} = $identity; + } + else{ + $logger->logdie('ERROR: identity must be between 0 and 100.'); + } + if($coverage >=0 && $coverage <= 100){ + $self->{_covThreshold} = $coverage; + } + else{ + $logger->logdie('ERROR: coverage must be between 0 and 100.'); + } + if($stat =~ /nb_reads|rpkm|mean_coverage/){ + $self->{_statistics} = $stat; + } + else{ + $logdie(''); + } + if($blatOutput eq ''){ + $self->{blatOutput} = 'blat.psl'; + } + else{ + $self->{blatOutput} = $blatOutput; + } + if($statOutput eq ''){ + my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/); + $statOutput = $filename . '.psl'; + } + else{ + $self->{statOutput} = $statOutput; + } +} + + +sub help { +my $prog = basename($0) ; +print STDERR <<EOF ; +#### $prog #### +# +# AUTHOR: Sebastien THEIL +# LAST MODIF: 16/07/2015 +# PURPOSE: This script is basically used to map reads (using blat) on contigs (or scaffolds) and count the + number of read per contig. + Input can be multiple fasta or fastq. + Output is a 2 column tabular file. + +USAGE: + $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq .... + + ### OPTIONS ### + -r <string> Rererence sequences file. + -id <int> Identity threshold. (Default: $identity) + -cov <int> Coverage threshold. (Default: $coverage) + -b <string> Blat output file path + -o <string> Output file path + -s <string> Statistics to print : nb_reads, rpkm or mean_coverage. + -v <int> Verbosity level. (0 -> 4). +EOF +exit(1); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readPerContig.xml Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,21 @@ +<tool id="readPerContig" name="readPerContig" version="1.0"> + <description> + This script counts the number of read per contig using a Blat output. + Output is a 2 column tabular file. + </description> + <command interpreter="perl">readPerContig.pl-r $reference -id $identity -cov $coverage -b blat_result -v 0 -s $statistics -o $output</command> + <inputs> + <param name="reference" type="data" format="fasta" label="References."/> + <param name="blat" type="data" format="psl" label="Blat .psl output file."/> + <param name="identity" type="integer" value="95" label="Identity threshold."/> + <param name="coverage" type="integer" value="60" label="Coverage threshold."/> + <param name="statistics" type="select" label="Statistics to print"> + <option value="nb_reads">Read number</option> + <option value="rpkm">RPKM</option> + <option value="mean_coverage">Mean coverage</option> + </param> + </inputs> + <outputs> + <data name="output" format="tabular"/> + </outputs> +</tool>
--- a/tool_dependencies.xml Thu Oct 15 05:52:00 2015 -0400 +++ b/tool_dependencies.xml Thu Oct 15 09:53:48 2015 -0400 @@ -11,12 +11,19 @@ <package name="perl" version="5.18.1" /> </repository> </action> + <action type="move_file"> + <source>perl</source> + <destination>$INSTALL_DIR</destination> + </action> <action type="setup_perl_environment"> <package>Data::Dumper</package> <package>File::Basename</package> <package>Getopt::Long</package> <package>Cwd</package> - <package>$REPOSITORY_INSTALL_DIR/perl/lib</package> + <package>$INSTALL_DIR/perl/lib</package> + </action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/perl/scripts</environment_variable> </action> </actions> </install>