Mercurial > repos > matteoc > agame_custom_tools
view pfamScan/Bio/Pfam/HMM/HMMResults.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line source
# Bio::Pfam::HMM::HMMResults.pm # # Author: finnr # Maintainer: $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $ # Version: $Revision: 1.3 $ # Created: Nov 19, 2008 # Last Modified: $Date: 2009-12-15 14:38:08 $ =head1 NAME Bio::Pfam::HMM::HMMResults - A object to represents the results from hmmsearch =cut package Bio::Pfam::HMM::HMMResults; =head1 DESCRIPTION A more detailed description of what this class does and how it does it. $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $ =head1 COPYRIGHT File: Bio::Pfam::HMM::HMMResults.pm Copyright (c) 2007: Genome Research Ltd. Authors: Rob Finn (rdf@sanger.ac.uk) This is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. or see the on-line version at http://www.gnu.org/copyleft/gpl.txt =cut use strict; use warnings; use Moose; use Moose::Util::TypeConstraints; use Bio::Pfam::HMM::HMMSequence; use Bio::Pfam::HMM::HMMUnit; # #------------------------------------------------------------------------------- # Attributes has 'hmmerVersion' => ( isa => 'Str', is => 'rw', ); has 'hmmName' => ( isa => 'Str', is => 'rw' ); has 'seqDB' => ( isa => 'Str', is => 'rw' ); has hmmLength => ( isa => 'Int', is => 'rw' ); has 'thisFile' => ( isa => 'Str', is => 'rw' ); has seedName => ( isa => 'Str', is => 'rw' ); has 'seqs' => ( isa => 'HashRef', is => 'rw', default => sub { {} }, ); has 'units' => ( isa => 'ArrayRef', is => 'rw', default => sub { [] }, ); has 'domThr' => ( isa => 'Num', is => 'rw', default => '25.0' ); has 'seqThr' => ( isa => 'Num', is => 'rw', default => '25.0' ); has 'evalueThr' => ( isa => 'Num', is => 'rw' ); has 'domTC' => ( isa => 'Num', is => 'rw' ); has 'seqTC' => ( isa => 'Num', is => 'rw' ); has 'domNC' => ( isa => 'Num', is => 'rw' ); has 'seqNC' => ( isa => 'Num', is => 'rw' ); has 'randSeedNum' => ( isa => 'Int', is => 'rw' ); has 'description' => ( isa => 'Str', is => 'rw' ); has 'seqName' => ( isa => 'Str', is => 'rw' ); has 'seqLength' => ( isa => 'Int', is => 'rw' ); has 'eof' => ( isa => 'Int', is => 'rw', default => 0 ); has 'program' => ( isa => 'Str', is => 'rw' ); =head1 METHODS =head2 addHMMSeq Title : addHMMSeq Usage : $hmmRes->addHMMSeq( $hmmSeqObj ) Function : Adds a Bio::Pfam::HMM::HMMSequence object to the results object Args : A Bio::Pfam::HMM::HMMSequence object Returns : nothing =cut sub addHMMSeq { my( $self, $hmmSeq ) = @_; unless($hmmSeq->isa('Bio::Pfam::HMM::HMMSequence')){ die 'Trying to add a non Bio::Pfam::HMM::HMMSequence object'; } if($self->seqs){ if($self->seqs->{$hmmSeq->name}){ die "Trying to add the same sequence twice"; } } $self->seqs->{$hmmSeq->name} = $hmmSeq; } =head2 eachHMMSeq Title : eachHMMSeq Usage : my @seqs = $hmmRes->eachHMMSeq Function : Returns an array reference containing the references to all of the : Bio::Pfam::HMM::HMMSequence objects stored in the HMMResults object. Args : None Returns : Array reference =cut sub eachHMMSeq { my ($self ) = @_; my @seqs; my $seqRefs = $self->seqs; foreach my $n (keys %{ $seqRefs }){ push(@seqs, $seqRefs->{$n}); } return(\@seqs); } #------------------------------------------------------------------------------- =head2 addHMMUnit Title : addHMMUnit Usage : $hmmRes Function : Adds HMM units (the actual region hit) to the HMMSequence in the object : and for convenience to the the results sets. All we store are duplicates : of the references. Args : A Bio::Pfam::HMM:HMMUnit Returns : Nothing =cut sub addHMMUnit { my ($self, $hmmUnit) = @_; unless($hmmUnit->isa('Bio::Pfam::HMM::HMMUnit')){ die "Trying to add an non-Bio::Pfam::HMM::HMMUnit\n"; } if($self->seqs){ if($self->seqs->{$hmmUnit->name}){ $self->seqs->{$hmmUnit->name}->addHMMUnit($hmmUnit); }else{ warn "Could not add hmmUnit as the sequence has not been added\n"; } } #More conveinence we store the point to the hmmunit in an array push(@{$self->units},$hmmUnit); } #------------------------------------------------------------------------------- =head2 domainBitsCutoffFromEvalue Title : domainBitsCutoffFromEvalue Usage : $hmmRes->domainBitsCutoffFromEvalue(0.01) Function : From the supplied evalue, it scans through all of the evalues in the results : and calulates the bits score. Args : An evalue. Returns : A bits score. If no evalue is specified, returns nothing =cut sub domainBitsCutoffFromEvalue { my ($self, $eval) = @_; my ($dom,$prev,@doms,$cutoff,$sep,$seen); unless(defined ($eval) ){ warn "No evalue specified\n"; return; } $seen = 0; foreach $_ ( sort { $b->bits <=> $a->bits } @{$self->units}, @{$self->eachHMMSeq} ) { if( $_->evalue > $eval ) { $seen = 1; $dom = $_; last; } $prev = $_; } if( ! defined $prev || $seen == 0) { carp("Evalue is either above or below the list..."); return undef; } $sep = $prev->bits - $dom->bits ; if( $sep < 1 ) { return $prev->bits(); } if( $dom->bits < 25 && $prev->bits > 25 ) { return 25; } return $dom->bits + sprintf("%.1f",$sep/2); } #------------------------------------------------------------------------------- =head2 lowestTrue Title : Usage : Function : Args : Returns : =cut sub lowestTrue { my $self = shift; unless($self->domTC && $self->seqTC) { unless($self->domThr and $self->seqThr){ die "Could not define TC as I am missing a threshold\n"; } #Set it wildly high! my ($lowSeq, $lowDom); $lowSeq = $lowDom = 999999.99; foreach my $seqId (keys %{$self->seqs} ){ if($self->seqs->{$seqId}->bits >= $self->seqThr){ #Is this the lowest sequence thresh if($self->seqs->{$seqId}->bits < $lowSeq){ $lowSeq = $self->seqs->{$seqId}->bits; } #For each of the regions found on the sequence, look to see if the match is great #than the domain threshold. If it is, is it lower than we we have seen previously foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){ if( $unit->bits() >= $self->domThr && $unit->bits() < $lowDom ) { $lowDom = $unit->bits; } } } } $self->domTC($lowDom); $self->seqTC($lowSeq); } return($self->seqTC, $self->domTC); } #------------------------------------------------------------------------------- =head2 highestNoise Title : Usage : Function : Args : Returns : =cut sub highestNoise { my $self = shift; #See if it is already set unless($self->domNC && $self->seqNC) { unless($self->domThr and $self->seqThr){ die "Could not define TC as I am missing a threshold\n"; } #Set it wildly low my ($highSeq, $highDom); $highSeq = $highDom = -999999.99; foreach my $seqId (keys %{$self->seqs} ){ if($self->seqs->{$seqId}->bits < $self->seqThr){ #Is this the highest sequence thres below the cut-off if($self->seqs->{$seqId}->bits > $highSeq){ $highSeq = $self->seqs->{$seqId}->bits; } } #For each of the regions found on the sequence, look to see if the match is great #than the domain threshold. If it is, is it lower than we we have seen previously foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){ if( $unit->bits < $self->domThr && $unit->bits > $highDom ) { $highDom = $unit->bits; } } } $self->domNC($highDom); $self->seqNC($highSeq); } return($self->seqNC, $self->domNC); } sub applyEdits { my ($self, $edits, $removeBadEd) = @_; my @validEd; #If removeBadEd flag is on, collect all the valid ED lines in this array and return at end of sub foreach my $e (@$edits){ #{ seq => $1, oldFrom => $2, oldTo => $3, newFrom => $5, newTo => $6 } if($self->seqs->{$e->{seq}}){ my $matched = 0; foreach my $u (@{ $self->seqs->{ $e->{seq} }->hmmUnits }){ if($u->envFrom == $e->{oldFrom} and $u->envTo == $e->{oldTo}) { $matched = 1; #HMM unit found if(defined $e->{newFrom} and $e->{newTo}){ #Check co-ordinates of new start and end positions are in range if( $e->{newFrom} < $u->{envFrom} or $e->{newTo} > $u->{envTo} or $e->{newFrom} > $e->{newTo}) { if($removeBadEd) { print "Removing ED line due to out of range co-ordinates: " . $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}. "\n"; } else { warn $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}." contains out of range co-ordinates - bad ED line\n"; } last; } #Modify the start end positions $u->envFrom($e->{newFrom}); $u->envTo($e->{newTo}); #Check that the ali-positions are still okay if($u->seqFrom < $e->{newFrom}){ $u->seqFrom($e->{newFrom}); } if($u->seqTo > $e->{newTo}){ $u->seqTo($e->{newTo}); } }else{ #Set the score so low it will never get in the align $u->bits(-999999.99); } push(@validEd, $e) if($removeBadEd); last; } } unless($matched){ #HMM unit not found - bad ED if($removeBadEd) { print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n"; } else { warn $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}." does not appear in the list of hmm units - bad ED line\n"; } } }else{ #Sequence not found - bad ED if($removeBadEd) { print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n"; } else { warn $e->{seq}." does not appear in the list of hmm units - bad ED line\n"; } } } return(\@validEd) if($removeBadEd); } sub remove_overlaps_by_clan { my ($self, $clanmap, $nested) = @_; my $new = Bio::Pfam::HMM::HMMResults->new; $new->seqName($self->seqName); foreach my $unit ( sort { $a->evalue <=> $b->evalue } @{ $self->units } ) { #check if it overlaps before adding my $o; foreach my $u ( @{ $new->units } ) { if( exists($clanmap->{$unit->name}) and exists($clanmap->{$u->name}) and ($clanmap->{$unit->name} eq $clanmap->{$u->name}) ) { if( overlap( $unit, $u ) ) { if(exists($$nested{$unit->name}{$u->name})) { next; } else { $o=1; last; } } } } unless($o) { if(! $new->seqs->{$unit->name}) { $new->addHMMSeq( Bio::Pfam::HMM::HMMSequence->new( { name => $self->seqs->{$unit->name}->name, desc => $self->seqs->{$unit->name}->desc, bits => $self->seqs->{$unit->name}->bits, evalue => $self->seqs->{$unit->name}->evalue, numberHits => $self->seqs->{$unit->name}->numberHits}) ); } $new->addHMMUnit($unit); } } return $new; } sub overlap { # does unit1 overlap with unit2? my $unit1 = shift; my $unit2 = shift; my( $u1, $u2 ) = sort { $a->seqFrom <=> $b->seqFrom } ( $unit1, $unit2 ); if( $u2->seqFrom <= $u1->seqTo ) { return 1; } return 0; } sub results { my ( $self, $pfamScanData, $e_value ) = @_; my @results = (); foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $self->units } ) { my $pfamB = $unit->name =~ /^Pfam-B/; #Filter results based on thresholds if ( $unit->name =~ /^Pfam-B/ ) { next unless ( $self->seqs->{$unit->name}->evalue <= 0.001 and $unit->evalue <= 0.001 ); $pfamB = 1; } else { if ( $e_value ) { next unless ( $self->seqs->{$unit->name}->evalue <= $e_value and $unit->evalue <= $e_value ) ; } else { next unless $unit->sig; } } push @results, { seq => { from => $unit->seqFrom, to => $unit->seqTo, name => $self->seqName }, env => { from => $unit->envFrom, to => $unit->envTo }, hmm => { from => $unit->hmmFrom, to => $unit->hmmTo }, model_length => $pfamScanData->{_model_len}->{ $unit->name }, bits => $unit->bits, evalue => $unit->evalue, acc => $pfamScanData->{_accmap}->{ $unit->name }, name => $unit->name, desc => $pfamScanData->{_desc}->{ $unit->name }, type => $pfamB ? undef : $pfamScanData->{_type}->{ $unit->name }, clan => $pfamB ? undef : $pfamScanData->{_clanmap}->{ $unit->name } || 'No_clan', act_site => $pfamB ? undef : $unit->{act_site}, sig => $pfamB ? "NA" : $unit->sig, align => [ sprintf( '#HMM %s', $unit->hmmalign->{hmm} ), sprintf( '#MATCH %s', $unit->hmmalign->{match} ), sprintf( '#PP %s', $unit->hmmalign->{pp} ), sprintf( '#SEQ %s', $unit->hmmalign->{seq} ) ] }; } return \@results; } 1;