view pfamScan/Bio/Pfam/HMM/HMMResults.pm @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line source

# Bio::Pfam::HMM::HMMResults.pm
#
# Author:        finnr
# Maintainer:    $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $
# Version:       $Revision: 1.3 $
# Created:       Nov 19, 2008
# Last Modified: $Date: 2009-12-15 14:38:08 $

=head1 NAME

Bio::Pfam::HMM::HMMResults - A object to represents the results from hmmsearch

=cut

package Bio::Pfam::HMM::HMMResults;

=head1 DESCRIPTION

A more detailed description of what this class does and how it does it.

$Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $

=head1 COPYRIGHT

File: Bio::Pfam::HMM::HMMResults.pm

Copyright (c) 2007: Genome Research Ltd.

Authors: Rob Finn (rdf@sanger.ac.uk)

 This is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
 
=cut

use strict;
use warnings;

use Moose;
use Moose::Util::TypeConstraints;
use Bio::Pfam::HMM::HMMSequence;
use Bio::Pfam::HMM::HMMUnit;

#
#-------------------------------------------------------------------------------
# Attributes

has 'hmmerVersion' => (
  isa       => 'Str', 
  is        => 'rw', 
);

has 'hmmName' => (
  isa => 'Str',
  is  => 'rw'
);

has 'seqDB' => (
  isa => 'Str',
  is  => 'rw'
);

has hmmLength => (
  isa => 'Int',
  is  => 'rw'
);

has 'thisFile' => (
  isa => 'Str',
  is  => 'rw'
);

has seedName => (
  isa => 'Str',
  is  => 'rw'
);

has 'seqs' => (
  isa => 'HashRef',
  is  => 'rw',
  default => sub { {} },
);

has 'units' => (
  isa => 'ArrayRef',
  is  => 'rw',
  default => sub { [] },
);

has 'domThr' => (
  isa => 'Num',
  is   => 'rw',
  default => '25.0'
);

has 'seqThr' => (
  isa => 'Num',
  is  => 'rw',
  default => '25.0'
);

has 'evalueThr' => (
  isa => 'Num',
  is  => 'rw'
);

has 'domTC' => (
  isa => 'Num',
  is  => 'rw'
);

has 'seqTC' => (
  isa => 'Num',
  is  => 'rw'
);

has 'domNC' => (
  isa => 'Num',
  is  => 'rw'
);

has 'seqNC' => (
  isa => 'Num',
  is  => 'rw'
);

has 'randSeedNum' => (
  isa => 'Int',
  is  => 'rw'
);

has 'description' => (
  isa => 'Str',
  is  => 'rw'
);

has 'seqName' => (
  isa => 'Str',
  is  => 'rw'
);

has 'seqLength' => (
  isa => 'Int',
  is  => 'rw'
);


has 'eof' => (
  isa => 'Int',
  is  => 'rw',
  default => 0
);

has 'program' => (
  isa => 'Str',
  is  => 'rw'
);

=head1 METHODS

=head2 addHMMSeq 

  Title    : addHMMSeq
  Usage    : $hmmRes->addHMMSeq( $hmmSeqObj )
  Function : Adds a Bio::Pfam::HMM::HMMSequence object to the results object 
  Args     : A Bio::Pfam::HMM::HMMSequence object
  Returns  : nothing
  
=cut

sub addHMMSeq {
  my( $self, $hmmSeq ) = @_;

  unless($hmmSeq->isa('Bio::Pfam::HMM::HMMSequence')){
    die 'Trying to add a non Bio::Pfam::HMM::HMMSequence object';  
  }

  if($self->seqs){
    if($self->seqs->{$hmmSeq->name}){
      die "Trying to add the same sequence twice"; 
    }
  }
  
  $self->seqs->{$hmmSeq->name} = $hmmSeq;
}


=head2 eachHMMSeq 

  Title    : eachHMMSeq
  Usage    : my @seqs = $hmmRes->eachHMMSeq
  Function : Returns an array reference containing the references to all of the
           : Bio::Pfam::HMM::HMMSequence objects stored in the HMMResults object.
  Args     : None
  Returns  : Array reference
  
=cut

sub eachHMMSeq {
  my ($self ) = @_;
  my @seqs;
  my $seqRefs = $self->seqs;
  foreach my $n (keys %{ $seqRefs }){
    push(@seqs, $seqRefs->{$n});
  } 
  return(\@seqs);
}

#-------------------------------------------------------------------------------

=head2 addHMMUnit 

  Title    : addHMMUnit
  Usage    : $hmmRes 
  Function : Adds HMM units (the actual region hit) to the HMMSequence in the object
           : and for convenience to the the results sets. All we store are duplicates
           : of the references. 
  Args     : A Bio::Pfam::HMM:HMMUnit
  Returns  : Nothing
  
=cut

sub addHMMUnit {
  my ($self, $hmmUnit) = @_;
  
  unless($hmmUnit->isa('Bio::Pfam::HMM::HMMUnit')){
    die "Trying to add an non-Bio::Pfam::HMM::HMMUnit\n";
  }
  
  if($self->seqs){
    if($self->seqs->{$hmmUnit->name}){
      $self->seqs->{$hmmUnit->name}->addHMMUnit($hmmUnit);
    }else{
      warn "Could not add hmmUnit as the sequence has not been added\n";
    }
  }
  
  #More conveinence we store the point to the hmmunit in an array
  push(@{$self->units},$hmmUnit); 
}


#-------------------------------------------------------------------------------

=head2  domainBitsCutoffFromEvalue

  Title    : domainBitsCutoffFromEvalue
  Usage    : $hmmRes->domainBitsCutoffFromEvalue(0.01)
  Function : From the supplied evalue, it scans through all of the evalues in the results
           : and calulates the bits score.  
  Args     : An evalue. 
  Returns  : A bits score. If no evalue is specified, returns nothing
  
=cut

sub domainBitsCutoffFromEvalue {
  my ($self, $eval) = @_;
  my ($dom,$prev,@doms,$cutoff,$sep,$seen);
  
  unless(defined ($eval) ){
    warn "No evalue specified\n";
    return; 
  }


  $seen = 0;
  foreach $_ ( sort { $b->bits <=> $a->bits } @{$self->units}, @{$self->eachHMMSeq} ) {
    if( $_->evalue > $eval ) {
	    $seen = 1;
	    $dom = $_;
	    last;
	  } 
	  $prev = $_;
  }
    
  if( ! defined $prev || $seen == 0) {
	  carp("Evalue is either above or below the list...");
	  return undef;
  }

  $sep = $prev->bits - $dom->bits ;
    
  if( $sep < 1 ) {
	    return $prev->bits();
  }
  if( $dom->bits < 25 && $prev->bits > 25 ) {
	  return 25;
  }

  return $dom->bits + sprintf("%.1f",$sep/2);
}


#-------------------------------------------------------------------------------

=head2 lowestTrue

  Title    :
  Usage    :  
  Function :
  Args     :
  Returns  :
  
=cut

sub lowestTrue {
  my $self = shift;
  
  unless($self->domTC && $self->seqTC) {
    unless($self->domThr and $self->seqThr){
      die "Could not define TC as I am missing a threshold\n";
    }
    #Set it wildly high!
    my ($lowSeq, $lowDom);
    $lowSeq = $lowDom = 999999.99;  
    
    foreach my $seqId (keys %{$self->seqs} ){ 
      if($self->seqs->{$seqId}->bits >= $self->seqThr){
        #Is this the lowest sequence thresh
        if($self->seqs->{$seqId}->bits < $lowSeq){
          $lowSeq = $self->seqs->{$seqId}->bits; 
        }
        #For each of the regions found on the sequence, look to see if the match is great
        #than the domain threshold.  If it is, is it lower than we we have seen previously    
        foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){
          if( $unit->bits() >= $self->domThr && $unit->bits() < $lowDom ) {
		        $lowDom  = $unit->bits;
          }
        } 
      }
      
    }
    $self->domTC($lowDom);
    $self->seqTC($lowSeq);
  }
  return($self->seqTC, $self->domTC);
}

#-------------------------------------------------------------------------------

=head2 highestNoise

  Title    :
  Usage    :  
  Function :
  Args     :
  Returns  :
  
=cut

sub highestNoise {
  my $self = shift;
  
  #See if it is already set
  unless($self->domNC && $self->seqNC) {
    unless($self->domThr and $self->seqThr){
      die "Could not define TC as I am missing a threshold\n";
    }
    
    #Set it wildly low
    my ($highSeq, $highDom);
    $highSeq = $highDom = -999999.99;  
    
    foreach my $seqId (keys %{$self->seqs} ){
      
      if($self->seqs->{$seqId}->bits < $self->seqThr){
        #Is this the highest sequence thres below the cut-off
        if($self->seqs->{$seqId}->bits > $highSeq){
          $highSeq = $self->seqs->{$seqId}->bits; 
        }
      }
      
      #For each of the regions found on the sequence, look to see if the match is great
      #than the domain threshold.  If it is, is it lower than we we have seen previously    
      foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){
        if( $unit->bits < $self->domThr && $unit->bits > $highDom ) {
		      $highDom  = $unit->bits;
        }
      }
    }
    $self->domNC($highDom);
    $self->seqNC($highSeq); 
  }

  return($self->seqNC, $self->domNC);
}


sub applyEdits {
  my ($self, $edits, $removeBadEd) = @_;

  my @validEd; #If removeBadEd flag is on, collect all the valid ED lines in this array and return at end of sub 
  foreach my $e (@$edits){
    #{ seq => $1, oldFrom => $2, oldTo => $3, newFrom => $5, newTo => $6 }  
    if($self->seqs->{$e->{seq}}){
      my $matched = 0;
      foreach my $u (@{ $self->seqs->{ $e->{seq} }->hmmUnits }){
          if($u->envFrom == $e->{oldFrom} and $u->envTo == $e->{oldTo}) {
	    $matched = 1; #HMM unit found

            if(defined $e->{newFrom} and $e->{newTo}){
	     
	      #Check co-ordinates of new start and end positions are in range
	      if( $e->{newFrom} < $u->{envFrom} or $e->{newTo} > $u->{envTo} or $e->{newFrom} > $e->{newTo}) {
		if($removeBadEd) {
		  print "Removing ED line due to out of range co-ordinates: " . $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}. "\n";
		}
		else {
		  warn $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}." contains out of range co-ordinates - bad ED line\n";
		}
		last;
	      }

              #Modify the start end positions
              $u->envFrom($e->{newFrom});
              $u->envTo($e->{newTo});
              
              #Check that the ali-positions are still okay
              if($u->seqFrom < $e->{newFrom}){
                $u->seqFrom($e->{newFrom});  
              }
              if($u->seqTo > $e->{newTo}){
                $u->seqTo($e->{newTo});  
              }
            }else{
              #Set the score so low it will never get in the align
              $u->bits(-999999.99);
            }

	    push(@validEd, $e) if($removeBadEd);  
            last;    
          }  
      }
      unless($matched){ #HMM unit not found - bad ED
	if($removeBadEd) {
	  print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n";
	}
	else {
	  warn $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}." does not appear in the list of hmm units - bad ED line\n";
	}
      }
    }else{ #Sequence not found - bad ED
      if($removeBadEd) {
	print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n";
      }
      else {
	warn $e->{seq}." does not appear in the list of hmm units - bad ED line\n";  
      }
    }
  }
  return(\@validEd) if($removeBadEd);
  
}

sub remove_overlaps_by_clan {

    my ($self, $clanmap, $nested) = @_;

    my $new = Bio::Pfam::HMM::HMMResults->new;
    $new->seqName($self->seqName);
   
    foreach my $unit ( sort { $a->evalue <=> $b->evalue }  @{ $self->units } ) {

        #check if it overlaps before adding
	my $o;
	
	foreach my $u ( @{ $new->units } ) {
	    
	    if( exists($clanmap->{$unit->name}) and exists($clanmap->{$u->name}) and ($clanmap->{$unit->name} eq $clanmap->{$u->name}) ) {
		if( overlap( $unit, $u ) ) {
		    if(exists($$nested{$unit->name}{$u->name})) {
			next;
		    }
		    else {
			$o=1;
			last;
		    }
		}
		
	    }
	}
	unless($o) {
	    if(! $new->seqs->{$unit->name}) {
		
		$new->addHMMSeq( Bio::Pfam::HMM::HMMSequence->new( {  name       => $self->seqs->{$unit->name}->name,
								      desc       => $self->seqs->{$unit->name}->desc,
								      bits       => $self->seqs->{$unit->name}->bits,
								      evalue     => $self->seqs->{$unit->name}->evalue,
								      numberHits => $self->seqs->{$unit->name}->numberHits}) );
	    
	    }
	    $new->addHMMUnit($unit);
	}

    }
    return $new;
}



sub overlap {
    # does unit1 overlap with unit2?
    my $unit1 = shift;
    my $unit2 = shift;
    my( $u1, $u2 ) = sort { $a->seqFrom <=> $b->seqFrom } ( $unit1, $unit2 );


    if( $u2->seqFrom <= $u1->seqTo ) {
        return 1;
    } 

    return 0;
}


sub results {
  my ( $self, $pfamScanData, $e_value ) = @_;

  my @results = ();
  foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $self->units } ) {    

    my $pfamB = $unit->name =~ /^Pfam-B/;

    #Filter results based on thresholds
    if ( $unit->name =~ /^Pfam-B/ ) {
      next unless ( $self->seqs->{$unit->name}->evalue <= 0.001 and $unit->evalue <= 0.001 );
      $pfamB = 1;
    }
    else {	  
      if ( $e_value ) {
        next unless ( $self->seqs->{$unit->name}->evalue <= $e_value and $unit->evalue <= $e_value ) ;
      } 
      else {
       next unless $unit->sig;
      }
    }

    push @results, {
      seq          => { from => $unit->seqFrom,
                        to   => $unit->seqTo,
                        name => $self->seqName },
      env          => { from => $unit->envFrom,
                        to   => $unit->envTo },

      hmm          => { from => $unit->hmmFrom,
                        to   => $unit->hmmTo },

      model_length => $pfamScanData->{_model_len}->{ $unit->name },
      bits         => $unit->bits,
      evalue       => $unit->evalue,
      acc          => $pfamScanData->{_accmap}->{ $unit->name },
      name         => $unit->name,
      desc         => $pfamScanData->{_desc}->{ $unit->name },
      type         => $pfamB ? undef : $pfamScanData->{_type}->{ $unit->name },
      clan         => $pfamB ? undef : 
                         $pfamScanData->{_clanmap}->{ $unit->name } || 'No_clan',

      act_site     => $pfamB ? undef : $unit->{act_site},
      sig          => $pfamB ? "NA" : $unit->sig,
      align        => [ sprintf( '#HMM       %s', $unit->hmmalign->{hmm} ),
                        sprintf( '#MATCH     %s', $unit->hmmalign->{match} ),
                        sprintf( '#PP        %s', $unit->hmmalign->{pp} ),
                        sprintf( '#SEQ       %s', $unit->hmmalign->{seq} ) ]
    };
  }

  return \@results;
}

1;