diff pfamScan/Bio/Pfam/HMM/HMMResults.pm @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/HMM/HMMResults.pm	Thu Dec 22 04:45:31 2016 -0500
@@ -0,0 +1,582 @@
+# Bio::Pfam::HMM::HMMResults.pm
+#
+# Author:        finnr
+# Maintainer:    $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $
+# Version:       $Revision: 1.3 $
+# Created:       Nov 19, 2008
+# Last Modified: $Date: 2009-12-15 14:38:08 $
+
+=head1 NAME
+
+Bio::Pfam::HMM::HMMResults - A object to represents the results from hmmsearch
+
+=cut
+
+package Bio::Pfam::HMM::HMMResults;
+
+=head1 DESCRIPTION
+
+A more detailed description of what this class does and how it does it.
+
+$Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $
+
+=head1 COPYRIGHT
+
+File: Bio::Pfam::HMM::HMMResults.pm
+
+Copyright (c) 2007: Genome Research Ltd.
+
+Authors: Rob Finn (rdf@sanger.ac.uk)
+
+ This is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
+ 
+=cut
+
+use strict;
+use warnings;
+
+use Moose;
+use Moose::Util::TypeConstraints;
+use Bio::Pfam::HMM::HMMSequence;
+use Bio::Pfam::HMM::HMMUnit;
+
+#
+#-------------------------------------------------------------------------------
+# Attributes
+
+has 'hmmerVersion' => (
+  isa       => 'Str', 
+  is        => 'rw', 
+);
+
+has 'hmmName' => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has 'seqDB' => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has hmmLength => (
+  isa => 'Int',
+  is  => 'rw'
+);
+
+has 'thisFile' => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has seedName => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has 'seqs' => (
+  isa => 'HashRef',
+  is  => 'rw',
+  default => sub { {} },
+);
+
+has 'units' => (
+  isa => 'ArrayRef',
+  is  => 'rw',
+  default => sub { [] },
+);
+
+has 'domThr' => (
+  isa => 'Num',
+  is   => 'rw',
+  default => '25.0'
+);
+
+has 'seqThr' => (
+  isa => 'Num',
+  is  => 'rw',
+  default => '25.0'
+);
+
+has 'evalueThr' => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+has 'domTC' => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+has 'seqTC' => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+has 'domNC' => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+has 'seqNC' => (
+  isa => 'Num',
+  is  => 'rw'
+);
+
+has 'randSeedNum' => (
+  isa => 'Int',
+  is  => 'rw'
+);
+
+has 'description' => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has 'seqName' => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+has 'seqLength' => (
+  isa => 'Int',
+  is  => 'rw'
+);
+
+
+has 'eof' => (
+  isa => 'Int',
+  is  => 'rw',
+  default => 0
+);
+
+has 'program' => (
+  isa => 'Str',
+  is  => 'rw'
+);
+
+=head1 METHODS
+
+=head2 addHMMSeq 
+
+  Title    : addHMMSeq
+  Usage    : $hmmRes->addHMMSeq( $hmmSeqObj )
+  Function : Adds a Bio::Pfam::HMM::HMMSequence object to the results object 
+  Args     : A Bio::Pfam::HMM::HMMSequence object
+  Returns  : nothing
+  
+=cut
+
+sub addHMMSeq {
+  my( $self, $hmmSeq ) = @_;
+
+  unless($hmmSeq->isa('Bio::Pfam::HMM::HMMSequence')){
+    die 'Trying to add a non Bio::Pfam::HMM::HMMSequence object';  
+  }
+
+  if($self->seqs){
+    if($self->seqs->{$hmmSeq->name}){
+      die "Trying to add the same sequence twice"; 
+    }
+  }
+  
+  $self->seqs->{$hmmSeq->name} = $hmmSeq;
+}
+
+
+=head2 eachHMMSeq 
+
+  Title    : eachHMMSeq
+  Usage    : my @seqs = $hmmRes->eachHMMSeq
+  Function : Returns an array reference containing the references to all of the
+           : Bio::Pfam::HMM::HMMSequence objects stored in the HMMResults object.
+  Args     : None
+  Returns  : Array reference
+  
+=cut
+
+sub eachHMMSeq {
+  my ($self ) = @_;
+  my @seqs;
+  my $seqRefs = $self->seqs;
+  foreach my $n (keys %{ $seqRefs }){
+    push(@seqs, $seqRefs->{$n});
+  } 
+  return(\@seqs);
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 addHMMUnit 
+
+  Title    : addHMMUnit
+  Usage    : $hmmRes 
+  Function : Adds HMM units (the actual region hit) to the HMMSequence in the object
+           : and for convenience to the the results sets. All we store are duplicates
+           : of the references. 
+  Args     : A Bio::Pfam::HMM:HMMUnit
+  Returns  : Nothing
+  
+=cut
+
+sub addHMMUnit {
+  my ($self, $hmmUnit) = @_;
+  
+  unless($hmmUnit->isa('Bio::Pfam::HMM::HMMUnit')){
+    die "Trying to add an non-Bio::Pfam::HMM::HMMUnit\n";
+  }
+  
+  if($self->seqs){
+    if($self->seqs->{$hmmUnit->name}){
+      $self->seqs->{$hmmUnit->name}->addHMMUnit($hmmUnit);
+    }else{
+      warn "Could not add hmmUnit as the sequence has not been added\n";
+    }
+  }
+  
+  #More conveinence we store the point to the hmmunit in an array
+  push(@{$self->units},$hmmUnit); 
+}
+
+
+#-------------------------------------------------------------------------------
+
+=head2  domainBitsCutoffFromEvalue
+
+  Title    : domainBitsCutoffFromEvalue
+  Usage    : $hmmRes->domainBitsCutoffFromEvalue(0.01)
+  Function : From the supplied evalue, it scans through all of the evalues in the results
+           : and calulates the bits score.  
+  Args     : An evalue. 
+  Returns  : A bits score. If no evalue is specified, returns nothing
+  
+=cut
+
+sub domainBitsCutoffFromEvalue {
+  my ($self, $eval) = @_;
+  my ($dom,$prev,@doms,$cutoff,$sep,$seen);
+  
+  unless(defined ($eval) ){
+    warn "No evalue specified\n";
+    return; 
+  }
+
+
+  $seen = 0;
+  foreach $_ ( sort { $b->bits <=> $a->bits } @{$self->units}, @{$self->eachHMMSeq} ) {
+    if( $_->evalue > $eval ) {
+	    $seen = 1;
+	    $dom = $_;
+	    last;
+	  } 
+	  $prev = $_;
+  }
+    
+  if( ! defined $prev || $seen == 0) {
+	  carp("Evalue is either above or below the list...");
+	  return undef;
+  }
+
+  $sep = $prev->bits - $dom->bits ;
+    
+  if( $sep < 1 ) {
+	    return $prev->bits();
+  }
+  if( $dom->bits < 25 && $prev->bits > 25 ) {
+	  return 25;
+  }
+
+  return $dom->bits + sprintf("%.1f",$sep/2);
+}
+
+
+#-------------------------------------------------------------------------------
+
+=head2 lowestTrue
+
+  Title    :
+  Usage    :  
+  Function :
+  Args     :
+  Returns  :
+  
+=cut
+
+sub lowestTrue {
+  my $self = shift;
+  
+  unless($self->domTC && $self->seqTC) {
+    unless($self->domThr and $self->seqThr){
+      die "Could not define TC as I am missing a threshold\n";
+    }
+    #Set it wildly high!
+    my ($lowSeq, $lowDom);
+    $lowSeq = $lowDom = 999999.99;  
+    
+    foreach my $seqId (keys %{$self->seqs} ){ 
+      if($self->seqs->{$seqId}->bits >= $self->seqThr){
+        #Is this the lowest sequence thresh
+        if($self->seqs->{$seqId}->bits < $lowSeq){
+          $lowSeq = $self->seqs->{$seqId}->bits; 
+        }
+        #For each of the regions found on the sequence, look to see if the match is great
+        #than the domain threshold.  If it is, is it lower than we we have seen previously    
+        foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){
+          if( $unit->bits() >= $self->domThr && $unit->bits() < $lowDom ) {
+		        $lowDom  = $unit->bits;
+          }
+        } 
+      }
+      
+    }
+    $self->domTC($lowDom);
+    $self->seqTC($lowSeq);
+  }
+  return($self->seqTC, $self->domTC);
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 highestNoise
+
+  Title    :
+  Usage    :  
+  Function :
+  Args     :
+  Returns  :
+  
+=cut
+
+sub highestNoise {
+  my $self = shift;
+  
+  #See if it is already set
+  unless($self->domNC && $self->seqNC) {
+    unless($self->domThr and $self->seqThr){
+      die "Could not define TC as I am missing a threshold\n";
+    }
+    
+    #Set it wildly low
+    my ($highSeq, $highDom);
+    $highSeq = $highDom = -999999.99;  
+    
+    foreach my $seqId (keys %{$self->seqs} ){
+      
+      if($self->seqs->{$seqId}->bits < $self->seqThr){
+        #Is this the highest sequence thres below the cut-off
+        if($self->seqs->{$seqId}->bits > $highSeq){
+          $highSeq = $self->seqs->{$seqId}->bits; 
+        }
+      }
+      
+      #For each of the regions found on the sequence, look to see if the match is great
+      #than the domain threshold.  If it is, is it lower than we we have seen previously    
+      foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){
+        if( $unit->bits < $self->domThr && $unit->bits > $highDom ) {
+		      $highDom  = $unit->bits;
+        }
+      }
+    }
+    $self->domNC($highDom);
+    $self->seqNC($highSeq); 
+  }
+
+  return($self->seqNC, $self->domNC);
+}
+
+
+sub applyEdits {
+  my ($self, $edits, $removeBadEd) = @_;
+
+  my @validEd; #If removeBadEd flag is on, collect all the valid ED lines in this array and return at end of sub 
+  foreach my $e (@$edits){
+    #{ seq => $1, oldFrom => $2, oldTo => $3, newFrom => $5, newTo => $6 }  
+    if($self->seqs->{$e->{seq}}){
+      my $matched = 0;
+      foreach my $u (@{ $self->seqs->{ $e->{seq} }->hmmUnits }){
+          if($u->envFrom == $e->{oldFrom} and $u->envTo == $e->{oldTo}) {
+	    $matched = 1; #HMM unit found
+
+            if(defined $e->{newFrom} and $e->{newTo}){
+	     
+	      #Check co-ordinates of new start and end positions are in range
+	      if( $e->{newFrom} < $u->{envFrom} or $e->{newTo} > $u->{envTo} or $e->{newFrom} > $e->{newTo}) {
+		if($removeBadEd) {
+		  print "Removing ED line due to out of range co-ordinates: " . $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}. "\n";
+		}
+		else {
+		  warn $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}." contains out of range co-ordinates - bad ED line\n";
+		}
+		last;
+	      }
+
+              #Modify the start end positions
+              $u->envFrom($e->{newFrom});
+              $u->envTo($e->{newTo});
+              
+              #Check that the ali-positions are still okay
+              if($u->seqFrom < $e->{newFrom}){
+                $u->seqFrom($e->{newFrom});  
+              }
+              if($u->seqTo > $e->{newTo}){
+                $u->seqTo($e->{newTo});  
+              }
+            }else{
+              #Set the score so low it will never get in the align
+              $u->bits(-999999.99);
+            }
+
+	    push(@validEd, $e) if($removeBadEd);  
+            last;    
+          }  
+      }
+      unless($matched){ #HMM unit not found - bad ED
+	if($removeBadEd) {
+	  print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n";
+	}
+	else {
+	  warn $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}." does not appear in the list of hmm units - bad ED line\n";
+	}
+      }
+    }else{ #Sequence not found - bad ED
+      if($removeBadEd) {
+	print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n";
+      }
+      else {
+	warn $e->{seq}." does not appear in the list of hmm units - bad ED line\n";  
+      }
+    }
+  }
+  return(\@validEd) if($removeBadEd);
+  
+}
+
+sub remove_overlaps_by_clan {
+
+    my ($self, $clanmap, $nested) = @_;
+
+    my $new = Bio::Pfam::HMM::HMMResults->new;
+    $new->seqName($self->seqName);
+   
+    foreach my $unit ( sort { $a->evalue <=> $b->evalue }  @{ $self->units } ) {
+
+        #check if it overlaps before adding
+	my $o;
+	
+	foreach my $u ( @{ $new->units } ) {
+	    
+	    if( exists($clanmap->{$unit->name}) and exists($clanmap->{$u->name}) and ($clanmap->{$unit->name} eq $clanmap->{$u->name}) ) {
+		if( overlap( $unit, $u ) ) {
+		    if(exists($$nested{$unit->name}{$u->name})) {
+			next;
+		    }
+		    else {
+			$o=1;
+			last;
+		    }
+		}
+		
+	    }
+	}
+	unless($o) {
+	    if(! $new->seqs->{$unit->name}) {
+		
+		$new->addHMMSeq( Bio::Pfam::HMM::HMMSequence->new( {  name       => $self->seqs->{$unit->name}->name,
+								      desc       => $self->seqs->{$unit->name}->desc,
+								      bits       => $self->seqs->{$unit->name}->bits,
+								      evalue     => $self->seqs->{$unit->name}->evalue,
+								      numberHits => $self->seqs->{$unit->name}->numberHits}) );
+	    
+	    }
+	    $new->addHMMUnit($unit);
+	}
+
+    }
+    return $new;
+}
+
+
+
+sub overlap {
+    # does unit1 overlap with unit2?
+    my $unit1 = shift;
+    my $unit2 = shift;
+    my( $u1, $u2 ) = sort { $a->seqFrom <=> $b->seqFrom } ( $unit1, $unit2 );
+
+
+    if( $u2->seqFrom <= $u1->seqTo ) {
+        return 1;
+    } 
+
+    return 0;
+}
+
+
+sub results {
+  my ( $self, $pfamScanData, $e_value ) = @_;
+
+  my @results = ();
+  foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $self->units } ) {    
+
+    my $pfamB = $unit->name =~ /^Pfam-B/;
+
+    #Filter results based on thresholds
+    if ( $unit->name =~ /^Pfam-B/ ) {
+      next unless ( $self->seqs->{$unit->name}->evalue <= 0.001 and $unit->evalue <= 0.001 );
+      $pfamB = 1;
+    }
+    else {	  
+      if ( $e_value ) {
+        next unless ( $self->seqs->{$unit->name}->evalue <= $e_value and $unit->evalue <= $e_value ) ;
+      } 
+      else {
+       next unless $unit->sig;
+      }
+    }
+
+    push @results, {
+      seq          => { from => $unit->seqFrom,
+                        to   => $unit->seqTo,
+                        name => $self->seqName },
+      env          => { from => $unit->envFrom,
+                        to   => $unit->envTo },
+
+      hmm          => { from => $unit->hmmFrom,
+                        to   => $unit->hmmTo },
+
+      model_length => $pfamScanData->{_model_len}->{ $unit->name },
+      bits         => $unit->bits,
+      evalue       => $unit->evalue,
+      acc          => $pfamScanData->{_accmap}->{ $unit->name },
+      name         => $unit->name,
+      desc         => $pfamScanData->{_desc}->{ $unit->name },
+      type         => $pfamB ? undef : $pfamScanData->{_type}->{ $unit->name },
+      clan         => $pfamB ? undef : 
+                         $pfamScanData->{_clanmap}->{ $unit->name } || 'No_clan',
+
+      act_site     => $pfamB ? undef : $unit->{act_site},
+      sig          => $pfamB ? "NA" : $unit->sig,
+      align        => [ sprintf( '#HMM       %s', $unit->hmmalign->{hmm} ),
+                        sprintf( '#MATCH     %s', $unit->hmmalign->{match} ),
+                        sprintf( '#PP        %s', $unit->hmmalign->{pp} ),
+                        sprintf( '#SEQ       %s', $unit->hmmalign->{seq} ) ]
+    };
+  }
+
+  return \@results;
+}
+
+1;