Mercurial > repos > matteoc > agame_custom_tools

diff pfamScan/Bio/Pfam/Scan/PfamScan.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author: matteoc
date: Thu, 22 Dec 2016 04:45:31 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pfamScan/Bio/Pfam/Scan/PfamScan.pm	Thu Dec 22 04:45:31 2016 -0500
@@ -0,0 +1,957 @@
+
+=head1 NAME
+
+Bio::Pfam::Scan::PfamScan
+
+=cut
+
+package Bio::Pfam::Scan::PfamScan;
+
+=head1 SYNOPSIS
+
+  my $ps = Bio::Pfam::Scan::PfamScan->new(
+             -cut_off => $hmmscan_cut_off,
+             -dir => $dir,
+             -clan_overlap => $clan_overlap,
+             -fasta => $fasta,
+             -align => $align,
+             -as => $as
+           );
+
+  $ps->search;
+  $ps->write_results;
+
+=head1 DESCRIPTION
+
+$Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $
+
+=cut
+
+use strict;
+use warnings;
+
+use Bio::Pfam::HMM::HMMResultsIO;
+use Bio::Pfam::Active_site::as_search;
+use Bio::SimpleAlign;
+use Bio::Pfam::Scan::Seq;
+
+use Carp;
+use IPC::Run qw( start finish );
+
+#-------------------------------------------------------------------------------
+#- constructor -----------------------------------------------------------------
+#-------------------------------------------------------------------------------
+
+=head1 METHODS
+
+=head2 new
+
+The only constructor for the object. Accepts a set of arguments that specify
+the parameters for the search:
+
+=over
+
+=item -cut_off
+
+=item -dir
+
+=item -clan_overlap
+
+=item -fasta
+
+=item -sequence
+
+=item -align
+
+=item -hmm
+
+=item -as
+
+=back
+
+=cut
+
+sub new {
+  my ( $class, @args ) = @_;
+
+  my $self = {};
+  bless $self, $class;
+
+  # To avoid hard coding the location for the binary, we assume it will be on the path.....
+  $self->{_HMMSCAN} = 'hmmscan';
+
+  # handle arguments, if we were given any here
+  $self->_process_args(@args) if @args;
+
+  return $self;
+}
+
+#-------------------------------------------------------------------------------
+#- public methods --------------------------------------------------------------
+#-------------------------------------------------------------------------------
+
+=head2 search
+
+The main method on the object. Performs a C<hmmscan> search using the supplied
+sequence and the specified HMM library.
+
+=cut
+
+sub search {
+  my ( $self, @args ) = @_;
+
+  # handle the arguments, if we were handed any here
+  $self->_process_args(@args) if @args;
+
+  # set up the output header
+  $self->_build_header;
+
+  croak qq(FATAL: no sequence given; set the search parameters before calling "search")
+    unless defined $self->{_sequence};
+
+  my ( %AllResults, $pfamB, $firstResult );
+
+  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
+
+    my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue );
+    if ( $hmmlib !~ /Pfam\-B/ ) {
+      @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} };
+    }
+    else {
+      $pfamB      = 1;
+      $seq_evalue = 0.001;
+      $dom_evalue = 0.001;
+
+      # It's a pfamB search so use some default cut off values
+      push @hmmscan_cut_off, '-E', $seq_evalue, '--domE', $dom_evalue;
+    }
+
+    push @{ $self->{_header} },
+      "#     cpu number specified: " . $self->{_cpu} . "\n"
+      if ( $hmmlib !~ /Pfam\-B/ and $self->{_cpu} );
+
+    push @{ $self->{_header} },
+      "#        searching against: "
+      . $self->{_dir}
+      . "/$hmmlib, with cut off "
+      . join( " ", @hmmscan_cut_off ) . "\n";
+    my @params;
+    if ( $self->{_cpu} ) {
+      @params = (
+        'hmmscan', '--notextw', '--cpu', $self->{_cpu}, @hmmscan_cut_off,
+        $self->{_dir} . '/' . $hmmlib,
+        $self->{_fasta}
+      );
+    }
+    else {
+      @params = (
+        'hmmscan', '--notextw', @hmmscan_cut_off, $self->{_dir} . '/' . $hmmlib,
+        $self->{_fasta}
+      );
+
+    }
+
+    print STDERR "PfamScan::search: hmmscan command: |@params|\n"
+      if $ENV{DEBUG};
+    print STDERR 'PfamScan::search: sequence: |' . $self->{_sequence} . "|\n"
+      if $ENV{DEBUG};
+
+    my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
+      or croak qq(FATAL: error running hmmscan; IPC::Run returned '$?');
+
+    # print IN $self->{_sequence}; ;
+    close IN;
+
+    $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->new;
+    $self->{_all_results} = $self->{_hmmresultIO}->parseMultiHMMER3( \*OUT );
+    close OUT;
+
+    my $err;
+    while (<ERR>) {
+      $err .= $_;
+    }
+    close ERR;
+
+    finish $run
+      or croak qq|FATAL: error running hmmscan ($err); ipc returned '$?'|;
+
+    unless ( $hmmlib =~ /Pfam\-B/ ) {
+
+      if ( $self->{_clan_overlap} ) {
+        push( @{ $self->{_header} }, "#    resolve clan overlaps: off\n" );
+      }
+      else {
+        push( @{ $self->{_header} }, "#    resolve clan overlaps: on\n" );
+        $self->_resolve_clan_overlap;
+      }
+
+      if ( $self->{_as} ) {
+        push( @{ $self->{_header} }, "#     predict active sites: on\n" );
+        $self->_pred_act_sites;
+      }
+      else {
+        push( @{ $self->{_header} }, "#     predict active sites: off\n" );
+      }
+
+      if ( $self->{_translate} ) {
+        push @{ $self->{_header} },  "#   translate DNA sequence: " . $self->{_translate} . "\n";
+      }
+    }
+
+    # Determine which hits are significant
+    foreach my $result ( @{ $self->{_all_results} } ) {
+      foreach
+        my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $result->units } )
+      {
+
+        unless ($pfamB) {
+
+          $unit->sig(0);
+          if ( $result->seqs->{ $unit->name }->bits >=
+            $self->{_seqGA}->{ $unit->name } )
+          {
+            if ( $unit->bits >= $self->{_domGA}->{ $unit->name } ) {
+              $unit->sig(1);
+            }
+          }
+        }
+      }
+    }
+
+    if ($firstResult) {
+      $AllResults{ $self->{_all_results} } = $self->{_all_results};
+    }
+    else {
+      $firstResult = $self->{_all_results};
+    }
+
+  }    # end of "foreach $hmmlib"
+
+  # If more than one search, merge results into one object
+  if ( keys %AllResults ) {
+
+    foreach my $AllResult ( keys %AllResults ) {
+
+      foreach my $seq_id ( keys %{ $self->{_seq_hash} } ) {
+
+        my $flag;
+
+        #If seq exists in both, add all units from $AllResult to $firstResult
+        foreach my $result ( @{$firstResult} ) {
+
+          if ( $result->seqName eq $seq_id ) {
+            $flag = 1;
+
+            foreach my $result2 ( @{ $AllResults{$AllResult} } ) {
+
+              if ( $result2->seqName eq $seq_id ) {
+                foreach my $hmmname ( keys %{ $result2->seqs } ) {
+                  $result->addHMMSeq( $result2->seqs->{$hmmname} );
+                }
+                foreach my $unit ( @{ $result2->units } ) {
+                  $result->addHMMUnit($unit);
+                }
+              }
+            }
+          }
+        }
+
+        #If seq doesn't exist in $firstResult, need to add both sequence and units to $firstResult
+        unless ($flag) {
+          foreach my $result2 ( @{ $AllResults{$AllResult} } ) {
+            if ( $result2->seqName eq $seq_id ) {
+              push @{$firstResult}, $result2;
+            }
+          }
+        }
+      }
+    }
+    $self->{_all_results} = $firstResult;
+
+  }    # end of "if keys %AllResults"
+
+  push @{ $self->{_header} }, "# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n#\n";
+
+  if ( $self->{_as} ) {
+    push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>";
+  }
+  else {
+    push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>";
+  }
+
+  if ( $self->{_translate} ) {
+    push @{ $self->{_header} }, " <strand> <nt start> <nt end>";
+  }
+  push @{ $self->{_header} }, "\n";
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 write_results
+
+Writes the results of the C<hmmscan> search. Takes a single argument, which can
+be an open filehandle or a filename. A fatal error is generated if a file of the
+given name already exists.
+
+=cut
+
+sub write_results {
+  my ( $self, $out, $e_seq, $e_dom, $b_seq, $b_dom ) = @_;
+
+  my $fh;
+
+  if ( ref $out eq 'GLOB' ) {
+
+    # we were handed a filehandle
+    $fh = $out;
+  }
+  elsif ( $out and not ref $out ) {
+
+    # we were handed a filename
+    croak qq(FATAL: output file "$out" already exists) if -f $out;
+
+    open( FH, ">$out" )
+      or croak qq(FATAL: Can\'t write to your output file "$out": $!);
+    $fh = \*FH;
+  }
+  else {
+
+    # neither filehandle nor filename, default to STDOUT
+    $fh = \*STDOUT;
+  }
+
+  if ( $self->{_header} ) {
+    my $header = join '', @{ $self->{_header} };
+    print $fh "$header\n";
+  }
+
+  foreach my $result ( @{ $self->{_all_results} } ) {
+    $self->{_hmmresultIO}
+      ->write_ascii_out( $result, $fh, $self, $e_seq, $e_dom, $b_seq, $b_dom );
+  }
+  close $fh;
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 results
+
+Returns the search results.
+
+=cut
+
+sub results {
+  my ( $self, $e_value ) = @_;
+
+  unless ( defined $self->{_all_results} ) {
+    carp "WARNING: call search() before trying to retrieve results";
+    return;
+  }
+
+  my @search_results = ();
+
+  foreach my $hmm_result ( @{ $self->{_all_results} } ) {
+    push @search_results, @{ $hmm_result->results( $self, $e_value ) };
+  }
+
+  return \@search_results;
+}
+
+#-------------------------------------------------------------------------------
+#- private methods -------------------------------------------------------------
+#-------------------------------------------------------------------------------
+
+=head1 PRIVATE METHODS
+
+=head2 _process_args
+
+Handles the input arguments.
+
+=cut
+
+sub _process_args {
+  my ( $self, @args ) = @_;
+
+  # accept both a hash and a hash ref
+  my $args = ( ref $args[0] eq 'HASH' ) ? shift @args : {@args};
+
+  # make sure we get a sequence
+  if ( $args->{-fasta} and $args->{-sequence} ) {
+    croak qq(FATAL: "-fasta" and "-sequence" are mutually exclusive);
+  }
+  elsif ( $args->{-fasta} ) {
+    croak qq(FATAL: fasta file "$args->{-fasta}" doesn\'t exist)
+      unless -s $args->{-fasta};
+  }
+  elsif ( $args->{-sequence} ) {
+    croak qq(FATAL: no sequence given)
+      unless length( $args->{-sequence} );
+  }
+  else {
+    croak qq(FATAL: must specify either "-fasta" or "-sequence");
+  }
+
+  # check the cut off
+  if ( ( $args->{-e_seq} and ( $args->{-b_seq} || $args->{-b_dom} ) )
+    or ( $args->{-b_seq} and ( $args->{-e_seq} || $args->{-e_dom} ) )
+    or ( $args->{-b_dom} and $args->{-e_dom} ) )
+  {
+    croak qq(FATAL: can\'t use e value and bit score threshold together);
+  }
+
+  $self->{_hmmscan_cutoff} = ();
+  if ( $args->{-e_seq} ) {
+    croak qq(FATAL: the E-value sequence cut-off "$args->{-e_seq}" must be a positive non-zero number)
+      unless $args->{-e_seq} > 0;
+
+    push @{ $self->{_hmmscan_cutoff} }, '-E', $args->{-e_seq};
+  }
+
+  if ( $args->{-e_dom} ) {
+    croak q(FATAL: if you supply "-e_dom" you must also supply "-e_seq")
+      unless $args->{-e_seq};
+
+    croak qq(FATAL: the E-value domain cut-off "$args->{-e_dom}" must be positive non-zero number)
+      unless $args->{-e_dom} > 0;
+
+    push @{ $self->{_hmmscan_cutoff} }, '--domE', $args->{-e_dom};
+  }
+
+  if ( $args->{-b_seq} ) {
+    push @{ $self->{_hmmscan_cutoff} }, '-T', $args->{-b_seq};
+  }
+
+  if ( $args->{-b_dom} ) {
+    croak q(FATAL: if you supply "-b_dom" you must also supply "-b_seq")
+      unless $args->{-b_seq};
+
+    push @{ $self->{_hmmscan_cutoff} }, '--domT', $args->{-b_dom};
+  }
+
+  unless ( $self->{_hmmscan_cutoff} ) {
+    push @{ $self->{_hmmscan_cutoff} }, '--cut_ga';
+  }
+
+  # make sure we have a valid directory for the HMM data files
+  croak qq(FATAL: directory "$args->{-dir}" does not exist)
+    unless -d $args->{-dir};
+
+  # populate the object
+  $self->{_cut_off}      = $args->{-cut_off};
+  $self->{_dir}          = $args->{-dir};
+  $self->{_clan_overlap} = $args->{-clan_overlap};
+  $self->{_fasta}        = $args->{-fasta};
+  $self->{_align}        = $args->{-align};
+  $self->{_as}           = $args->{-as};
+  $self->{_sequence}     = $args->{-sequence};
+  $self->{_cpu}          = $args->{-cpu};
+  $self->{_translate}    = $args->{-translate};
+
+  $self->{_hmmlib} = [];
+  if ( $args->{-hmmlib} ) {
+    if ( ref $args->{-hmmlib} eq 'ARRAY' ) {
+      push @{ $self->{_hmmlib} }, @{ $args->{-hmmlib} };
+    }
+    else {
+      push @{ $self->{_hmmlib} }, $args->{-hmmlib};
+    }
+  }
+  else {
+    push @{ $self->{_hmmlib} }, "Pfam-A.hmm";
+  }
+
+  # Now check that the library exists in the data dir!
+  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
+
+    croak qq(FATAL: can't find $hmmlib and/or $hmmlib binaries in "$args->{-dir}")
+      unless (
+      -s $self->{_dir},
+      "/$hmmlib"
+      and -s $self->{_dir} . "/$hmmlib.h3f"
+      and -s $self->{_dir} . "/$hmmlib.h3i"
+      and -s $self->{_dir} . "/$hmmlib.h3m"
+      and -s $self->{_dir} . "/$hmmlib.h3p"
+      and -s $self->{_dir} . "/$hmmlib.dat"
+      );
+
+    # read the necessary data, if it's not been read already
+    $self->_read_pfam_data;
+  }
+
+  $self->{_max_seqname} = 0;
+
+  # if there's nothing in "_sequence" try to load a fasta file
+  $self->_read_fasta
+    unless $self->{_sequence};
+
+  # check again for a sequence. If we don't have one at this point, bail with
+  # an error
+  croak qq(FATAL: no sequence given)
+    unless $self->{_sequence};
+
+  # read fasta file, store maximum sequence name and store sequences for active
+  # sites prediction
+  $self->_parse_sequence
+    unless $self->{_max_seqname};
+
+  if ( $self->{_as} ) {
+    $self->_parse_act_site_data
+      unless $self->{_read_read_act_site_data};
+  }
+
+  if ( $self->{_translate} ) {
+    $self->_translate_fasta;
+  }
+
+  # see if a version number was specified
+  $self->{_version} = $args->{version};
+
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _build_header
+
+Adds version to the header object
+
+=cut
+
+sub _build_header {
+  my ( $self, $version ) = @_;
+
+  unshift @{ $self->{_header} },
+    '#      query sequence file: ' . $self->{_fasta} . "\n";
+
+  unshift @{ $self->{_header} }, <<EOF_license;
+# Copyright (c) 2009 Genome Research Ltd\n# Freely distributed under the GNU 
+# General Public License
+#
+# Authors: Jaina Mistry (jaina\@ebi.ac.uk), 
+#          Rob Finn (rdf\@ebi.ac.uk)
+#
+# This is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later version.
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>. 
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+EOF_license
+
+  my $v =
+    ( defined $self->{_version} )
+    ? "version $version, "
+    : '';
+
+  unshift @{ $self->{_header} },
+    "# pfam_scan.pl, $v run at " . scalar(localtime) . "\n#\n";
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _read_fasta
+
+Reads a sequence from the fasta-format file that was specified in the 
+parameters.
+
+=cut
+
+sub _read_fasta {
+  my $self = shift;
+
+  open( FASTA, $self->{_fasta} )
+    or croak qq(FATAL: Couldn't open fasta file "$self->{_fasta}" $!\n);
+  my @rows = <FASTA>;
+  close FASTA;
+
+  $self->{_sequence_rows} = \@rows;
+
+  $self->{_sequence} = join '', @rows;
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _resolve_clan_overlap
+
+Resolves overlaps between clans.
+
+=cut
+
+sub _resolve_clan_overlap {
+  my $self = shift;
+
+  my @no_clan_overlap = ();
+  foreach my $result ( @{ $self->{_all_results} } ) {
+    my $new =
+      $result->remove_overlaps_by_clan( $self->{_clanmap}, $self->{_nested} );
+
+    push @no_clan_overlap, $new;
+  }
+
+  $self->{_all_results} = \@no_clan_overlap;
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _pred_act_sites
+
+Predicts active sites. Takes no arguments. Populates the "act_site" field on
+each results object.
+
+=cut
+
+sub _pred_act_sites {
+  my $self = shift;
+
+  # print STDERR "predicting active sites...\n";
+
+  my $hmm_file = $self->{_dir} . '/Pfam-A.hmm';
+
+RESULT: foreach my $result ( @{ $self->{_all_results} } ) {
+
+    # print STDERR "result: |" . $result->seqName . "|\n";
+
+  UNIT: foreach my $unit ( @{ $result->units } ) {
+
+      # print STDERR "family: |" . $unit->name . "|\n";
+
+      next UNIT
+        unless ( $self->{_act_site_data}->{ $unit->name }->{'alignment'} );
+
+      my $seq_region = substr(
+        $self->{_seq_hash}->{ $result->seqName },
+        $unit->seqFrom - 1,
+        $unit->seqTo - $unit->seqFrom + 1
+      );
+
+      my $seq_se = $unit->seqFrom . '-' . $unit->seqTo;
+
+      # print STDERR "seq_id:     |" . $result->seqName . "|\n";
+      # print STDERR "seq_se:     |" . $seq_se . "|\n";
+      # print STDERR "seq_region: |" . $seq_region . "|\n";
+      # print STDERR "family:     |" . $unit->name . "|\n";
+      # print STDERR "hmm_file:   |" . $hmm_file . "|\n";
+      # print STDERR "dir:        |" . $self->{_dir} . "|\n";
+
+      $unit->{act_site} = Bio::Pfam::Active_site::as_search::find_as(
+        $self->{_act_site_data}->{ $unit->name }->{'alignment'},
+        $self->{_act_site_data}->{ $unit->name }->{'residues'},
+        $result->seqName,
+        $seq_se,
+        $seq_region,
+        $unit->name,
+        $hmm_file
+      );
+    }
+  }
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _read_pfam_data
+
+Reads the Pfam data file ("Pfam-A.scan.dat") and populates the C<accmap>, 
+C<nested> and C<clanmap> hashes on the object.
+
+=cut
+
+sub _read_pfam_data {
+  my $self = shift;
+
+  #print STDERR "reading " . $self->{_hmmlib} . ".dat\n" if($ENV{DEBUG});
+  $self->{_accmap}    = {};
+  $self->{_nested}    = {};
+  $self->{_clanmap}   = {};
+  $self->{_desc}      = {};
+  $self->{_seqGA}     = {};
+  $self->{_domGA}     = {};
+  $self->{_type}      = {};
+  $self->{_model_len} = {};
+
+  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
+    my $scandat = $self->{_dir} . '/' . $hmmlib . '.dat';
+    open( SCANDAT, $scandat )
+      or croak qq(FATAL: Couldn't open "$scandat" data file: $!);
+    my $id;
+    while (<SCANDAT>) {
+      if (m/^\#=GF ID\s+(\S+)/) {
+        $id = $1;
+      }
+      elsif (m/^\#=GF\s+AC\s+(\S+)/) {
+        $self->{_accmap}->{$id} = $1;
+      }
+      elsif (m/^\#=GF\s+DE\s+(.+)/) {
+        $self->{_desc}->{$id} = $1;
+      }
+      elsif (m/^\#=GF\s+GA\s+(\S+)\;\s+(\S+)\;/) {
+        $self->{_seqGA}->{$id} = $1;
+        $self->{_domGA}->{$id} = $2;
+      }
+      elsif (m/^\#=GF\s+TP\s+(\S+)/) {
+        $self->{_type}->{$id} = $1;
+      }
+      elsif (m/^\#=GF\s+ML\s+(\d+)/) {
+        $self->{_model_len}->{$id} = $1;
+      }
+      elsif (/^\#=GF\s+NE\s+(\S+)/) {
+        $self->{_nested}->{$id}->{$1} = 1;
+        $self->{_nested}->{$1}->{$id} = 1;
+      }
+      elsif (/^\#=GF\s+CL\s+(\S+)/) {
+        $self->{_clanmap}->{$id} = $1;
+      }
+    }
+
+    close SCANDAT;
+
+    # set a flag to show that we've read the data files already
+    $self->{ '_read_' . $hmmlib } = 1;
+  }
+
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _read_act_site_data
+
+Reads the Pfam active site data file ("active_site.dat") and populates
+the C<act_site_data> hashes on the object.
+
+=cut
+
+sub _parse_act_site_data {
+  my $self   = shift;
+  my $as_dat = $self->{_dir} . '/active_site.dat';
+
+  $self->{_act_site_data} = {};
+
+  open( AS, $as_dat )
+    or croak qq(FATAL: Couldn\'t open "$as_dat" data file: $!);
+
+  my ( $fam_id, $aln );
+
+  while (<AS>) {
+    if (/^ID\s+(\S+)/) {
+      $fam_id = $1;
+      $aln    = new Bio::SimpleAlign;
+    }
+    elsif (/^AL\s+(\S+)\/(\d+)\-(\d+)\s+(\S+)/) {
+      my ( $seq_id, $st, $en, $seq ) = ( $1, $2, $3, $4 );
+
+      $aln->add_seq(
+        Bio::Pfam::Scan::Seq->new(
+          '-seq'   => $seq,
+          '-id'    => $seq_id,
+          '-start' => $st,
+          '-end'   => $en,
+          '-type'  => 'aligned'
+        )
+      );
+    }
+    elsif (/^RE\s+(\S+)\s+(\d+)/) {
+      my ( $seq_id, $res ) = ( $1, $2 );
+      push(
+        @{ $self->{_act_site_data}->{$fam_id}->{'residues'}->{$seq_id} },
+        $res
+      );
+
+    }
+    elsif (/^\/\//) {
+
+      $self->{_act_site_data}->{$fam_id}->{'alignment'} = $aln;
+
+      $fam_id = "";
+      $aln    = "";
+
+    }
+    else {
+      warn "Ignoring line:\n[$_]";
+    }
+  }
+  close AS;
+  $self->{_read_read_act_site_data} = 1;
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _parse_sequence
+
+This method is used to parse the sequence and hash it on sequence
+identifier. It also stores the length of the longest sequence id
+
+=cut
+
+sub _parse_sequence {
+  my $self = shift;
+
+  my $seq_hash = {};
+  my $seq_id;
+  foreach ( @{ $self->{_sequence_rows} } ) {
+
+    next if m/^\s*$/;    #Ignore blank lines
+
+    if (m/^>(\S+)/) {
+      $seq_id = $1;
+
+      if ( exists( $seq_hash->{$seq_id} ) ) {
+        croak "FATAL: Sequence identifiers must be unique. Your fasta file contains two sequences with the same id ($seq_id)";
+      }
+
+      #Store the max length of seq name, use this later when printing in ascii
+      $self->{_max_seqname} = length($seq_id)
+        if ( !$self->{_max_seqname}
+        or length($seq_id) > $self->{_max_seqname} );
+    }
+    else {
+      croak "FATAL: Unrecognised format of fasta file. Each sequence must have a header line in the format '>identifier  <optional description>'"
+        unless defined $seq_id;
+      chomp;
+      $seq_hash->{$seq_id} .= $_;
+    }
+  }
+
+  $self->{_seq_hash} = $seq_hash;
+}
+
+#-------------------------------------------------------------------------------
+
+=head2 _translate_fasta
+
+Uses the HMMER v2.3.2 progam "translate" to perform a six-frame translation of
+the input sequence. Checks the parameter "-translate". 
+
+Accepted arguments are "all" and "orf", where "all" means (from the "translate"
+help text) "translate in full, with stops; no individual ORFs" and "orf" means
+"report only ORFs greater than minlen" where minlen is set to the default of
+20.
+
+=cut
+
+sub _translate_fasta {
+  my ($self) = @_;
+  my $translatedFasta = $self->{_fasta} . ".translated";
+
+  my @params = ( 'translate', '-q', );
+  if ( $self->{_translate} eq 'all' ) {
+    push( @params, '-a' );
+  }
+  elsif ( $self->{_translate} eq 'orf' ) {
+    push( @params, '-l', '20' );
+  }
+  else {
+    croak qq(Unexpected parameter '$self->{_translate}');
+  }
+  push( @params, '-o', $translatedFasta, $self->{_fasta} );
+
+  print STDERR "PfamScan::translate_fasta: translate command: |@params|\n"
+    if $ENV{DEBUG};
+
+  my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
+    or croak qq(FATAL: error running translate; IPC::Run returned '$?');
+
+  close IN;
+  close OUT;
+
+  my $err;
+  while (<ERR>) {
+    $err .= $_;
+  }
+  close ERR;
+
+  finish $run
+    or croak qq|FATAL: error running translate ($err); ipc returned '$?'|;
+  open( F, "<", $translatedFasta )
+    or croak qw(Could not open $translatedFasta '$!');
+  if ( $self->{_translate} eq 'orf' ) {
+    while (<F>) {
+      if (/^>\s?(\S+).*nt (\d+)\.+(\d+)/) {
+        $self->{_orf}->{$1}->{start}  = $2;
+        $self->{_orf}->{$1}->{end}    = $3;
+        $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? '+' : '-';
+      }
+    }
+  }
+  else {
+    my $currentSeq;
+    my $currentFrame;
+    my $currentLen = 0;
+    my $maxEnd = 0;
+    while (<F>) {
+      chomp;
+      if (/^>\s?(\S+\:)(\d+)/) {
+        if ( $currentLen > 0 ) {
+          my $seqName = $currentSeq . $currentFrame;
+          if ( $currentFrame < 3 ) {
+            my $start = 1 + $currentFrame;
+            my $end   = $start + $currentLen - 1;
+            $self->{_orf}->{$seqName}->{strand} = '+';
+            $self->{_orf}->{$seqName}->{start}  = $start;
+            $self->{_orf}->{$seqName}->{end}    = $end;
+            $maxEnd = $end if ( $end > $maxEnd );
+          }
+          else {
+            my $start = $maxEnd - ( $currentFrame - 3 );
+            my $end = $start - $currentLen + 1;
+            $self->{_orf}->{$seqName}->{strand} = '-';
+            $self->{_orf}->{$seqName}->{start}  = $start;
+            $self->{_orf}->{$seqName}->{end}    = $end;
+          }
+        }
+        $currentLen   = 0;
+        $currentSeq   = $1;
+        $currentFrame = $2;
+      }
+      else {
+        $currentLen += length($_) * 3;
+      }
+    }
+    my $seqName = $currentSeq . $currentFrame;
+    if ( $currentFrame < 3 ) {
+      my $start = 1 + $currentFrame;
+      my $end   = $start + $currentLen - 1;
+      $self->{_orf}->{$seqName}->{strand} = '+';
+      $self->{_orf}->{$seqName}->{start}  = $start;
+      $self->{_orf}->{$seqName}->{end}    = $end;
+      $maxEnd = $end if ( $end > $maxEnd );
+    }
+    else {
+      my $start = $maxEnd - ( $currentFrame - 3 );
+      my $end = $start - $currentLen + 1;
+      $self->{_orf}->{$seqName}->{strand} = '-';
+      $self->{_orf}->{$seqName}->{start}  = $start;
+      $self->{_orf}->{$seqName}->{end}    = $end;
+    }
+  }
+  $self->{_fasta} = $translatedFasta;
+}
+#-------------------------------------------------------------------------------
+
+=head1 COPYRIGHT
+
+Copyright (c) 2009: Genome Research Ltd.
+
+Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org)
+
+This is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
+ 
+=cut
+
+  1;
+
author	matteoc
date	Thu, 22 Dec 2016 04:45:31 -0500
parents
children