Mercurial > repos > matteoc > agame_custom_tools


=head1 NAME

Bio::Pfam::Scan::PfamScan

=cut

package Bio::Pfam::Scan::PfamScan;

=head1 SYNOPSIS

  my $ps = Bio::Pfam::Scan::PfamScan->new(
             -cut_off => $hmmscan_cut_off,
             -dir => $dir,
             -clan_overlap => $clan_overlap,
             -fasta => $fasta,
             -align => $align,
             -as => $as
           );

  $ps->search;
  $ps->write_results;

=head1 DESCRIPTION

$Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $

=cut

use strict;
use warnings;

use Bio::Pfam::HMM::HMMResultsIO;
use Bio::Pfam::Active_site::as_search;
use Bio::SimpleAlign;
use Bio::Pfam::Scan::Seq;

use Carp;
use IPC::Run qw( start finish );

#-------------------------------------------------------------------------------
#- constructor -----------------------------------------------------------------
#-------------------------------------------------------------------------------

=head1 METHODS

=head2 new

The only constructor for the object. Accepts a set of arguments that specify
the parameters for the search:

=over

=item -cut_off

=item -dir

=item -clan_overlap

=item -fasta

=item -sequence

=item -align

=item -hmm

=item -as

=back

=cut

sub new {
  my ( $class, @args ) = @_;

  my $self = {};
  bless $self, $class;

  # To avoid hard coding the location for the binary, we assume it will be on the path.....
  $self->{_HMMSCAN} = 'hmmscan';

  # handle arguments, if we were given any here
  $self->_process_args(@args) if @args;

  return $self;
}

#-------------------------------------------------------------------------------
#- public methods --------------------------------------------------------------
#-------------------------------------------------------------------------------

=head2 search

The main method on the object. Performs a C<hmmscan> search using the supplied
sequence and the specified HMM library.

=cut

sub search {
  my ( $self, @args ) = @_;

  # handle the arguments, if we were handed any here
  $self->_process_args(@args) if @args;

  # set up the output header
  $self->_build_header;

  croak qq(FATAL: no sequence given; set the search parameters before calling "search")
    unless defined $self->{_sequence};

  my ( %AllResults, $pfamB, $firstResult );

  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {

    my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue );
    if ( $hmmlib !~ /Pfam\-B/ ) {
      @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} };
    }
    else {
      $pfamB      = 1;
      $seq_evalue = 0.001;
      $dom_evalue = 0.001;

      # It's a pfamB search so use some default cut off values
      push @hmmscan_cut_off, '-E', $seq_evalue, '--domE', $dom_evalue;
    }

    push @{ $self->{_header} },
      "#     cpu number specified: " . $self->{_cpu} . "\n"
      if ( $hmmlib !~ /Pfam\-B/ and $self->{_cpu} );

    push @{ $self->{_header} },
      "#        searching against: "
      . $self->{_dir}
      . "/$hmmlib, with cut off "
      . join( " ", @hmmscan_cut_off ) . "\n";
    my @params;
    if ( $self->{_cpu} ) {
      @params = (
        'hmmscan', '--notextw', '--cpu', $self->{_cpu}, @hmmscan_cut_off,
        $self->{_dir} . '/' . $hmmlib,
        $self->{_fasta}
      );
    }
    else {
      @params = (
        'hmmscan', '--notextw', @hmmscan_cut_off, $self->{_dir} . '/' . $hmmlib,
        $self->{_fasta}
      );

    }

    print STDERR "PfamScan::search: hmmscan command: |@params|\n"
      if $ENV{DEBUG};
    print STDERR 'PfamScan::search: sequence: |' . $self->{_sequence} . "|\n"
      if $ENV{DEBUG};

    my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
      or croak qq(FATAL: error running hmmscan; IPC::Run returned '$?');

    # print IN $self->{_sequence}; ;
    close IN;

    $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->new;
    $self->{_all_results} = $self->{_hmmresultIO}->parseMultiHMMER3( \*OUT );
    close OUT;

    my $err;
    while (<ERR>) {
      $err .= $_;
    }
    close ERR;

    finish $run
      or croak qq|FATAL: error running hmmscan ($err); ipc returned '$?'|;

    unless ( $hmmlib =~ /Pfam\-B/ ) {

      if ( $self->{_clan_overlap} ) {
        push( @{ $self->{_header} }, "#    resolve clan overlaps: off\n" );
      }
      else {
        push( @{ $self->{_header} }, "#    resolve clan overlaps: on\n" );
        $self->_resolve_clan_overlap;
      }

      if ( $self->{_as} ) {
        push( @{ $self->{_header} }, "#     predict active sites: on\n" );
        $self->_pred_act_sites;
      }
      else {
        push( @{ $self->{_header} }, "#     predict active sites: off\n" );
      }

      if ( $self->{_translate} ) {
        push @{ $self->{_header} },  "#   translate DNA sequence: " . $self->{_translate} . "\n";
      }
    }

    # Determine which hits are significant
    foreach my $result ( @{ $self->{_all_results} } ) {
      foreach
        my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $result->units } )
      {

        unless ($pfamB) {

          $unit->sig(0);
          if ( $result->seqs->{ $unit->name }->bits >=
            $self->{_seqGA}->{ $unit->name } )
          {
            if ( $unit->bits >= $self->{_domGA}->{ $unit->name } ) {
              $unit->sig(1);
            }
          }
        }
      }
    }

    if ($firstResult) {
      $AllResults{ $self->{_all_results} } = $self->{_all_results};
    }
    else {
      $firstResult = $self->{_all_results};
    }

  }    # end of "foreach $hmmlib"

  # If more than one search, merge results into one object
  if ( keys %AllResults ) {

    foreach my $AllResult ( keys %AllResults ) {

      foreach my $seq_id ( keys %{ $self->{_seq_hash} } ) {

        my $flag;

        #If seq exists in both, add all units from $AllResult to $firstResult
        foreach my $result ( @{$firstResult} ) {

          if ( $result->seqName eq $seq_id ) {
            $flag = 1;

            foreach my $result2 ( @{ $AllResults{$AllResult} } ) {

              if ( $result2->seqName eq $seq_id ) {
                foreach my $hmmname ( keys %{ $result2->seqs } ) {
                  $result->addHMMSeq( $result2->seqs->{$hmmname} );
                }
                foreach my $unit ( @{ $result2->units } ) {
                  $result->addHMMUnit($unit);
                }
              }
            }
          }
        }

        #If seq doesn't exist in $firstResult, need to add both sequence and units to $firstResult
        unless ($flag) {
          foreach my $result2 ( @{ $AllResults{$AllResult} } ) {
            if ( $result2->seqName eq $seq_id ) {
              push @{$firstResult}, $result2;
            }
          }
        }
      }
    }
    $self->{_all_results} = $firstResult;

  }    # end of "if keys %AllResults"

  push @{ $self->{_header} }, "# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n#\n";

  if ( $self->{_as} ) {
    push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>";
  }
  else {
    push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>";
  }

  if ( $self->{_translate} ) {
    push @{ $self->{_header} }, " <strand> <nt start> <nt end>";
  }
  push @{ $self->{_header} }, "\n";
}

#-------------------------------------------------------------------------------

=head2 write_results

Writes the results of the C<hmmscan> search. Takes a single argument, which can
be an open filehandle or a filename. A fatal error is generated if a file of the
given name already exists.

=cut

sub write_results {
  my ( $self, $out, $e_seq, $e_dom, $b_seq, $b_dom ) = @_;

  my $fh;

  if ( ref $out eq 'GLOB' ) {

    # we were handed a filehandle
    $fh = $out;
  }
  elsif ( $out and not ref $out ) {

    # we were handed a filename
    croak qq(FATAL: output file "$out" already exists) if -f $out;

    open( FH, ">$out" )
      or croak qq(FATAL: Can\'t write to your output file "$out": $!);
    $fh = \*FH;
  }
  else {

    # neither filehandle nor filename, default to STDOUT
    $fh = \*STDOUT;
  }

  if ( $self->{_header} ) {
    my $header = join '', @{ $self->{_header} };
    print $fh "$header\n";
  }

  foreach my $result ( @{ $self->{_all_results} } ) {
    $self->{_hmmresultIO}
      ->write_ascii_out( $result, $fh, $self, $e_seq, $e_dom, $b_seq, $b_dom );
  }
  close $fh;
}

#-------------------------------------------------------------------------------

=head2 results

Returns the search results.

=cut

sub results {
  my ( $self, $e_value ) = @_;

  unless ( defined $self->{_all_results} ) {
    carp "WARNING: call search() before trying to retrieve results";
    return;
  }

  my @search_results = ();

  foreach my $hmm_result ( @{ $self->{_all_results} } ) {
    push @search_results, @{ $hmm_result->results( $self, $e_value ) };
  }

  return \@search_results;
}

#-------------------------------------------------------------------------------
#- private methods -------------------------------------------------------------
#-------------------------------------------------------------------------------

=head1 PRIVATE METHODS

=head2 _process_args

Handles the input arguments.

=cut

sub _process_args {
  my ( $self, @args ) = @_;

  # accept both a hash and a hash ref
  my $args = ( ref $args[0] eq 'HASH' ) ? shift @args : {@args};

  # make sure we get a sequence
  if ( $args->{-fasta} and $args->{-sequence} ) {
    croak qq(FATAL: "-fasta" and "-sequence" are mutually exclusive);
  }
  elsif ( $args->{-fasta} ) {
    croak qq(FATAL: fasta file "$args->{-fasta}" doesn\'t exist)
      unless -s $args->{-fasta};
  }
  elsif ( $args->{-sequence} ) {
    croak qq(FATAL: no sequence given)
      unless length( $args->{-sequence} );
  }
  else {
    croak qq(FATAL: must specify either "-fasta" or "-sequence");
  }

  # check the cut off
  if ( ( $args->{-e_seq} and ( $args->{-b_seq} || $args->{-b_dom} ) )
    or ( $args->{-b_seq} and ( $args->{-e_seq} || $args->{-e_dom} ) )
    or ( $args->{-b_dom} and $args->{-e_dom} ) )
  {
    croak qq(FATAL: can\'t use e value and bit score threshold together);
  }

  $self->{_hmmscan_cutoff} = ();
  if ( $args->{-e_seq} ) {
    croak qq(FATAL: the E-value sequence cut-off "$args->{-e_seq}" must be a positive non-zero number)
      unless $args->{-e_seq} > 0;

    push @{ $self->{_hmmscan_cutoff} }, '-E', $args->{-e_seq};
  }

  if ( $args->{-e_dom} ) {
    croak q(FATAL: if you supply "-e_dom" you must also supply "-e_seq")
      unless $args->{-e_seq};

    croak qq(FATAL: the E-value domain cut-off "$args->{-e_dom}" must be positive non-zero number)
      unless $args->{-e_dom} > 0;

    push @{ $self->{_hmmscan_cutoff} }, '--domE', $args->{-e_dom};
  }

  if ( $args->{-b_seq} ) {
    push @{ $self->{_hmmscan_cutoff} }, '-T', $args->{-b_seq};
  }

  if ( $args->{-b_dom} ) {
    croak q(FATAL: if you supply "-b_dom" you must also supply "-b_seq")
      unless $args->{-b_seq};

    push @{ $self->{_hmmscan_cutoff} }, '--domT', $args->{-b_dom};
  }

  unless ( $self->{_hmmscan_cutoff} ) {
    push @{ $self->{_hmmscan_cutoff} }, '--cut_ga';
  }

  # make sure we have a valid directory for the HMM data files
  croak qq(FATAL: directory "$args->{-dir}" does not exist)
    unless -d $args->{-dir};

  # populate the object
  $self->{_cut_off}      = $args->{-cut_off};
  $self->{_dir}          = $args->{-dir};
  $self->{_clan_overlap} = $args->{-clan_overlap};
  $self->{_fasta}        = $args->{-fasta};
  $self->{_align}        = $args->{-align};
  $self->{_as}           = $args->{-as};
  $self->{_sequence}     = $args->{-sequence};
  $self->{_cpu}          = $args->{-cpu};
  $self->{_translate}    = $args->{-translate};

  $self->{_hmmlib} = [];
  if ( $args->{-hmmlib} ) {
    if ( ref $args->{-hmmlib} eq 'ARRAY' ) {
      push @{ $self->{_hmmlib} }, @{ $args->{-hmmlib} };
    }
    else {
      push @{ $self->{_hmmlib} }, $args->{-hmmlib};
    }
  }
  else {
    push @{ $self->{_hmmlib} }, "Pfam-A.hmm";
  }

  # Now check that the library exists in the data dir!
  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {

    croak qq(FATAL: can't find $hmmlib and/or $hmmlib binaries in "$args->{-dir}")
      unless (
      -s $self->{_dir},
      "/$hmmlib"
      and -s $self->{_dir} . "/$hmmlib.h3f"
      and -s $self->{_dir} . "/$hmmlib.h3i"
      and -s $self->{_dir} . "/$hmmlib.h3m"
      and -s $self->{_dir} . "/$hmmlib.h3p"
      and -s $self->{_dir} . "/$hmmlib.dat"
      );

    # read the necessary data, if it's not been read already
    $self->_read_pfam_data;
  }

  $self->{_max_seqname} = 0;

  # if there's nothing in "_sequence" try to load a fasta file
  $self->_read_fasta
    unless $self->{_sequence};

  # check again for a sequence. If we don't have one at this point, bail with
  # an error
  croak qq(FATAL: no sequence given)
    unless $self->{_sequence};

  # read fasta file, store maximum sequence name and store sequences for active
  # sites prediction
  $self->_parse_sequence
    unless $self->{_max_seqname};

  if ( $self->{_as} ) {
    $self->_parse_act_site_data
      unless $self->{_read_read_act_site_data};
  }

  if ( $self->{_translate} ) {
    $self->_translate_fasta;
  }

  # see if a version number was specified
  $self->{_version} = $args->{version};

}

#-------------------------------------------------------------------------------

=head2 _build_header

Adds version to the header object

=cut

sub _build_header {
  my ( $self, $version ) = @_;

  unshift @{ $self->{_header} },
    '#      query sequence file: ' . $self->{_fasta} . "\n";

  unshift @{ $self->{_header} }, <<EOF_license;
# Copyright (c) 2009 Genome Research Ltd\n# Freely distributed under the GNU
# General Public License
#
# Authors: Jaina Mistry (jaina\@ebi.ac.uk),
#          Rob Finn (rdf\@ebi.ac.uk)
#
# This is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
EOF_license

  my $v =
    ( defined $self->{_version} )
    ? "version $version, "
    : '';

  unshift @{ $self->{_header} },
    "# pfam_scan.pl, $v run at " . scalar(localtime) . "\n#\n";
}

#-------------------------------------------------------------------------------

=head2 _read_fasta

Reads a sequence from the fasta-format file that was specified in the
parameters.

=cut

sub _read_fasta {
  my $self = shift;

  open( FASTA, $self->{_fasta} )
    or croak qq(FATAL: Couldn't open fasta file "$self->{_fasta}" $!\n);
  my @rows = <FASTA>;
  close FASTA;

  $self->{_sequence_rows} = \@rows;

  $self->{_sequence} = join '', @rows;
}

#-------------------------------------------------------------------------------

=head2 _resolve_clan_overlap

Resolves overlaps between clans.

=cut

sub _resolve_clan_overlap {
  my $self = shift;

  my @no_clan_overlap = ();
  foreach my $result ( @{ $self->{_all_results} } ) {
    my $new =
      $result->remove_overlaps_by_clan( $self->{_clanmap}, $self->{_nested} );

    push @no_clan_overlap, $new;
  }

  $self->{_all_results} = \@no_clan_overlap;
}

#-------------------------------------------------------------------------------

=head2 _pred_act_sites

Predicts active sites. Takes no arguments. Populates the "act_site" field on
each results object.

=cut

sub _pred_act_sites {
  my $self = shift;

  # print STDERR "predicting active sites...\n";

  my $hmm_file = $self->{_dir} . '/Pfam-A.hmm';

RESULT: foreach my $result ( @{ $self->{_all_results} } ) {

    # print STDERR "result: |" . $result->seqName . "|\n";

  UNIT: foreach my $unit ( @{ $result->units } ) {

      # print STDERR "family: |" . $unit->name . "|\n";

      next UNIT
        unless ( $self->{_act_site_data}->{ $unit->name }->{'alignment'} );

      my $seq_region = substr(
        $self->{_seq_hash}->{ $result->seqName },
        $unit->seqFrom - 1,
        $unit->seqTo - $unit->seqFrom + 1
      );

      my $seq_se = $unit->seqFrom . '-' . $unit->seqTo;

      # print STDERR "seq_id:     |" . $result->seqName . "|\n";
      # print STDERR "seq_se:     |" . $seq_se . "|\n";
      # print STDERR "seq_region: |" . $seq_region . "|\n";
      # print STDERR "family:     |" . $unit->name . "|\n";
      # print STDERR "hmm_file:   |" . $hmm_file . "|\n";
      # print STDERR "dir:        |" . $self->{_dir} . "|\n";

      $unit->{act_site} = Bio::Pfam::Active_site::as_search::find_as(
        $self->{_act_site_data}->{ $unit->name }->{'alignment'},
        $self->{_act_site_data}->{ $unit->name }->{'residues'},
        $result->seqName,
        $seq_se,
        $seq_region,
        $unit->name,
        $hmm_file
      );
    }
  }
}

#-------------------------------------------------------------------------------

=head2 _read_pfam_data

Reads the Pfam data file ("Pfam-A.scan.dat") and populates the C<accmap>,
C<nested> and C<clanmap> hashes on the object.

=cut

sub _read_pfam_data {
  my $self = shift;

  #print STDERR "reading " . $self->{_hmmlib} . ".dat\n" if($ENV{DEBUG});
  $self->{_accmap}    = {};
  $self->{_nested}    = {};
  $self->{_clanmap}   = {};
  $self->{_desc}      = {};
  $self->{_seqGA}     = {};
  $self->{_domGA}     = {};
  $self->{_type}      = {};
  $self->{_model_len} = {};

  foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
    my $scandat = $self->{_dir} . '/' . $hmmlib . '.dat';
    open( SCANDAT, $scandat )
      or croak qq(FATAL: Couldn't open "$scandat" data file: $!);
    my $id;
    while (<SCANDAT>) {
      if (m/^\#=GF ID\s+(\S+)/) {
        $id = $1;
      }
      elsif (m/^\#=GF\s+AC\s+(\S+)/) {
        $self->{_accmap}->{$id} = $1;
      }
      elsif (m/^\#=GF\s+DE\s+(.+)/) {
        $self->{_desc}->{$id} = $1;
      }
      elsif (m/^\#=GF\s+GA\s+(\S+)\;\s+(\S+)\;/) {
        $self->{_seqGA}->{$id} = $1;
        $self->{_domGA}->{$id} = $2;
      }
      elsif (m/^\#=GF\s+TP\s+(\S+)/) {
        $self->{_type}->{$id} = $1;
      }
      elsif (m/^\#=GF\s+ML\s+(\d+)/) {
        $self->{_model_len}->{$id} = $1;
      }
      elsif (/^\#=GF\s+NE\s+(\S+)/) {
        $self->{_nested}->{$id}->{$1} = 1;
        $self->{_nested}->{$1}->{$id} = 1;
      }
      elsif (/^\#=GF\s+CL\s+(\S+)/) {
        $self->{_clanmap}->{$id} = $1;
      }
    }

    close SCANDAT;

    # set a flag to show that we've read the data files already
    $self->{ '_read_' . $hmmlib } = 1;
  }

}

#-------------------------------------------------------------------------------

=head2 _read_act_site_data

Reads the Pfam active site data file ("active_site.dat") and populates
the C<act_site_data> hashes on the object.

=cut

sub _parse_act_site_data {
  my $self   = shift;
  my $as_dat = $self->{_dir} . '/active_site.dat';

  $self->{_act_site_data} = {};

  open( AS, $as_dat )
    or croak qq(FATAL: Couldn\'t open "$as_dat" data file: $!);

  my ( $fam_id, $aln );

  while (<AS>) {
    if (/^ID\s+(\S+)/) {
      $fam_id = $1;
      $aln    = new Bio::SimpleAlign;
    }
    elsif (/^AL\s+(\S+)\/(\d+)\-(\d+)\s+(\S+)/) {
      my ( $seq_id, $st, $en, $seq ) = ( $1, $2, $3, $4 );

      $aln->add_seq(
        Bio::Pfam::Scan::Seq->new(
          '-seq'   => $seq,
          '-id'    => $seq_id,
          '-start' => $st,
          '-end'   => $en,
          '-type'  => 'aligned'
        )
      );
    }
    elsif (/^RE\s+(\S+)\s+(\d+)/) {
      my ( $seq_id, $res ) = ( $1, $2 );
      push(
        @{ $self->{_act_site_data}->{$fam_id}->{'residues'}->{$seq_id} },
        $res
      );

    }
    elsif (/^\/\//) {

      $self->{_act_site_data}->{$fam_id}->{'alignment'} = $aln;

      $fam_id = "";
      $aln    = "";

    }
    else {
      warn "Ignoring line:\n[$_]";
    }
  }
  close AS;
  $self->{_read_read_act_site_data} = 1;
}

#-------------------------------------------------------------------------------

=head2 _parse_sequence

This method is used to parse the sequence and hash it on sequence
identifier. It also stores the length of the longest sequence id

=cut

sub _parse_sequence {
  my $self = shift;

  my $seq_hash = {};
  my $seq_id;
  foreach ( @{ $self->{_sequence_rows} } ) {

    next if m/^\s*$/;    #Ignore blank lines

    if (m/^>(\S+)/) {
      $seq_id = $1;

      if ( exists( $seq_hash->{$seq_id} ) ) {
        croak "FATAL: Sequence identifiers must be unique. Your fasta file contains two sequences with the same id ($seq_id)";
      }

      #Store the max length of seq name, use this later when printing in ascii
      $self->{_max_seqname} = length($seq_id)
        if ( !$self->{_max_seqname}
        or length($seq_id) > $self->{_max_seqname} );
    }
    else {
      croak "FATAL: Unrecognised format of fasta file. Each sequence must have a header line in the format '>identifier  <optional description>'"
        unless defined $seq_id;
      chomp;
      $seq_hash->{$seq_id} .= $_;
    }
  }

  $self->{_seq_hash} = $seq_hash;
}

#-------------------------------------------------------------------------------

=head2 _translate_fasta

Uses the HMMER v2.3.2 progam "translate" to perform a six-frame translation of
the input sequence. Checks the parameter "-translate".

Accepted arguments are "all" and "orf", where "all" means (from the "translate"
help text) "translate in full, with stops; no individual ORFs" and "orf" means
"report only ORFs greater than minlen" where minlen is set to the default of
20.

=cut

sub _translate_fasta {
  my ($self) = @_;
  my $translatedFasta = $self->{_fasta} . ".translated";

  my @params = ( 'translate', '-q', );
  if ( $self->{_translate} eq 'all' ) {
    push( @params, '-a' );
  }
  elsif ( $self->{_translate} eq 'orf' ) {
    push( @params, '-l', '20' );
  }
  else {
    croak qq(Unexpected parameter '$self->{_translate}');
  }
  push( @params, '-o', $translatedFasta, $self->{_fasta} );

  print STDERR "PfamScan::translate_fasta: translate command: |@params|\n"
    if $ENV{DEBUG};

  my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
    or croak qq(FATAL: error running translate; IPC::Run returned '$?');

  close IN;
  close OUT;

  my $err;
  while (<ERR>) {
    $err .= $_;
  }
  close ERR;

  finish $run
    or croak qq|FATAL: error running translate ($err); ipc returned '$?'|;
  open( F, "<", $translatedFasta )
    or croak qw(Could not open $translatedFasta '$!');
  if ( $self->{_translate} eq 'orf' ) {
    while (<F>) {
      if (/^>\s?(\S+).*nt (\d+)\.+(\d+)/) {
        $self->{_orf}->{$1}->{start}  = $2;
        $self->{_orf}->{$1}->{end}    = $3;
        $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? '+' : '-';
      }
    }
  }
  else {
    my $currentSeq;
    my $currentFrame;
    my $currentLen = 0;
    my $maxEnd = 0;
    while (<F>) {
      chomp;
      if (/^>\s?(\S+\:)(\d+)/) {
        if ( $currentLen > 0 ) {
          my $seqName = $currentSeq . $currentFrame;
          if ( $currentFrame < 3 ) {
            my $start = 1 + $currentFrame;
            my $end   = $start + $currentLen - 1;
            $self->{_orf}->{$seqName}->{strand} = '+';
            $self->{_orf}->{$seqName}->{start}  = $start;
            $self->{_orf}->{$seqName}->{end}    = $end;
            $maxEnd = $end if ( $end > $maxEnd );
          }
          else {
            my $start = $maxEnd - ( $currentFrame - 3 );
            my $end = $start - $currentLen + 1;
            $self->{_orf}->{$seqName}->{strand} = '-';
            $self->{_orf}->{$seqName}->{start}  = $start;
            $self->{_orf}->{$seqName}->{end}    = $end;
          }
        }
        $currentLen   = 0;
        $currentSeq   = $1;
        $currentFrame = $2;
      }
      else {
        $currentLen += length($_) * 3;
      }
    }
    my $seqName = $currentSeq . $currentFrame;
    if ( $currentFrame < 3 ) {
      my $start = 1 + $currentFrame;
      my $end   = $start + $currentLen - 1;
      $self->{_orf}->{$seqName}->{strand} = '+';
      $self->{_orf}->{$seqName}->{start}  = $start;
      $self->{_orf}->{$seqName}->{end}    = $end;
      $maxEnd = $end if ( $end > $maxEnd );
    }
    else {
      my $start = $maxEnd - ( $currentFrame - 3 );
      my $end = $start - $currentLen + 1;
      $self->{_orf}->{$seqName}->{strand} = '-';
      $self->{_orf}->{$seqName}->{start}  = $start;
      $self->{_orf}->{$seqName}->{end}    = $end;
    }
  }
  $self->{_fasta} = $translatedFasta;
}
#-------------------------------------------------------------------------------

=head1 COPYRIGHT

Copyright (c) 2009: Genome Research Ltd.

Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org)

This is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
or see the on-line version at http://www.gnu.org/copyleft/gpl.txt

=cut

  1;
author	matteoc
date	Thu, 22 Dec 2016 04:45:31 -0500
parents
children