agame_custom_tools: pfamScan/Bio/Pfam/Scan/PfamScan.pm comparison

comparison pfamScan/Bio/Pfam/Scan/PfamScan.pm @ 0:68a3648c7d91 draft default tip

Uploaded

author	matteoc
date	Thu, 22 Dec 2016 04:45:31 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:68a3648c7d91
+=head1 NAME
+Bio::Pfam::Scan::PfamScan
+=cut
+package Bio::Pfam::Scan::PfamScan;
+=head1 SYNOPSIS
+my $ps = Bio::Pfam::Scan::PfamScan->new(
+-cut_off => $hmmscan_cut_off,
+-dir => $dir,
+-clan_overlap => $clan_overlap,
+-fasta => $fasta,
+-align => $align,
+-as => $as
+);
+$ps->search;
+$ps->write_results;
+=head1 DESCRIPTION
+$Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $
+=cut
+use strict;
+use warnings;
+use Bio::Pfam::HMM::HMMResultsIO;
+use Bio::Pfam::Active_site::as_search;
+use Bio::SimpleAlign;
+use Bio::Pfam::Scan::Seq;
+use Carp;
+use IPC::Run qw( start finish );
+#-------------------------------------------------------------------------------
+#- constructor -----------------------------------------------------------------
+#-------------------------------------------------------------------------------
+=head1 METHODS
+=head2 new
+The only constructor for the object. Accepts a set of arguments that specify
+the parameters for the search:
+=over
+=item -cut_off
+=item -dir
+=item -clan_overlap
+=item -fasta
+=item -sequence
+=item -align
+=item -hmm
+=item -as
+=back
+=cut
+sub new {
+    # Constructor. Creates the object, records the name of the hmmscan binary
+    # and, when arguments were supplied, passes them to _process_args.
+    # Returns the blessed object.
+    my ( $class, @args ) = @_;
+    # The binary is stored by bare name rather than by a hard-coded location,
+    # on the assumption that it can be found on the path.
+    my $self = bless { _HMMSCAN => 'hmmscan' }, $class;
+    if (@args) {
+        $self->_process_args(@args);
+    }
+    return $self;
+}
+#-------------------------------------------------------------------------------
+#- public methods --------------------------------------------------------------
+#-------------------------------------------------------------------------------
+=head2 search
+The main method on the object. Performs a C<hmmscan> search using the supplied
+sequence and the specified HMM library.
+=cut
+sub search {
+    # Runs hmmscan once for every library in $self->{_hmmlib}, parses the
+    # output, post-processes Pfam-A hits (clan overlaps, active sites,
+    # significance flags) and merges the per-library results into
+    # $self->{_all_results}. The report header is accumulated in
+    # $self->{_header}.
+    #
+    # Arguments: optional search parameters, handed on to _process_args.
+    # Croaks if no sequence has been set or if hmmscan cannot be started or
+    # does not finish successfully.
+    my ( $self, @args ) = @_;
+    # handle the arguments, if we were handed any here
+    $self->_process_args(@args) if @args;
+    # set up the output header
+    $self->_build_header;
+    croak qq(FATAL: no sequence given; set the search parameters before calling "search")
+        unless defined $self->{_sequence};
+    # Results of the first library searched, plus the results of any further
+    # libraries kept in search order so that merging is deterministic.
+    my ( $firstResult, @otherResults );
+    foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
+        # Decide for each library whether it is a Pfam-B search. The flag must
+        # not leak into later iterations, otherwise a Pfam-A library searched
+        # after a Pfam-B one would never get its significance flags set.
+        my $pfamB = ( $hmmlib =~ /Pfam\-B/ ) ? 1 : 0;
+        my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue );
+        if ($pfamB) {
+            $seq_evalue = 0.001;
+            $dom_evalue = 0.001;
+            # It's a pfamB search so use some default cut off values
+            push @hmmscan_cut_off, '-E', $seq_evalue, '--domE', $dom_evalue;
+        }
+        else {
+            @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} };
+        }
+        push @{ $self->{_header} },
+            "#     cpu number specified: " . $self->{_cpu} . "\n"
+            if ( !$pfamB and $self->{_cpu} );
+        push @{ $self->{_header} },
+            "#        searching against: "
+            . $self->{_dir}
+            . "/$hmmlib, with cut off "
+            . join( " ", @hmmscan_cut_off ) . "\n";
+        # Build the command as a list; "--cpu" is only added when requested.
+        my @params = ( 'hmmscan', '--notextw' );
+        push @params, '--cpu', $self->{_cpu} if $self->{_cpu};
+        push @params, @hmmscan_cut_off, $self->{_dir} . '/' . $hmmlib,
+            $self->{_fasta};
+        print STDERR "PfamScan::search: hmmscan command: |@params|\n"
+            if $ENV{DEBUG};
+        print STDERR 'PfamScan::search: sequence: |' . $self->{_sequence} . "|\n"
+            if $ENV{DEBUG};
+        my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
+            or croak qq(FATAL: error running hmmscan; IPC::Run returned '$?');
+        # Nothing is written to the child's stdin; the sequences are read by
+        # hmmscan from the fasta file named on the command line.
+        close IN;
+        $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->new;
+        $self->{_all_results} = $self->{_hmmresultIO}->parseMultiHMMER3( \*OUT );
+        close OUT;
+        # Start from an empty string so the error message below never
+        # interpolates an undefined value when stderr was silent.
+        my $err = '';
+        while (<ERR>) {
+            $err .= $_;
+        }
+        close ERR;
+        finish $run
+            or croak qq|FATAL: error running hmmscan ($err); ipc returned '$?'|;
+        unless ($pfamB) {
+            # A true _clan_overlap value switches clan overlap resolution off.
+            if ( $self->{_clan_overlap} ) {
+                push( @{ $self->{_header} }, "#    resolve clan overlaps: off\n" );
+            }
+            else {
+                push( @{ $self->{_header} }, "#    resolve clan overlaps: on\n" );
+                $self->_resolve_clan_overlap;
+            }
+            if ( $self->{_as} ) {
+                push( @{ $self->{_header} }, "#     predict active sites: on\n" );
+                $self->_pred_act_sites;
+            }
+            else {
+                push( @{ $self->{_header} }, "#     predict active sites: off\n" );
+            }
+            if ( $self->{_translate} ) {
+                push @{ $self->{_header} },  "#   translate DNA sequence: " . $self->{_translate} . "\n";
+            }
+            # Determine which hits are significant: both the sequence score and
+            # the domain score must reach the family's gathering thresholds.
+            # Each unit is flagged independently, so no ordering is required.
+            foreach my $result ( @{ $self->{_all_results} } ) {
+                foreach my $unit ( @{ $result->units } ) {
+                    $unit->sig(0);
+                    if ( $result->seqs->{ $unit->name }->bits >=
+                        $self->{_seqGA}->{ $unit->name } )
+                    {
+                        if ( $unit->bits >= $self->{_domGA}->{ $unit->name } ) {
+                            $unit->sig(1);
+                        }
+                    }
+                }
+            }
+        }
+        if ($firstResult) {
+            push @otherResults, $self->{_all_results};
+        }
+        else {
+            $firstResult = $self->{_all_results};
+        }
+    }    # end of "foreach $hmmlib"
+    # If more than one search, merge results into one object
+    if (@otherResults) {
+        foreach my $otherResult (@otherResults) {
+            foreach my $seq_id ( keys %{ $self->{_seq_hash} } ) {
+                my $flag;
+                # If seq exists in both, add all units from $otherResult to $firstResult
+                foreach my $result ( @{$firstResult} ) {
+                    if ( $result->seqName eq $seq_id ) {
+                        $flag = 1;
+                        foreach my $result2 ( @{$otherResult} ) {
+                            if ( $result2->seqName eq $seq_id ) {
+                                foreach my $hmmname ( keys %{ $result2->seqs } ) {
+                                    $result->addHMMSeq( $result2->seqs->{$hmmname} );
+                                }
+                                foreach my $unit ( @{ $result2->units } ) {
+                                    $result->addHMMUnit($unit);
+                                }
+                            }
+                        }
+                    }
+                }
+                # If seq doesn't exist in $firstResult, need to add both sequence and units to $firstResult
+                unless ($flag) {
+                    foreach my $result2 ( @{$otherResult} ) {
+                        if ( $result2->seqName eq $seq_id ) {
+                            push @{$firstResult}, $result2;
+                        }
+                    }
+                }
+            }
+        }
+        $self->{_all_results} = $firstResult;
+    }    # end of "if @otherResults"
+    push @{ $self->{_header} }, "# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n#\n";
+    if ( $self->{_as} ) {
+        push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>";
+    }
+    else {
+        push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>";
+    }
+    if ( $self->{_translate} ) {
+        push @{ $self->{_header} }, " <strand> <nt start> <nt end>";
+    }
+    push @{ $self->{_header} }, "\n";
+}
+#-------------------------------------------------------------------------------
+=head2 write_results
+Writes the results of the C<hmmscan> search. Takes a single argument, which can
+be an open filehandle or a filename. A fatal error is generated if a file of the
+given name already exists.
+=cut
+sub write_results {
+    # Writes the header (if any) followed by every search result.
+    #
+    # $out may be an open filehandle (a plain or blessed glob reference), a
+    # filename, or omitted, in which case output goes to STDOUT. The remaining
+    # arguments are passed through unchanged to write_ascii_out.
+    # Croaks if the named file already exists, cannot be opened, or cannot be
+    # closed cleanly. The handle written to is closed before returning, and
+    # the result of that close is returned.
+    my ( $self, $out, $e_seq, $e_dom, $b_seq, $b_dom ) = @_;
+    # reftype sees through blessing, so IO::File-style handles are recognised
+    # as filehandles instead of silently falling back to STDOUT.
+    require Scalar::Util;
+    my $out_type = ref $out ? ( Scalar::Util::reftype($out) || '' ) : '';
+    my $fh;
+    my $opened_here = 0;
+    if ( $out_type eq 'GLOB' ) {
+        # we were handed a filehandle
+        $fh = $out;
+    }
+    elsif ( $out and not ref $out ) {
+        # we were handed a filename
+        croak qq(FATAL: output file "$out" already exists) if -f $out;
+        # Three-argument open with a lexical handle: the filename can no
+        # longer be misread as an open mode, and no global FH is clobbered.
+        open( $fh, '>', $out )
+            or croak qq(FATAL: Can\'t write to your output file "$out": $!);
+        $opened_here = 1;
+    }
+    else {
+        # neither filehandle nor filename, default to STDOUT
+        $fh = \*STDOUT;
+    }
+    if ( $self->{_header} ) {
+        my $header = join '', @{ $self->{_header} };
+        print {$fh} "$header\n";
+    }
+    foreach my $result ( @{ $self->{_all_results} } ) {
+        $self->{_hmmresultIO}
+            ->write_ascii_out( $result, $fh, $self, $e_seq, $e_dom, $b_seq, $b_dom );
+    }
+    # Buffered write errors only surface at close time, so check the close of
+    # the file we opened ourselves.
+    my $closed = close $fh;
+    croak qq(FATAL: error closing output file "$out": $!)
+        if $opened_here and not $closed;
+    return $closed;
+}
+#-------------------------------------------------------------------------------
+=head2 results
+Returns the search results.
+=cut
+sub results {
+    # Collects the rows reported by every stored result object into a single
+    # array reference. $e_value is forwarded to each object's results method.
+    # Warns and returns nothing when no search results are stored yet.
+    my ( $self, $e_value ) = @_;
+    my $stored = $self->{_all_results};
+    if ( !defined $stored ) {
+        carp "WARNING: call search() before trying to retrieve results";
+        return;
+    }
+    my @collected;
+    for my $hit_set ( @{$stored} ) {
+        my $rows = $hit_set->results( $self, $e_value );
+        push @collected, @{$rows};
+    }
+    return \@collected;
+}
+#-------------------------------------------------------------------------------
+#- private methods -------------------------------------------------------------
+#-------------------------------------------------------------------------------
+=head1 PRIVATE METHODS
+=head2 _process_args
+Handles the input arguments.
+=cut
+sub _process_args {
+    # Validates the search parameters and stores them on the object.
+    #
+    # Accepts either a list of key/value pairs or a single hash reference.
+    # Keys read here: -fasta, -sequence, -e_seq, -e_dom, -b_seq, -b_dom, -dir,
+    # -cut_off, -clan_overlap, -align, -as, -cpu, -translate, -hmmlib and
+    # -version (the un-dashed "version" is still honoured).
+    # Croaks on any invalid or inconsistent parameter, on a missing data
+    # directory and on missing HMM library files.
+    my ( $self, @args ) = @_;
+    # accept both a hash and a hash ref
+    my $args = ( ref $args[0] eq 'HASH' ) ? shift @args : {@args};
+    # make sure we get a sequence
+    if ( $args->{-fasta} and $args->{-sequence} ) {
+        croak qq(FATAL: "-fasta" and "-sequence" are mutually exclusive);
+    }
+    elsif ( $args->{-fasta} ) {
+        croak qq(FATAL: fasta file "$args->{-fasta}" doesn\'t exist)
+            unless -s $args->{-fasta};
+    }
+    elsif ( $args->{-sequence} ) {
+        croak qq(FATAL: no sequence given)
+            unless length( $args->{-sequence} );
+    }
+    else {
+        croak qq(FATAL: must specify either "-fasta" or "-sequence");
+    }
+    # check the cut off
+    if ( ( $args->{-e_seq} and ( $args->{-b_seq} || $args->{-b_dom} ) )
+        or ( $args->{-b_seq} and ( $args->{-e_seq} || $args->{-e_dom} ) )
+        or ( $args->{-b_dom} and $args->{-e_dom} ) )
+    {
+        croak qq(FATAL: can\'t use e value and bit score threshold together);
+    }
+    # Start from a real (empty) array reference rather than undef.
+    $self->{_hmmscan_cutoff} = [];
+    if ( $args->{-e_seq} ) {
+        croak qq(FATAL: the E-value sequence cut-off "$args->{-e_seq}" must be a positive non-zero number)
+            unless $args->{-e_seq} > 0;
+        push @{ $self->{_hmmscan_cutoff} }, '-E', $args->{-e_seq};
+    }
+    if ( $args->{-e_dom} ) {
+        croak q(FATAL: if you supply "-e_dom" you must also supply "-e_seq")
+            unless $args->{-e_seq};
+        croak qq(FATAL: the E-value domain cut-off "$args->{-e_dom}" must be positive non-zero number)
+            unless $args->{-e_dom} > 0;
+        push @{ $self->{_hmmscan_cutoff} }, '--domE', $args->{-e_dom};
+    }
+    if ( $args->{-b_seq} ) {
+        push @{ $self->{_hmmscan_cutoff} }, '-T', $args->{-b_seq};
+    }
+    if ( $args->{-b_dom} ) {
+        croak q(FATAL: if you supply "-b_dom" you must also supply "-b_seq")
+            unless $args->{-b_seq};
+        push @{ $self->{_hmmscan_cutoff} }, '--domT', $args->{-b_dom};
+    }
+    # No explicit threshold given: fall back to the gathering thresholds.
+    unless ( @{ $self->{_hmmscan_cutoff} } ) {
+        push @{ $self->{_hmmscan_cutoff} }, '--cut_ga';
+    }
+    # make sure we have a valid directory for the HMM data files
+    croak qq(FATAL: must specify the directory holding the HMM data files with "-dir")
+        unless defined $args->{-dir};
+    croak qq(FATAL: directory "$args->{-dir}" does not exist)
+        unless -d $args->{-dir};
+    # populate the object
+    $self->{_cut_off}      = $args->{-cut_off};
+    $self->{_dir}          = $args->{-dir};
+    $self->{_clan_overlap} = $args->{-clan_overlap};
+    $self->{_fasta}        = $args->{-fasta};
+    $self->{_align}        = $args->{-align};
+    $self->{_as}           = $args->{-as};
+    $self->{_sequence}     = $args->{-sequence};
+    $self->{_cpu}          = $args->{-cpu};
+    $self->{_translate}    = $args->{-translate};
+    $self->{_hmmlib} = [];
+    if ( $args->{-hmmlib} ) {
+        if ( ref $args->{-hmmlib} eq 'ARRAY' ) {
+            push @{ $self->{_hmmlib} }, @{ $args->{-hmmlib} };
+        }
+        else {
+            push @{ $self->{_hmmlib} }, $args->{-hmmlib};
+        }
+    }
+    else {
+        push @{ $self->{_hmmlib} }, "Pfam-A.hmm";
+    }
+    # Now check that the library exists in the data dir! The HMM file itself
+    # is checked as well as its pressed binaries and the ".dat" file.
+    foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
+        croak qq(FATAL: can't find $hmmlib and/or $hmmlib binaries in "$args->{-dir}")
+            unless (
+                    -s $self->{_dir} . "/$hmmlib"
+                and -s $self->{_dir} . "/$hmmlib.h3f"
+                and -s $self->{_dir} . "/$hmmlib.h3i"
+                and -s $self->{_dir} . "/$hmmlib.h3m"
+                and -s $self->{_dir} . "/$hmmlib.h3p"
+                and -s $self->{_dir} . "/$hmmlib.dat"
+            );
+    }
+    # Read the necessary data once all libraries have been validated;
+    # _read_pfam_data itself walks over every configured library.
+    $self->_read_pfam_data;
+    $self->{_max_seqname} = 0;
+    # if there's nothing in "_sequence" try to load a fasta file
+    $self->_read_fasta
+        unless $self->{_sequence};
+    # check again for a sequence. If we don't have one at this point, bail with
+    # an error
+    croak qq(FATAL: no sequence given)
+        unless $self->{_sequence};
+    # read fasta file, store maximum sequence name and store sequences for active
+    # sites prediction
+    $self->_parse_sequence
+        unless $self->{_max_seqname};
+    if ( $self->{_as} ) {
+        $self->_parse_act_site_data
+            unless $self->{_read_read_act_site_data};
+    }
+    if ( $self->{_translate} ) {
+        $self->_translate_fasta;
+    }
+    # see if a version number was specified; every other option uses a dashed
+    # key, so "-version" is preferred while the bare "version" still works
+    $self->{_version} =
+        exists $args->{-version} ? $args->{-version} : $args->{version};
+}
+#-------------------------------------------------------------------------------
+=head2 _build_header
+Adds version to the header object
+=cut
+sub _build_header {
+    # Prepends the title line, the license text and the name of the query
+    # file to $self->{_header}.
+    #
+    # $version is optional; when it is not passed, the version stored on the
+    # object by _process_args is used. With neither, no version is printed.
+    my ( $self, $version ) = @_;
+    # Callers in this module pass no argument, so fall back to the stored
+    # value instead of interpolating an undefined variable.
+    $version = $self->{_version} unless defined $version;
+    # No fasta file is set when the search was given a raw sequence.
+    my $fasta = defined $self->{_fasta} ? $self->{_fasta} : '';
+    unshift @{ $self->{_header} },
+        '#      query sequence file: ' . $fasta . "\n";
+    unshift @{ $self->{_header} }, <<EOF_license;
+# Copyright (c) 2009 Genome Research Ltd\n# Freely distributed under the GNU
+# General Public License
+#
+# Authors: Jaina Mistry (jaina\@ebi.ac.uk),
+#          Rob Finn (rdf\@ebi.ac.uk)
+#
+# This is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later version.
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+EOF_license
+    my $v =
+        ( defined $version )
+        ? "version $version, "
+        : '';
+    unshift @{ $self->{_header} },
+        "# pfam_scan.pl, $v run at " . scalar(localtime) . "\n#\n";
+}
+#-------------------------------------------------------------------------------
+=head2 _read_fasta
+Reads a sequence from the fasta-format file that was specified in the
+parameters.
+=cut
+sub _read_fasta {
+    # Slurps the fasta file named in $self->{_fasta}. The individual lines
+    # are kept in $self->{_sequence_rows} and their concatenation in
+    # $self->{_sequence}. Croaks if the file cannot be opened.
+    my $self = shift;
+    # Three-argument open with a lexical handle, so that a filename holding
+    # mode or pipe characters is always treated as a plain file to read.
+    open( my $fasta_fh, '<', $self->{_fasta} )
+        or croak qq(FATAL: Couldn't open fasta file "$self->{_fasta}" $!\n);
+    my @rows = <$fasta_fh>;
+    close $fasta_fh;
+    $self->{_sequence_rows} = \@rows;
+    $self->{_sequence} = join '', @rows;
+}
+#-------------------------------------------------------------------------------
+=head2 _resolve_clan_overlap
+Resolves overlaps between clans.
+=cut
+sub _resolve_clan_overlap {
+    # Replaces every stored result with the object returned by its
+    # remove_overlaps_by_clan method, which is given the clan map and the
+    # nesting map held on this object.
+    my ($self) = @_;
+    my @resolved;
+    for my $hit_set ( @{ $self->{_all_results} } ) {
+        my $filtered = $hit_set->remove_overlaps_by_clan(
+            $self->{_clanmap},
+            $self->{_nested}
+        );
+        push @resolved, $filtered;
+    }
+    $self->{_all_results} = \@resolved;
+}
+#-------------------------------------------------------------------------------
+=head2 _pred_act_sites
+Predicts active sites. Takes no arguments. Populates the "act_site" field on
+each results object.
+=cut
+sub _pred_act_sites {
+    # For every unit whose family has an active site alignment loaded, calls
+    # find_as with that alignment, the known residues and the matched region
+    # of the query sequence, and stores the outcome in the unit's "act_site"
+    # field. Units of families without an alignment are skipped.
+    my ($self) = @_;
+    my $hmm_file = $self->{_dir} . '/Pfam-A.hmm';
+    for my $result ( @{ $self->{_all_results} } ) {
+        for my $unit ( @{ $result->units } ) {
+            next
+                unless ( $self->{_act_site_data}->{ $unit->name }->{'alignment'} );
+            my $family    = $unit->name;
+            my $site_data = $self->{_act_site_data}->{$family};
+            my ( $from, $to ) = ( $unit->seqFrom, $unit->seqTo );
+            # Sequence coordinates are 1-based and inclusive; substr wants a
+            # 0-based offset and a length.
+            my $seq_region = substr(
+                $self->{_seq_hash}->{ $result->seqName },
+                $from - 1,
+                $to - $from + 1
+            );
+            my $seq_se = $from . '-' . $to;
+            $unit->{act_site} = Bio::Pfam::Active_site::as_search::find_as(
+                $site_data->{'alignment'},
+                $site_data->{'residues'},
+                $result->seqName,
+                $seq_se,
+                $seq_region,
+                $family,
+                $hmm_file
+            );
+        }
+    }
+}
+#-------------------------------------------------------------------------------
+=head2 _read_pfam_data
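+Reads the ".dat" data file that accompanies each HMM library (for example
+"Pfam-A.hmm.dat") and populates the C<accmap>, C<desc>, C<seqGA>, C<domGA>,
+C<type>, C<model_len>, C<nested> and C<clanmap> hashes on the object.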
+=cut
+sub _read_pfam_data {
+    # Resets the per-family lookup hashes on the object and refills them from
+    # the "<library>.dat" file of every library in $self->{_hmmlib}.
+    # Hashes filled (all keyed on the family identifier): _accmap, _desc,
+    # _seqGA, _domGA, _type, _model_len, _nested and _clanmap.
+    # Croaks if a data file cannot be opened.
+    my $self = shift;
+    $self->{_accmap}    = {};
+    $self->{_nested}    = {};
+    $self->{_clanmap}   = {};
+    $self->{_desc}      = {};
+    $self->{_seqGA}     = {};
+    $self->{_domGA}     = {};
+    $self->{_type}      = {};
+    $self->{_model_len} = {};
+    foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
+        my $scandat = $self->{_dir} . '/' . $hmmlib . '.dat';
+        # Three-argument open with a lexical handle: the path is never
+        # interpreted as an open mode and no global handle is shared.
+        open( my $scandat_fh, '<', $scandat )
+            or croak qq(FATAL: Couldn't open "$scandat" data file: $!);
+        # Identifier of the record currently being read; the fields that
+        # follow an ID line are filed under it.
+        my $id;
+        while ( my $line = <$scandat_fh> ) {
+            if ( $line =~ m/^\#=GF ID\s+(\S+)/ ) {
+                $id = $1;
+            }
+            elsif ( $line =~ m/^\#=GF\s+AC\s+(\S+)/ ) {
+                $self->{_accmap}->{$id} = $1;
+            }
+            elsif ( $line =~ m/^\#=GF\s+DE\s+(.+)/ ) {
+                $self->{_desc}->{$id} = $1;
+            }
+            elsif ( $line =~ m/^\#=GF\s+GA\s+(\S+)\;\s+(\S+)\;/ ) {
+                # first value: sequence threshold, second: domain threshold
+                $self->{_seqGA}->{$id} = $1;
+                $self->{_domGA}->{$id} = $2;
+            }
+            elsif ( $line =~ m/^\#=GF\s+TP\s+(\S+)/ ) {
+                $self->{_type}->{$id} = $1;
+            }
+            elsif ( $line =~ m/^\#=GF\s+ML\s+(\d+)/ ) {
+                $self->{_model_len}->{$id} = $1;
+            }
+            elsif ( $line =~ /^\#=GF\s+NE\s+(\S+)/ ) {
+                # nesting is recorded in both directions
+                $self->{_nested}->{$id}->{$1} = 1;
+                $self->{_nested}->{$1}->{$id} = 1;
+            }
+            elsif ( $line =~ /^\#=GF\s+CL\s+(\S+)/ ) {
+                $self->{_clanmap}->{$id} = $1;
+            }
+        }
+        close $scandat_fh;
+        # set a flag to show that we've read the data files already
+        $self->{ '_read_' . $hmmlib } = 1;
+    }
+}
+#-------------------------------------------------------------------------------
+=head2 _parse_act_site_data
+Reads the Pfam active site data file ("active_site.dat") and populates
+the C<act_site_data> hashes on the object.
+=cut
+sub _parse_act_site_data {
+    # Reads "active_site.dat" from the data directory into
+    # $self->{_act_site_data}. For each family (ID line) an alignment is
+    # built from the AL lines and the residue positions of the RE lines are
+    # collected per sequence; the "//" line closes the record.
+    # Croaks if the file cannot be opened; unrecognised lines are warned
+    # about and skipped.
+    my $self   = shift;
+    my $as_dat = $self->{_dir} . '/active_site.dat';
+    $self->{_act_site_data} = {};
+    # Three-argument open with a lexical handle instead of the bareword AS.
+    open( my $as_fh, '<', $as_dat )
+        or croak qq(FATAL: Couldn\'t open "$as_dat" data file: $!);
+    my ( $fam_id, $aln );
+    while (<$as_fh>) {
+        if (/^ID\s+(\S+)/) {
+            $fam_id = $1;
+            # Direct method call rather than indirect object syntax.
+            $aln = Bio::SimpleAlign->new;
+        }
+        elsif (/^AL\s+(\S+)\/(\d+)\-(\d+)\s+(\S+)/) {
+            my ( $seq_id, $st, $en, $seq ) = ( $1, $2, $3, $4 );
+            $aln->add_seq(
+                Bio::Pfam::Scan::Seq->new(
+                    '-seq'   => $seq,
+                    '-id'    => $seq_id,
+                    '-start' => $st,
+                    '-end'   => $en,
+                    '-type'  => 'aligned'
+                )
+            );
+        }
+        elsif (/^RE\s+(\S+)\s+(\d+)/) {
+            my ( $seq_id, $res ) = ( $1, $2 );
+            push(
+                @{ $self->{_act_site_data}->{$fam_id}->{'residues'}->{$seq_id} },
+                $res
+            );
+        }
+        elsif (/^\/\//) {
+            # end of record: file the alignment and reset the state
+            $self->{_act_site_data}->{$fam_id}->{'alignment'} = $aln;
+            $fam_id = "";
+            $aln    = "";
+        }
+        else {
+            warn "Ignoring line:\n[$_]";
+        }
+    }
+    close $as_fh;
+    # flag checked by _process_args so the file is only parsed once
+    $self->{_read_read_act_site_data} = 1;
+}
+#-------------------------------------------------------------------------------
+=head2 _parse_sequence
+This method is used to parse the sequence and hash it on sequence
+identifier. It also stores the length of the longest sequence id
+=cut
+sub _parse_sequence {
+    # Parses the fasta-format input into $self->{_seq_hash} (sequence
+    # identifier => residues) and records the length of the longest
+    # identifier in $self->{_max_seqname}.
+    #
+    # The lines come from $self->{_sequence_rows} when a fasta file was read;
+    # otherwise the text in $self->{_sequence} is split into lines, so a
+    # sequence supplied directly is parsed as well.
+    # Croaks on duplicate identifiers and on residues that precede the first
+    # header line.
+    my $self = shift;
+    my @rows =
+          $self->{_sequence_rows}   ? @{ $self->{_sequence_rows} }
+        : defined $self->{_sequence} ? split( /^/m, $self->{_sequence} )
+        :                              ();
+    my $seq_hash = {};
+    # Identifiers are tracked separately from $seq_hash so a repeated header
+    # is caught even when the first record had no residue lines.
+    my %seen_id;
+    my $seq_id;
+    foreach my $row (@rows) {
+        next if $row =~ m/^\s*$/;    #Ignore blank lines
+        if ( $row =~ m/^>(\S+)/ ) {
+            $seq_id = $1;
+            if ( $seen_id{$seq_id}++ ) {
+                croak "FATAL: Sequence identifiers must be unique. Your fasta file contains two sequences with the same id ($seq_id)";
+            }
+            #Store the max length of seq name, use this later when printing in ascii
+            $self->{_max_seqname} = length($seq_id)
+                if ( !$self->{_max_seqname}
+                or length($seq_id) > $self->{_max_seqname} );
+        }
+        else {
+            croak "FATAL: Unrecognised format of fasta file. Each sequence must have a header line in the format '>identifier  <optional description>'"
+                unless defined $seq_id;
+            # Work on a copy so the stored rows stay untouched, and strip both
+            # Unix and DOS line endings so a stray "\r" never ends up inside
+            # the sequence.
+            ( my $residues = $row ) =~ s/\r?\n\z//;
+            $seq_hash->{$seq_id} .= $residues;
+        }
+    }
+    $self->{_seq_hash} = $seq_hash;
+}
+#-------------------------------------------------------------------------------
+=head2 _translate_fasta
+Uses the HMMER v2.3.2 program "translate" to perform a six-frame translation of
+the input sequence. Checks the parameter "-translate".
+Accepted arguments are "all" and "orf", where "all" means (from the "translate"
+help text) "translate in full, with stops; no individual ORFs" and "orf" means
+"report only ORFs greater than minlen" where minlen is set to the default of
+20.
+=cut
+sub _translate_fasta {
+    # Runs the external "translate" program on the fasta file, writing the
+    # result to "<fasta>.translated", then records for every translated
+    # sequence its strand and nucleotide start/end in $self->{_orf}. Finally
+    # $self->{_fasta} is pointed at the translated file.
+    #
+    # $self->{_translate} selects the mode: "all" (adds "-a") or "orf" (adds
+    # "-l 20"); any other value croaks. Also croaks if the program cannot be
+    # run or its output cannot be opened.
+    my ($self) = @_;
+    my $translatedFasta = $self->{_fasta} . ".translated";
+    my @params = ( 'translate', '-q', );
+    if ( $self->{_translate} eq 'all' ) {
+        push( @params, '-a' );
+    }
+    elsif ( $self->{_translate} eq 'orf' ) {
+        push( @params, '-l', '20' );
+    }
+    else {
+        croak qq(Unexpected parameter '$self->{_translate}');
+    }
+    push( @params, '-o', $translatedFasta, $self->{_fasta} );
+    print STDERR "PfamScan::translate_fasta: translate command: |@params|\n"
+        if $ENV{DEBUG};
+    my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
+        or croak qq(FATAL: error running translate; IPC::Run returned '$?');
+    close IN;
+    close OUT;
+    # Start from an empty string so the message below is well defined even
+    # when the program printed nothing on stderr.
+    my $err = '';
+    while (<ERR>) {
+        $err .= $_;
+    }
+    close ERR;
+    finish $run
+        or croak qq|FATAL: error running translate ($err); ipc returned '$?'|;
+    # qq, not qw: the message must be one interpolated string.
+    open( my $translated_fh, "<", $translatedFasta )
+        or croak qq(Could not open $translatedFasta '$!');
+    if ( $self->{_translate} eq 'orf' ) {
+        # ORF mode: the coordinates are taken from each header line.
+        while ( my $line = <$translated_fh> ) {
+            if ( $line =~ /^>\s?(\S+).*nt (\d+)\.+(\d+)/ ) {
+                $self->{_orf}->{$1}->{start}  = $2;
+                $self->{_orf}->{$1}->{end}    = $3;
+                $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? '+' : '-';
+            }
+        }
+    }
+    else {
+        # Full translation: the coordinates are derived from the frame number
+        # in the header and the translated length (three nucleotides per
+        # residue).
+        my $currentSeq;
+        my $currentFrame;
+        my $currentLen = 0;
+        my $maxEnd     = 0;
+        # Stores the coordinates of the frame read so far. Frames 0-2 are on
+        # the forward strand; later frames are on the reverse strand and are
+        # counted back from the furthest forward end seen.
+        my $record_frame = sub {
+            my $seqName = $currentSeq . $currentFrame;
+            if ( $currentFrame < 3 ) {
+                my $start = 1 + $currentFrame;
+                my $end   = $start + $currentLen - 1;
+                $self->{_orf}->{$seqName}->{strand} = '+';
+                $self->{_orf}->{$seqName}->{start}  = $start;
+                $self->{_orf}->{$seqName}->{end}    = $end;
+                $maxEnd = $end if ( $end > $maxEnd );
+            }
+            else {
+                my $start = $maxEnd - ( $currentFrame - 3 );
+                my $end   = $start - $currentLen + 1;
+                $self->{_orf}->{$seqName}->{strand} = '-';
+                $self->{_orf}->{$seqName}->{start}  = $start;
+                $self->{_orf}->{$seqName}->{end}    = $end;
+            }
+        };
+        while ( my $line = <$translated_fh> ) {
+            chomp $line;
+            if ( $line =~ /^>\s?(\S+\:)(\d+)/ ) {
+                my ( $nextSeq, $nextFrame ) = ( $1, $2 );
+                # A new header closes the previous frame, if it had residues.
+                $record_frame->() if $currentLen > 0;
+                $currentLen   = 0;
+                $currentSeq   = $nextSeq;
+                $currentFrame = $nextFrame;
+            }
+            else {
+                $currentLen += length($line) * 3;
+            }
+        }
+        # Close the last frame; skipped when the file held no header at all.
+        $record_frame->() if defined $currentSeq;
+    }
+    close $translated_fh;
+    $self->{_fasta} = $translatedFasta;
+}
+#-------------------------------------------------------------------------------
+=head1 COPYRIGHT
+Copyright (c) 2009: Genome Research Ltd.
+Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org)
+This is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
+=cut
+1;

Mercurial > repos > matteoc > agame_custom_tools

comparison pfamScan/Bio/Pfam/Scan/PfamScan.pm @ 0:68a3648c7d91 draft default tip