view pfamScan/pfam_scan.pl @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env perl

# $Id: pfam_scan.pl 9045 2015-05-26 09:09:52Z rdf $

use strict;
use warnings;

BEGIN {push @INC,"/home/inmare/galaxy/tools/pfamScan"}
use Bio::Pfam::Scan::PfamScan;
use Getopt::Long;

my $VERSION = "1.5"; 

#-------------------------------------------------------------------------------

# get the user options
my ( $outfile, $e_seq, $e_dom, $b_seq, $b_dom, $dir, 
     $clan_overlap, $fasta, $align, $help, $as, $pfamB, 
     $json, $only_pfamB, $cpu, $translate );
GetOptions( 'help'         => \$help,
            'outfile=s'    => \$outfile,
            'e_seq=f'      => \$e_seq,
            'e_dom=f'      => \$e_dom,
            'b_seq=f'      => \$b_seq,
            'b_dom=f'      => \$b_dom,
            'dir=s'        => \$dir,
            'clan_overlap' => \$clan_overlap,
            'fasta=s'      => \$fasta,
            'align'        => \$align,
            'h'            => \$help,
            'as'           => \$as,
            'pfamB'        => \$pfamB,
            'only_pfamB'   => \$only_pfamB,
            'json:s'       => \$json,
            'cpu=i'        => \$cpu,
            'translate:s'  => \$translate
);

help() if $help;
help() unless ( $dir and $fasta ); # required options

my $pfamA;
if ( $only_pfamB or $pfamB ) {
  die qq(FATAL: As of release 28.0, Pfam no longer produces Pfam-B. The -pfamB and -only_pfamB options are now obsolete.\n);
  $pfamB=1;
}
else {
  $pfamA=1;
}

my @hmmlib;
push @hmmlib, 'Pfam-A.hmm' if $pfamA;
push @hmmlib, 'Pfam-B.hmm' if $pfamB;

#-------------------------------------------------------------------------------

# check the input parameters

die qq(FATAL: must specify both "-dir" and "-fasta")
  unless ( defined $dir and defined $fasta );

die qq(FATAL: can't find directory "$dir")
  unless -d $dir;

die qq(FATAL: can't find file "$fasta")
  unless -s $fasta;

foreach my $hmmlib ( @hmmlib ) {
  die qq(FATAL: can't find "$hmmlib" and/or "$hmmlib" binaries and/or "$hmmlib.dat" file in "$dir")
    unless ( -s "$dir/$hmmlib"     and 
             -s "$dir/$hmmlib.h3f" and
             -s "$dir/$hmmlib.h3i" and
             -s "$dir/$hmmlib.h3m" and
             -s "$dir/$hmmlib.h3p" and
             -s "$dir/$hmmlib.dat" );
}

die qq(FATAL: can't use E-value or bit score threshold with Pfam-B searches; Pfam-B searches use a default cut_off of 0.001)
  if ( ( $e_seq or $e_dom or $b_seq or $b_dom ) and not $pfamA ); 

die qq(FATAL: can't use E-value and bit score threshold together)
  if ( ( $e_seq and ( $b_seq or $b_dom ) ) or 
       ( $b_seq and ( $e_seq or $e_dom ) ) or 
       ( $b_dom and $e_dom ) );

die qq(FATAL: output file "$outfile" already exists)
  if ( $outfile and -s $outfile );

if ( $as ) {
  die qq(FATAL: "-as" option only works on Pfam-A families)
    unless $pfamA;

  die qq(FATAL: can't find "active_site.dat" in "$dir")
    unless -s "$dir/active_site.dat";
}

if ( defined $translate ) {
  if ( $translate eq "" ) {
    # no argument to "-translate" was given, so make "orf" the default
    $translate = 'orf';
  }
  else {
    # there was an argument to "-translate", so make sure it's valid
    unless ( $translate eq "all" or $translate eq "orf" ) {
      die qq(FATAL: "-translate" option accepts only "all" and "orf");
    }
  }
}

#-------------------------------------------------------------------------------

# build the object
my $ps = Bio::Pfam::Scan::PfamScan->new(
  -e_seq        => $e_seq,
  -e_dom        => $e_dom,
  -b_seq        => $b_seq,
  -b_dom        => $b_dom,
  -dir          => $dir,
  -clan_overlap => $clan_overlap,
  -fasta        => $fasta,
  -align        => $align,
  -as           => $as,
  -hmmlib       => \@hmmlib,
  -version      => $VERSION,
  -cpu          => $cpu,
  -translate    => $translate
);

# run the search
$ps->search;

# print the results
if ( defined $json ) {

  my $json_object;
  eval {
    require JSON;
    $json_object = new JSON;
  };
  if ( $@ ) {
    die qq(FATAL: can't load JSON module; can't write JSON-format output);
  }

  if ( $json eq 'pretty' ) {
    $json_object->pretty( 1 ) ;
  }
  print $json_object->encode( $ps->results );

}
else {
  $ps->write_results( $outfile, $e_seq, $e_dom, $b_seq, $b_dom );
}

exit;

#-------------------------------------------------------------------------------

sub help {
  print STDERR <<EOF;

pfam_scan.pl: search a FASTA file against a library of Pfam HMMs

Usage: pfam_scan.pl -fasta <fasta_file> -dir <directory location of Pfam files>

Additonal options:

  -h                : show this help
  -outfile <file>   : output file, otherwise send to STDOUT
  -clan_overlap     : show overlapping hits within clan member families (applies to Pfam-A families only)
  -align            : show the HMM-sequence alignment for each match
  -e_seq <n>        : specify hmmscan evalue sequence cutoff for Pfam-A searches (default Pfam defined)
  -e_dom <n>        : specify hmmscan evalue domain cutoff for Pfam-A searches (default Pfam defined)
  -b_seq <n>        : specify hmmscan bit score sequence cutoff for Pfam-A searches (default Pfam defined)
  -b_dom <n>        : specify hmmscan bit score domain cutoff for Pfam-A searches (default Pfam defined)
  -as               : predict active site residues for Pfam-A matches
  -json [pretty]    : write results in JSON format. If the optional value "pretty" is given,
                      the JSON output will be formatted using the "pretty" option in the JSON
                      module
  -cpu <n>          : number of parallel CPU workers to use for multithreads (default all)
  -translate [mode] : treat sequence as DNA and perform six-frame translation before searching. If the
                      optional value "mode" is given it must be either "all", to translate everything 
                      and produce no individual ORFs, or "orf", to report only ORFs with length greater 
                      than 20. If "-translate" is used without a "mode" value, the default is to 
                      report ORFs (default no translation)

  For more help, check the perldoc:

      shell\% perldoc pfam_scan.pl

EOF
  exit;

}

#-------------------------------------------------------------------------------

=head1 NAME

pfam_scan.pl -- Search protein sequences against the Pfam HMM library

=head1 SYNOPSIS

pfam_scan.pl [options] -fasta <fasta_file> -dir <Pfam_data_file_dir>

=head1 OPTIONS

=over

=item B<-dir> I<Pfam_data_file_dir>

Directory containing Pfam data files [required]

=item B<-fasta> I<fasta_file>

Filename of input file containing sequence(s) [required]

=item B<-outfile> I<output_file>

Write output to C<output_file> [default: STDOUT]

=item B<-e_seq>

Sequence E-value cut-off [default: use Pfam GA cutoff]

=item B<-e_dom> 

Domain E-value cut-off [default: use Pfam GA cutoff]

=item B<-b_seq>

Sequence bits score cut-off [default: use Pfam GA cutoff]

=item B<-b_dom>

Domain bits score cut-off [default: use Pfam GA cutoff]

=item B<-clan_overlap>

Allow sequences in different clans to overlap [default: false]

=item B<-align>

Show alignment snippets in results [default: false]

=item B<-as>

Search for active sites on Pfam-A matches [default: false]

=item B<-json> [I<pretty>]

Write the results in JSON format [default: false]

=item B<-cpu>

Number of parallel CPU workers to use for multithreads [default: all]

=item B<-translate> [I<mode>]

Treat the input sequence as DNA and perform a six-frame translation before
searching, using the "translate" program from the HMMER v2.3.2 package. If the
optional value I<mode> is given, it must be either "all" or "orf": "all" means
translate in full, with stops, and produce no individual ORFs; "orf" means
translate and report only ORFs of length greater than 20. If B<translate> is
used but I<mode> is omitted, the default is to translate using the "orf"
method [default: off (no translation)]

=item B<-h>

Display help message

=back

The input must be a FASTA-format file. The C<-fasta> and C<-dir> options are 
mandatory. You cannot specify both an E-value and bits score threshold.  

=head1 OVERVIEW

C<pfam_scan.pl> is a script for searching one or more protein sequences against the
library of HMMs from Pfam. It requires a local copy of the Pfam data files, which 
can be obtained from the Pfam FTP area:

  ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/

You must also have the HMMER3 binaries installed and their locations given by your
C<PATH> environment variable. You can download the HMMER3 package at:

  ftp://selab.janelia.org/pub/software/hmmer3/

=head1 OUTPUT

The output format is:
<seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>
Example output (-as option):

  O65039.1     38     93     38     93 PF08246   Inhibitor_I29     Domain     1    58    58     45.9   2.8e-12   1 No_clan
  O65039.1    126    342    126    342 PF00112   Peptidase_C1      Domain     1   216   216    296.0   1.1e-88   1 CL0125   predicted_active_site[150,285,307]

Most of these values are derived from the output of I<hmmscan> (see HMMER3
documentation for details). The significance value is 1 if the bit score for a
hit is greater than or equal to the curated gathering threshold for the
matching family, 0 otherwise. 

=head1 REFERENCES

Active site residues are predicted using the method described in the publication: 

Mistry J., Bateman A., Finn R.D. "Predicting active site residue annotations in 
the Pfam database." BMC Bioinformatics. 2007;8:298. PMID:17688688.

=head1 AUTHORS

Jaina Mistry (jaina@ebi.ac.uk), Rob Finn (rdf@ebi.ac.uk)

=cut

=head1 COPYRIGHT

Copyright (c) 2009: Genome Research Ltd.

Authors: Jaina Mistry (jaina@ebi.ac.uk), rdf (rdf@ebi.ac.uk)

This is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
 
=cut