Mercurial > repos > matteoc > agame_custom_tools
view pfamScan/pfam_scan.pl @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl # $Id: pfam_scan.pl 9045 2015-05-26 09:09:52Z rdf $ use strict; use warnings; BEGIN {push @INC,"/home/inmare/galaxy/tools/pfamScan"} use Bio::Pfam::Scan::PfamScan; use Getopt::Long; my $VERSION = "1.5"; #------------------------------------------------------------------------------- # get the user options my ( $outfile, $e_seq, $e_dom, $b_seq, $b_dom, $dir, $clan_overlap, $fasta, $align, $help, $as, $pfamB, $json, $only_pfamB, $cpu, $translate ); GetOptions( 'help' => \$help, 'outfile=s' => \$outfile, 'e_seq=f' => \$e_seq, 'e_dom=f' => \$e_dom, 'b_seq=f' => \$b_seq, 'b_dom=f' => \$b_dom, 'dir=s' => \$dir, 'clan_overlap' => \$clan_overlap, 'fasta=s' => \$fasta, 'align' => \$align, 'h' => \$help, 'as' => \$as, 'pfamB' => \$pfamB, 'only_pfamB' => \$only_pfamB, 'json:s' => \$json, 'cpu=i' => \$cpu, 'translate:s' => \$translate ); help() if $help; help() unless ( $dir and $fasta ); # required options my $pfamA; if ( $only_pfamB or $pfamB ) { die qq(FATAL: As of release 28.0, Pfam no longer produces Pfam-B. The -pfamB and -only_pfamB options are now obsolete.\n); $pfamB=1; } else { $pfamA=1; } my @hmmlib; push @hmmlib, 'Pfam-A.hmm' if $pfamA; push @hmmlib, 'Pfam-B.hmm' if $pfamB; #------------------------------------------------------------------------------- # check the input parameters die qq(FATAL: must specify both "-dir" and "-fasta") unless ( defined $dir and defined $fasta ); die qq(FATAL: can't find directory "$dir") unless -d $dir; die qq(FATAL: can't find file "$fasta") unless -s $fasta; foreach my $hmmlib ( @hmmlib ) { die qq(FATAL: can't find "$hmmlib" and/or "$hmmlib" binaries and/or "$hmmlib.dat" file in "$dir") unless ( -s "$dir/$hmmlib" and -s "$dir/$hmmlib.h3f" and -s "$dir/$hmmlib.h3i" and -s "$dir/$hmmlib.h3m" and -s "$dir/$hmmlib.h3p" and -s "$dir/$hmmlib.dat" ); } die qq(FATAL: can't use E-value or bit score threshold with Pfam-B searches; Pfam-B searches use a default cut_off of 0.001) if ( ( $e_seq or $e_dom or $b_seq or $b_dom ) and not $pfamA ); die qq(FATAL: can't use E-value and bit score threshold together) if ( ( $e_seq and ( $b_seq or $b_dom ) ) or ( $b_seq and ( $e_seq or $e_dom ) ) or ( $b_dom and $e_dom ) ); die qq(FATAL: output file "$outfile" already exists) if ( $outfile and -s $outfile ); if ( $as ) { die qq(FATAL: "-as" option only works on Pfam-A families) unless $pfamA; die qq(FATAL: can't find "active_site.dat" in "$dir") unless -s "$dir/active_site.dat"; } if ( defined $translate ) { if ( $translate eq "" ) { # no argument to "-translate" was given, so make "orf" the default $translate = 'orf'; } else { # there was an argument to "-translate", so make sure it's valid unless ( $translate eq "all" or $translate eq "orf" ) { die qq(FATAL: "-translate" option accepts only "all" and "orf"); } } } #------------------------------------------------------------------------------- # build the object my $ps = Bio::Pfam::Scan::PfamScan->new( -e_seq => $e_seq, -e_dom => $e_dom, -b_seq => $b_seq, -b_dom => $b_dom, -dir => $dir, -clan_overlap => $clan_overlap, -fasta => $fasta, -align => $align, -as => $as, -hmmlib => \@hmmlib, -version => $VERSION, -cpu => $cpu, -translate => $translate ); # run the search $ps->search; # print the results if ( defined $json ) { my $json_object; eval { require JSON; $json_object = new JSON; }; if ( $@ ) { die qq(FATAL: can't load JSON module; can't write JSON-format output); } if ( $json eq 'pretty' ) { $json_object->pretty( 1 ) ; } print $json_object->encode( $ps->results ); } else { $ps->write_results( $outfile, $e_seq, $e_dom, $b_seq, $b_dom ); } exit; #------------------------------------------------------------------------------- sub help { print STDERR <<EOF; pfam_scan.pl: search a FASTA file against a library of Pfam HMMs Usage: pfam_scan.pl -fasta <fasta_file> -dir <directory location of Pfam files> Additonal options: -h : show this help -outfile <file> : output file, otherwise send to STDOUT -clan_overlap : show overlapping hits within clan member families (applies to Pfam-A families only) -align : show the HMM-sequence alignment for each match -e_seq <n> : specify hmmscan evalue sequence cutoff for Pfam-A searches (default Pfam defined) -e_dom <n> : specify hmmscan evalue domain cutoff for Pfam-A searches (default Pfam defined) -b_seq <n> : specify hmmscan bit score sequence cutoff for Pfam-A searches (default Pfam defined) -b_dom <n> : specify hmmscan bit score domain cutoff for Pfam-A searches (default Pfam defined) -as : predict active site residues for Pfam-A matches -json [pretty] : write results in JSON format. If the optional value "pretty" is given, the JSON output will be formatted using the "pretty" option in the JSON module -cpu <n> : number of parallel CPU workers to use for multithreads (default all) -translate [mode] : treat sequence as DNA and perform six-frame translation before searching. If the optional value "mode" is given it must be either "all", to translate everything and produce no individual ORFs, or "orf", to report only ORFs with length greater than 20. If "-translate" is used without a "mode" value, the default is to report ORFs (default no translation) For more help, check the perldoc: shell\% perldoc pfam_scan.pl EOF exit; } #------------------------------------------------------------------------------- =head1 NAME pfam_scan.pl -- Search protein sequences against the Pfam HMM library =head1 SYNOPSIS pfam_scan.pl [options] -fasta <fasta_file> -dir <Pfam_data_file_dir> =head1 OPTIONS =over =item B<-dir> I<Pfam_data_file_dir> Directory containing Pfam data files [required] =item B<-fasta> I<fasta_file> Filename of input file containing sequence(s) [required] =item B<-outfile> I<output_file> Write output to C<output_file> [default: STDOUT] =item B<-e_seq> Sequence E-value cut-off [default: use Pfam GA cutoff] =item B<-e_dom> Domain E-value cut-off [default: use Pfam GA cutoff] =item B<-b_seq> Sequence bits score cut-off [default: use Pfam GA cutoff] =item B<-b_dom> Domain bits score cut-off [default: use Pfam GA cutoff] =item B<-clan_overlap> Allow sequences in different clans to overlap [default: false] =item B<-align> Show alignment snippets in results [default: false] =item B<-as> Search for active sites on Pfam-A matches [default: false] =item B<-json> [I<pretty>] Write the results in JSON format [default: false] =item B<-cpu> Number of parallel CPU workers to use for multithreads [default: all] =item B<-translate> [I<mode>] Treat the input sequence as DNA and perform a six-frame translation before searching, using the "translate" program from the HMMER v2.3.2 package. If the optional value I<mode> is given, it must be either "all" or "orf": "all" means translate in full, with stops, and produce no individual ORFs; "orf" means translate and report only ORFs of length greater than 20. If B<translate> is used but I<mode> is omitted, the default is to translate using the "orf" method [default: off (no translation)] =item B<-h> Display help message =back The input must be a FASTA-format file. The C<-fasta> and C<-dir> options are mandatory. You cannot specify both an E-value and bits score threshold. =head1 OVERVIEW C<pfam_scan.pl> is a script for searching one or more protein sequences against the library of HMMs from Pfam. It requires a local copy of the Pfam data files, which can be obtained from the Pfam FTP area: ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/ You must also have the HMMER3 binaries installed and their locations given by your C<PATH> environment variable. You can download the HMMER3 package at: ftp://selab.janelia.org/pub/software/hmmer3/ =head1 OUTPUT The output format is: <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues> Example output (-as option): O65039.1 38 93 38 93 PF08246 Inhibitor_I29 Domain 1 58 58 45.9 2.8e-12 1 No_clan O65039.1 126 342 126 342 PF00112 Peptidase_C1 Domain 1 216 216 296.0 1.1e-88 1 CL0125 predicted_active_site[150,285,307] Most of these values are derived from the output of I<hmmscan> (see HMMER3 documentation for details). The significance value is 1 if the bit score for a hit is greater than or equal to the curated gathering threshold for the matching family, 0 otherwise. =head1 REFERENCES Active site residues are predicted using the method described in the publication: Mistry J., Bateman A., Finn R.D. "Predicting active site residue annotations in the Pfam database." BMC Bioinformatics. 2007;8:298. PMID:17688688. =head1 AUTHORS Jaina Mistry (jaina@ebi.ac.uk), Rob Finn (rdf@ebi.ac.uk) =cut =head1 COPYRIGHT Copyright (c) 2009: Genome Research Ltd. Authors: Jaina Mistry (jaina@ebi.ac.uk), rdf (rdf@ebi.ac.uk) This is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. or see the on-line version at http://www.gnu.org/copyleft/gpl.txt =cut