Mercurial > repos > matteoc > agame_custom_tools
diff pfamScan/pfam_scan.pl @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pfamScan/pfam_scan.pl Thu Dec 22 04:45:31 2016 -0500 @@ -0,0 +1,338 @@ +#!/usr/bin/env perl + +# $Id: pfam_scan.pl 9045 2015-05-26 09:09:52Z rdf $ + +use strict; +use warnings; + +BEGIN {push @INC,"/home/inmare/galaxy/tools/pfamScan"} +use Bio::Pfam::Scan::PfamScan; +use Getopt::Long; + +my $VERSION = "1.5"; + +#------------------------------------------------------------------------------- + +# get the user options +my ( $outfile, $e_seq, $e_dom, $b_seq, $b_dom, $dir, + $clan_overlap, $fasta, $align, $help, $as, $pfamB, + $json, $only_pfamB, $cpu, $translate ); +GetOptions( 'help' => \$help, + 'outfile=s' => \$outfile, + 'e_seq=f' => \$e_seq, + 'e_dom=f' => \$e_dom, + 'b_seq=f' => \$b_seq, + 'b_dom=f' => \$b_dom, + 'dir=s' => \$dir, + 'clan_overlap' => \$clan_overlap, + 'fasta=s' => \$fasta, + 'align' => \$align, + 'h' => \$help, + 'as' => \$as, + 'pfamB' => \$pfamB, + 'only_pfamB' => \$only_pfamB, + 'json:s' => \$json, + 'cpu=i' => \$cpu, + 'translate:s' => \$translate +); + +help() if $help; +help() unless ( $dir and $fasta ); # required options + +my $pfamA; +if ( $only_pfamB or $pfamB ) { + die qq(FATAL: As of release 28.0, Pfam no longer produces Pfam-B. The -pfamB and -only_pfamB options are now obsolete.\n); + $pfamB=1; +} +else { + $pfamA=1; +} + +my @hmmlib; +push @hmmlib, 'Pfam-A.hmm' if $pfamA; +push @hmmlib, 'Pfam-B.hmm' if $pfamB; + +#------------------------------------------------------------------------------- + +# check the input parameters + +die qq(FATAL: must specify both "-dir" and "-fasta") + unless ( defined $dir and defined $fasta ); + +die qq(FATAL: can't find directory "$dir") + unless -d $dir; + +die qq(FATAL: can't find file "$fasta") + unless -s $fasta; + +foreach my $hmmlib ( @hmmlib ) { + die qq(FATAL: can't find "$hmmlib" and/or "$hmmlib" binaries and/or "$hmmlib.dat" file in "$dir") + unless ( -s "$dir/$hmmlib" and + -s "$dir/$hmmlib.h3f" and + -s "$dir/$hmmlib.h3i" and + -s "$dir/$hmmlib.h3m" and + -s "$dir/$hmmlib.h3p" and + -s "$dir/$hmmlib.dat" ); +} + +die qq(FATAL: can't use E-value or bit score threshold with Pfam-B searches; Pfam-B searches use a default cut_off of 0.001) + if ( ( $e_seq or $e_dom or $b_seq or $b_dom ) and not $pfamA ); + +die qq(FATAL: can't use E-value and bit score threshold together) + if ( ( $e_seq and ( $b_seq or $b_dom ) ) or + ( $b_seq and ( $e_seq or $e_dom ) ) or + ( $b_dom and $e_dom ) ); + +die qq(FATAL: output file "$outfile" already exists) + if ( $outfile and -s $outfile ); + +if ( $as ) { + die qq(FATAL: "-as" option only works on Pfam-A families) + unless $pfamA; + + die qq(FATAL: can't find "active_site.dat" in "$dir") + unless -s "$dir/active_site.dat"; +} + +if ( defined $translate ) { + if ( $translate eq "" ) { + # no argument to "-translate" was given, so make "orf" the default + $translate = 'orf'; + } + else { + # there was an argument to "-translate", so make sure it's valid + unless ( $translate eq "all" or $translate eq "orf" ) { + die qq(FATAL: "-translate" option accepts only "all" and "orf"); + } + } +} + +#------------------------------------------------------------------------------- + +# build the object +my $ps = Bio::Pfam::Scan::PfamScan->new( + -e_seq => $e_seq, + -e_dom => $e_dom, + -b_seq => $b_seq, + -b_dom => $b_dom, + -dir => $dir, + -clan_overlap => $clan_overlap, + -fasta => $fasta, + -align => $align, + -as => $as, + -hmmlib => \@hmmlib, + -version => $VERSION, + -cpu => $cpu, + -translate => $translate +); + +# run the search +$ps->search; + +# print the results +if ( defined $json ) { + + my $json_object; + eval { + require JSON; + $json_object = new JSON; + }; + if ( $@ ) { + die qq(FATAL: can't load JSON module; can't write JSON-format output); + } + + if ( $json eq 'pretty' ) { + $json_object->pretty( 1 ) ; + } + print $json_object->encode( $ps->results ); + +} +else { + $ps->write_results( $outfile, $e_seq, $e_dom, $b_seq, $b_dom ); +} + +exit; + +#------------------------------------------------------------------------------- + +sub help { + print STDERR <<EOF; + +pfam_scan.pl: search a FASTA file against a library of Pfam HMMs + +Usage: pfam_scan.pl -fasta <fasta_file> -dir <directory location of Pfam files> + +Additonal options: + + -h : show this help + -outfile <file> : output file, otherwise send to STDOUT + -clan_overlap : show overlapping hits within clan member families (applies to Pfam-A families only) + -align : show the HMM-sequence alignment for each match + -e_seq <n> : specify hmmscan evalue sequence cutoff for Pfam-A searches (default Pfam defined) + -e_dom <n> : specify hmmscan evalue domain cutoff for Pfam-A searches (default Pfam defined) + -b_seq <n> : specify hmmscan bit score sequence cutoff for Pfam-A searches (default Pfam defined) + -b_dom <n> : specify hmmscan bit score domain cutoff for Pfam-A searches (default Pfam defined) + -as : predict active site residues for Pfam-A matches + -json [pretty] : write results in JSON format. If the optional value "pretty" is given, + the JSON output will be formatted using the "pretty" option in the JSON + module + -cpu <n> : number of parallel CPU workers to use for multithreads (default all) + -translate [mode] : treat sequence as DNA and perform six-frame translation before searching. If the + optional value "mode" is given it must be either "all", to translate everything + and produce no individual ORFs, or "orf", to report only ORFs with length greater + than 20. If "-translate" is used without a "mode" value, the default is to + report ORFs (default no translation) + + For more help, check the perldoc: + + shell\% perldoc pfam_scan.pl + +EOF + exit; + +} + +#------------------------------------------------------------------------------- + +=head1 NAME + +pfam_scan.pl -- Search protein sequences against the Pfam HMM library + +=head1 SYNOPSIS + +pfam_scan.pl [options] -fasta <fasta_file> -dir <Pfam_data_file_dir> + +=head1 OPTIONS + +=over + +=item B<-dir> I<Pfam_data_file_dir> + +Directory containing Pfam data files [required] + +=item B<-fasta> I<fasta_file> + +Filename of input file containing sequence(s) [required] + +=item B<-outfile> I<output_file> + +Write output to C<output_file> [default: STDOUT] + +=item B<-e_seq> + +Sequence E-value cut-off [default: use Pfam GA cutoff] + +=item B<-e_dom> + +Domain E-value cut-off [default: use Pfam GA cutoff] + +=item B<-b_seq> + +Sequence bits score cut-off [default: use Pfam GA cutoff] + +=item B<-b_dom> + +Domain bits score cut-off [default: use Pfam GA cutoff] + +=item B<-clan_overlap> + +Allow sequences in different clans to overlap [default: false] + +=item B<-align> + +Show alignment snippets in results [default: false] + +=item B<-as> + +Search for active sites on Pfam-A matches [default: false] + +=item B<-json> [I<pretty>] + +Write the results in JSON format [default: false] + +=item B<-cpu> + +Number of parallel CPU workers to use for multithreads [default: all] + +=item B<-translate> [I<mode>] + +Treat the input sequence as DNA and perform a six-frame translation before +searching, using the "translate" program from the HMMER v2.3.2 package. If the +optional value I<mode> is given, it must be either "all" or "orf": "all" means +translate in full, with stops, and produce no individual ORFs; "orf" means +translate and report only ORFs of length greater than 20. If B<translate> is +used but I<mode> is omitted, the default is to translate using the "orf" +method [default: off (no translation)] + +=item B<-h> + +Display help message + +=back + +The input must be a FASTA-format file. The C<-fasta> and C<-dir> options are +mandatory. You cannot specify both an E-value and bits score threshold. + +=head1 OVERVIEW + +C<pfam_scan.pl> is a script for searching one or more protein sequences against the +library of HMMs from Pfam. It requires a local copy of the Pfam data files, which +can be obtained from the Pfam FTP area: + + ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/ + +You must also have the HMMER3 binaries installed and their locations given by your +C<PATH> environment variable. You can download the HMMER3 package at: + + ftp://selab.janelia.org/pub/software/hmmer3/ + +=head1 OUTPUT + +The output format is: +<seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues> +Example output (-as option): + + O65039.1 38 93 38 93 PF08246 Inhibitor_I29 Domain 1 58 58 45.9 2.8e-12 1 No_clan + O65039.1 126 342 126 342 PF00112 Peptidase_C1 Domain 1 216 216 296.0 1.1e-88 1 CL0125 predicted_active_site[150,285,307] + +Most of these values are derived from the output of I<hmmscan> (see HMMER3 +documentation for details). The significance value is 1 if the bit score for a +hit is greater than or equal to the curated gathering threshold for the +matching family, 0 otherwise. + +=head1 REFERENCES + +Active site residues are predicted using the method described in the publication: + +Mistry J., Bateman A., Finn R.D. "Predicting active site residue annotations in +the Pfam database." BMC Bioinformatics. 2007;8:298. PMID:17688688. + +=head1 AUTHORS + +Jaina Mistry (jaina@ebi.ac.uk), Rob Finn (rdf@ebi.ac.uk) + +=cut + +=head1 COPYRIGHT + +Copyright (c) 2009: Genome Research Ltd. + +Authors: Jaina Mistry (jaina@ebi.ac.uk), rdf (rdf@ebi.ac.uk) + +This is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +or see the on-line version at http://www.gnu.org/copyleft/gpl.txt + +=cut +