comparison pfamScan/pfam_scan.pl @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:68a3648c7d91
1 #!/usr/bin/env perl
2
3 # $Id: pfam_scan.pl 9045 2015-05-26 09:09:52Z rdf $
4
5 use strict;
6 use warnings;
7
8 BEGIN {push @INC,"/home/inmare/galaxy/tools/pfamScan"}
9 use Bio::Pfam::Scan::PfamScan;
10 use Getopt::Long;
11
12 my $VERSION = "1.5";
13
14 #-------------------------------------------------------------------------------
15
16 # get the user options
17 my ( $outfile, $e_seq, $e_dom, $b_seq, $b_dom, $dir,
18 $clan_overlap, $fasta, $align, $help, $as, $pfamB,
19 $json, $only_pfamB, $cpu, $translate );
20 GetOptions( 'help' => \$help,
21 'outfile=s' => \$outfile,
22 'e_seq=f' => \$e_seq,
23 'e_dom=f' => \$e_dom,
24 'b_seq=f' => \$b_seq,
25 'b_dom=f' => \$b_dom,
26 'dir=s' => \$dir,
27 'clan_overlap' => \$clan_overlap,
28 'fasta=s' => \$fasta,
29 'align' => \$align,
30 'h' => \$help,
31 'as' => \$as,
32 'pfamB' => \$pfamB,
33 'only_pfamB' => \$only_pfamB,
34 'json:s' => \$json,
35 'cpu=i' => \$cpu,
36 'translate:s' => \$translate
37 );
38
39 help() if $help;
40 help() unless ( $dir and $fasta ); # required options
41
42 my $pfamA;
43 if ( $only_pfamB or $pfamB ) {
44 die qq(FATAL: As of release 28.0, Pfam no longer produces Pfam-B. The -pfamB and -only_pfamB options are now obsolete.\n);
45 $pfamB=1;
46 }
47 else {
48 $pfamA=1;
49 }
50
51 my @hmmlib;
52 push @hmmlib, 'Pfam-A.hmm' if $pfamA;
53 push @hmmlib, 'Pfam-B.hmm' if $pfamB;
54
55 #-------------------------------------------------------------------------------
56
57 # check the input parameters
58
59 die qq(FATAL: must specify both "-dir" and "-fasta")
60 unless ( defined $dir and defined $fasta );
61
62 die qq(FATAL: can't find directory "$dir")
63 unless -d $dir;
64
65 die qq(FATAL: can't find file "$fasta")
66 unless -s $fasta;
67
68 foreach my $hmmlib ( @hmmlib ) {
69 die qq(FATAL: can't find "$hmmlib" and/or "$hmmlib" binaries and/or "$hmmlib.dat" file in "$dir")
70 unless ( -s "$dir/$hmmlib" and
71 -s "$dir/$hmmlib.h3f" and
72 -s "$dir/$hmmlib.h3i" and
73 -s "$dir/$hmmlib.h3m" and
74 -s "$dir/$hmmlib.h3p" and
75 -s "$dir/$hmmlib.dat" );
76 }
77
78 die qq(FATAL: can't use E-value or bit score threshold with Pfam-B searches; Pfam-B searches use a default cut_off of 0.001)
79 if ( ( $e_seq or $e_dom or $b_seq or $b_dom ) and not $pfamA );
80
81 die qq(FATAL: can't use E-value and bit score threshold together)
82 if ( ( $e_seq and ( $b_seq or $b_dom ) ) or
83 ( $b_seq and ( $e_seq or $e_dom ) ) or
84 ( $b_dom and $e_dom ) );
85
86 die qq(FATAL: output file "$outfile" already exists)
87 if ( $outfile and -s $outfile );
88
89 if ( $as ) {
90 die qq(FATAL: "-as" option only works on Pfam-A families)
91 unless $pfamA;
92
93 die qq(FATAL: can't find "active_site.dat" in "$dir")
94 unless -s "$dir/active_site.dat";
95 }
96
97 if ( defined $translate ) {
98 if ( $translate eq "" ) {
99 # no argument to "-translate" was given, so make "orf" the default
100 $translate = 'orf';
101 }
102 else {
103 # there was an argument to "-translate", so make sure it's valid
104 unless ( $translate eq "all" or $translate eq "orf" ) {
105 die qq(FATAL: "-translate" option accepts only "all" and "orf");
106 }
107 }
108 }
109
110 #-------------------------------------------------------------------------------
111
112 # build the object
113 my $ps = Bio::Pfam::Scan::PfamScan->new(
114 -e_seq => $e_seq,
115 -e_dom => $e_dom,
116 -b_seq => $b_seq,
117 -b_dom => $b_dom,
118 -dir => $dir,
119 -clan_overlap => $clan_overlap,
120 -fasta => $fasta,
121 -align => $align,
122 -as => $as,
123 -hmmlib => \@hmmlib,
124 -version => $VERSION,
125 -cpu => $cpu,
126 -translate => $translate
127 );
128
129 # run the search
130 $ps->search;
131
132 # print the results
133 if ( defined $json ) {
134
135 my $json_object;
136 eval {
137 require JSON;
138 $json_object = new JSON;
139 };
140 if ( $@ ) {
141 die qq(FATAL: can't load JSON module; can't write JSON-format output);
142 }
143
144 if ( $json eq 'pretty' ) {
145 $json_object->pretty( 1 ) ;
146 }
147 print $json_object->encode( $ps->results );
148
149 }
150 else {
151 $ps->write_results( $outfile, $e_seq, $e_dom, $b_seq, $b_dom );
152 }
153
154 exit;
155
156 #-------------------------------------------------------------------------------
157
158 sub help {
159 print STDERR <<EOF;
160
161 pfam_scan.pl: search a FASTA file against a library of Pfam HMMs
162
163 Usage: pfam_scan.pl -fasta <fasta_file> -dir <directory location of Pfam files>
164
165 Additonal options:
166
167 -h : show this help
168 -outfile <file> : output file, otherwise send to STDOUT
169 -clan_overlap : show overlapping hits within clan member families (applies to Pfam-A families only)
170 -align : show the HMM-sequence alignment for each match
171 -e_seq <n> : specify hmmscan evalue sequence cutoff for Pfam-A searches (default Pfam defined)
172 -e_dom <n> : specify hmmscan evalue domain cutoff for Pfam-A searches (default Pfam defined)
173 -b_seq <n> : specify hmmscan bit score sequence cutoff for Pfam-A searches (default Pfam defined)
174 -b_dom <n> : specify hmmscan bit score domain cutoff for Pfam-A searches (default Pfam defined)
175 -as : predict active site residues for Pfam-A matches
176 -json [pretty] : write results in JSON format. If the optional value "pretty" is given,
177 the JSON output will be formatted using the "pretty" option in the JSON
178 module
179 -cpu <n> : number of parallel CPU workers to use for multithreads (default all)
180 -translate [mode] : treat sequence as DNA and perform six-frame translation before searching. If the
181 optional value "mode" is given it must be either "all", to translate everything
182 and produce no individual ORFs, or "orf", to report only ORFs with length greater
183 than 20. If "-translate" is used without a "mode" value, the default is to
184 report ORFs (default no translation)
185
186 For more help, check the perldoc:
187
188 shell\% perldoc pfam_scan.pl
189
190 EOF
191 exit;
192
193 }
194
195 #-------------------------------------------------------------------------------
196
197 =head1 NAME
198
199 pfam_scan.pl -- Search protein sequences against the Pfam HMM library
200
201 =head1 SYNOPSIS
202
203 pfam_scan.pl [options] -fasta <fasta_file> -dir <Pfam_data_file_dir>
204
205 =head1 OPTIONS
206
207 =over
208
209 =item B<-dir> I<Pfam_data_file_dir>
210
211 Directory containing Pfam data files [required]
212
213 =item B<-fasta> I<fasta_file>
214
215 Filename of input file containing sequence(s) [required]
216
217 =item B<-outfile> I<output_file>
218
219 Write output to C<output_file> [default: STDOUT]
220
221 =item B<-e_seq>
222
223 Sequence E-value cut-off [default: use Pfam GA cutoff]
224
225 =item B<-e_dom>
226
227 Domain E-value cut-off [default: use Pfam GA cutoff]
228
229 =item B<-b_seq>
230
231 Sequence bits score cut-off [default: use Pfam GA cutoff]
232
233 =item B<-b_dom>
234
235 Domain bits score cut-off [default: use Pfam GA cutoff]
236
237 =item B<-clan_overlap>
238
239 Allow sequences in different clans to overlap [default: false]
240
241 =item B<-align>
242
243 Show alignment snippets in results [default: false]
244
245 =item B<-as>
246
247 Search for active sites on Pfam-A matches [default: false]
248
249 =item B<-json> [I<pretty>]
250
251 Write the results in JSON format [default: false]
252
253 =item B<-cpu>
254
255 Number of parallel CPU workers to use for multithreads [default: all]
256
257 =item B<-translate> [I<mode>]
258
259 Treat the input sequence as DNA and perform a six-frame translation before
260 searching, using the "translate" program from the HMMER v2.3.2 package. If the
261 optional value I<mode> is given, it must be either "all" or "orf": "all" means
262 translate in full, with stops, and produce no individual ORFs; "orf" means
263 translate and report only ORFs of length greater than 20. If B<translate> is
264 used but I<mode> is omitted, the default is to translate using the "orf"
265 method [default: off (no translation)]
266
267 =item B<-h>
268
269 Display help message
270
271 =back
272
273 The input must be a FASTA-format file. The C<-fasta> and C<-dir> options are
274 mandatory. You cannot specify both an E-value and bits score threshold.
275
276 =head1 OVERVIEW
277
278 C<pfam_scan.pl> is a script for searching one or more protein sequences against the
279 library of HMMs from Pfam. It requires a local copy of the Pfam data files, which
280 can be obtained from the Pfam FTP area:
281
282 ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/
283
284 You must also have the HMMER3 binaries installed and their locations given by your
285 C<PATH> environment variable. You can download the HMMER3 package at:
286
287 ftp://selab.janelia.org/pub/software/hmmer3/
288
289 =head1 OUTPUT
290
291 The output format is:
292 <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>
293 Example output (-as option):
294
295 O65039.1 38 93 38 93 PF08246 Inhibitor_I29 Domain 1 58 58 45.9 2.8e-12 1 No_clan
296 O65039.1 126 342 126 342 PF00112 Peptidase_C1 Domain 1 216 216 296.0 1.1e-88 1 CL0125 predicted_active_site[150,285,307]
297
298 Most of these values are derived from the output of I<hmmscan> (see HMMER3
299 documentation for details). The significance value is 1 if the bit score for a
300 hit is greater than or equal to the curated gathering threshold for the
301 matching family, 0 otherwise.
302
303 =head1 REFERENCES
304
305 Active site residues are predicted using the method described in the publication:
306
307 Mistry J., Bateman A., Finn R.D. "Predicting active site residue annotations in
308 the Pfam database." BMC Bioinformatics. 2007;8:298. PMID:17688688.
309
310 =head1 AUTHORS
311
312 Jaina Mistry (jaina@ebi.ac.uk), Rob Finn (rdf@ebi.ac.uk)
313
314 =cut
315
316 =head1 COPYRIGHT
317
318 Copyright (c) 2009: Genome Research Ltd.
319
320 Authors: Jaina Mistry (jaina@ebi.ac.uk), rdf (rdf@ebi.ac.uk)
321
322 This is free software; you can redistribute it and/or
323 modify it under the terms of the GNU General Public License
324 as published by the Free Software Foundation; either version 2
325 of the License, or (at your option) any later version.
326
327 This program is distributed in the hope that it will be useful,
328 but WITHOUT ANY WARRANTY; without even the implied warranty of
329 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
330 GNU General Public License for more details.
331
332 You should have received a copy of the GNU General Public License
333 along with this program; if not, write to the Free Software
334 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
335 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
336
337 =cut
338