Mercurial > repos > matteoc > agame_custom_tools
comparison pfamScan/Bio/Pfam/Scan/PfamScan.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68a3648c7d91 |
---|---|
1 | |
2 =head1 NAME | |
3 | |
4 Bio::Pfam::Scan::PfamScan | |
5 | |
6 =cut | |
7 | |
8 package Bio::Pfam::Scan::PfamScan; | |
9 | |
10 =head1 SYNOPSIS | |
11 | |
12 my $ps = Bio::Pfam::Scan::PfamScan->new( | |
13 -cut_off => $hmmscan_cut_off, | |
14 -dir => $dir, | |
15 -clan_overlap => $clan_overlap, | |
16 -fasta => $fasta, | |
17 -align => $align, | |
18 -as => $as | |
19 ); | |
20 | |
21 $ps->search; | |
22 $ps->write_results; | |
23 | |
24 =head1 DESCRIPTION | |
25 | |
26 $Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $ | |
27 | |
28 =cut | |
29 | |
30 use strict; | |
31 use warnings; | |
32 | |
33 use Bio::Pfam::HMM::HMMResultsIO; | |
34 use Bio::Pfam::Active_site::as_search; | |
35 use Bio::SimpleAlign; | |
36 use Bio::Pfam::Scan::Seq; | |
37 | |
38 use Carp; | |
39 use IPC::Run qw( start finish ); | |
40 | |
41 #------------------------------------------------------------------------------- | |
42 #- constructor ----------------------------------------------------------------- | |
43 #------------------------------------------------------------------------------- | |
44 | |
45 =head1 METHODS | |
46 | |
47 =head2 new | |
48 | |
49 The only constructor for the object. Accepts a set of arguments that specify | |
50 the parameters for the search: | |
51 | |
52 =over | |
53 | |
54 =item -cut_off | |
55 | |
56 =item -dir | |
57 | |
58 =item -clan_overlap | |
59 | |
60 =item -fasta | |
61 | |
62 =item -sequence | |
63 | |
64 =item -align | |
65 | |
66 =item -hmm | |
67 | |
68 =item -as | |
69 | |
70 =back | |
71 | |
72 =cut | |
73 | |
74 sub new { | |
75 my ( $class, @args ) = @_; | |
76 | |
77 my $self = {}; | |
78 bless $self, $class; | |
79 | |
80 # To avoid hard coding the location for the binary, we assume it will be on the path..... | |
81 $self->{_HMMSCAN} = 'hmmscan'; | |
82 | |
83 # handle arguments, if we were given any here | |
84 $self->_process_args(@args) if @args; | |
85 | |
86 return $self; | |
87 } | |
88 | |
89 #------------------------------------------------------------------------------- | |
90 #- public methods -------------------------------------------------------------- | |
91 #------------------------------------------------------------------------------- | |
92 | |
93 =head2 search | |
94 | |
95 The main method on the object. Performs a C<hmmscan> search using the supplied | |
96 sequence and the specified HMM library. | |
97 | |
98 =cut | |
99 | |
100 sub search { | |
101 my ( $self, @args ) = @_; | |
102 | |
103 # handle the arguments, if we were handed any here | |
104 $self->_process_args(@args) if @args; | |
105 | |
106 # set up the output header | |
107 $self->_build_header; | |
108 | |
109 croak qq(FATAL: no sequence given; set the search parameters before calling "search") | |
110 unless defined $self->{_sequence}; | |
111 | |
112 my ( %AllResults, $pfamB, $firstResult ); | |
113 | |
114 foreach my $hmmlib ( @{ $self->{_hmmlib} } ) { | |
115 | |
116 my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue ); | |
117 if ( $hmmlib !~ /Pfam\-B/ ) { | |
118 @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} }; | |
119 } | |
120 else { | |
121 $pfamB = 1; | |
122 $seq_evalue = 0.001; | |
123 $dom_evalue = 0.001; | |
124 | |
125 # It's a pfamB search so use some default cut off values | |
126 push @hmmscan_cut_off, '-E', $seq_evalue, '--domE', $dom_evalue; | |
127 } | |
128 | |
129 push @{ $self->{_header} }, | |
130 "# cpu number specified: " . $self->{_cpu} . "\n" | |
131 if ( $hmmlib !~ /Pfam\-B/ and $self->{_cpu} ); | |
132 | |
133 push @{ $self->{_header} }, | |
134 "# searching against: " | |
135 . $self->{_dir} | |
136 . "/$hmmlib, with cut off " | |
137 . join( " ", @hmmscan_cut_off ) . "\n"; | |
138 my @params; | |
139 if ( $self->{_cpu} ) { | |
140 @params = ( | |
141 'hmmscan', '--notextw', '--cpu', $self->{_cpu}, @hmmscan_cut_off, | |
142 $self->{_dir} . '/' . $hmmlib, | |
143 $self->{_fasta} | |
144 ); | |
145 } | |
146 else { | |
147 @params = ( | |
148 'hmmscan', '--notextw', @hmmscan_cut_off, $self->{_dir} . '/' . $hmmlib, | |
149 $self->{_fasta} | |
150 ); | |
151 | |
152 } | |
153 | |
154 print STDERR "PfamScan::search: hmmscan command: |@params|\n" | |
155 if $ENV{DEBUG}; | |
156 print STDERR 'PfamScan::search: sequence: |' . $self->{_sequence} . "|\n" | |
157 if $ENV{DEBUG}; | |
158 | |
159 my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR | |
160 or croak qq(FATAL: error running hmmscan; IPC::Run returned '$?'); | |
161 | |
162 # print IN $self->{_sequence}; ; | |
163 close IN; | |
164 | |
165 $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->new; | |
166 $self->{_all_results} = $self->{_hmmresultIO}->parseMultiHMMER3( \*OUT ); | |
167 close OUT; | |
168 | |
169 my $err; | |
170 while (<ERR>) { | |
171 $err .= $_; | |
172 } | |
173 close ERR; | |
174 | |
175 finish $run | |
176 or croak qq|FATAL: error running hmmscan ($err); ipc returned '$?'|; | |
177 | |
178 unless ( $hmmlib =~ /Pfam\-B/ ) { | |
179 | |
180 if ( $self->{_clan_overlap} ) { | |
181 push( @{ $self->{_header} }, "# resolve clan overlaps: off\n" ); | |
182 } | |
183 else { | |
184 push( @{ $self->{_header} }, "# resolve clan overlaps: on\n" ); | |
185 $self->_resolve_clan_overlap; | |
186 } | |
187 | |
188 if ( $self->{_as} ) { | |
189 push( @{ $self->{_header} }, "# predict active sites: on\n" ); | |
190 $self->_pred_act_sites; | |
191 } | |
192 else { | |
193 push( @{ $self->{_header} }, "# predict active sites: off\n" ); | |
194 } | |
195 | |
196 if ( $self->{_translate} ) { | |
197 push @{ $self->{_header} }, "# translate DNA sequence: " . $self->{_translate} . "\n"; | |
198 } | |
199 } | |
200 | |
201 # Determine which hits are significant | |
202 foreach my $result ( @{ $self->{_all_results} } ) { | |
203 foreach | |
204 my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $result->units } ) | |
205 { | |
206 | |
207 unless ($pfamB) { | |
208 | |
209 $unit->sig(0); | |
210 if ( $result->seqs->{ $unit->name }->bits >= | |
211 $self->{_seqGA}->{ $unit->name } ) | |
212 { | |
213 if ( $unit->bits >= $self->{_domGA}->{ $unit->name } ) { | |
214 $unit->sig(1); | |
215 } | |
216 } | |
217 } | |
218 } | |
219 } | |
220 | |
221 if ($firstResult) { | |
222 $AllResults{ $self->{_all_results} } = $self->{_all_results}; | |
223 } | |
224 else { | |
225 $firstResult = $self->{_all_results}; | |
226 } | |
227 | |
228 } # end of "foreach $hmmlib" | |
229 | |
230 # If more than one search, merge results into one object | |
231 if ( keys %AllResults ) { | |
232 | |
233 foreach my $AllResult ( keys %AllResults ) { | |
234 | |
235 foreach my $seq_id ( keys %{ $self->{_seq_hash} } ) { | |
236 | |
237 my $flag; | |
238 | |
239 #If seq exists in both, add all units from $AllResult to $firstResult | |
240 foreach my $result ( @{$firstResult} ) { | |
241 | |
242 if ( $result->seqName eq $seq_id ) { | |
243 $flag = 1; | |
244 | |
245 foreach my $result2 ( @{ $AllResults{$AllResult} } ) { | |
246 | |
247 if ( $result2->seqName eq $seq_id ) { | |
248 foreach my $hmmname ( keys %{ $result2->seqs } ) { | |
249 $result->addHMMSeq( $result2->seqs->{$hmmname} ); | |
250 } | |
251 foreach my $unit ( @{ $result2->units } ) { | |
252 $result->addHMMUnit($unit); | |
253 } | |
254 } | |
255 } | |
256 } | |
257 } | |
258 | |
259 #If seq doesn't exist in $firstResult, need to add both sequence and units to $firstResult | |
260 unless ($flag) { | |
261 foreach my $result2 ( @{ $AllResults{$AllResult} } ) { | |
262 if ( $result2->seqName eq $seq_id ) { | |
263 push @{$firstResult}, $result2; | |
264 } | |
265 } | |
266 } | |
267 } | |
268 } | |
269 $self->{_all_results} = $firstResult; | |
270 | |
271 } # end of "if keys %AllResults" | |
272 | |
273 push @{ $self->{_header} }, "# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n#\n"; | |
274 | |
275 if ( $self->{_as} ) { | |
276 push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>"; | |
277 } | |
278 else { | |
279 push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>"; | |
280 } | |
281 | |
282 if ( $self->{_translate} ) { | |
283 push @{ $self->{_header} }, " <strand> <nt start> <nt end>"; | |
284 } | |
285 push @{ $self->{_header} }, "\n"; | |
286 } | |
287 | |
288 #------------------------------------------------------------------------------- | |
289 | |
290 =head2 write_results | |
291 | |
292 Writes the results of the C<hmmscan> search. Takes a single argument, which can | |
293 be an open filehandle or a filename. A fatal error is generated if a file of the | |
294 given name already exists. | |
295 | |
296 =cut | |
297 | |
298 sub write_results { | |
299 my ( $self, $out, $e_seq, $e_dom, $b_seq, $b_dom ) = @_; | |
300 | |
301 my $fh; | |
302 | |
303 if ( ref $out eq 'GLOB' ) { | |
304 | |
305 # we were handed a filehandle | |
306 $fh = $out; | |
307 } | |
308 elsif ( $out and not ref $out ) { | |
309 | |
310 # we were handed a filename | |
311 croak qq(FATAL: output file "$out" already exists) if -f $out; | |
312 | |
313 open( FH, ">$out" ) | |
314 or croak qq(FATAL: Can\'t write to your output file "$out": $!); | |
315 $fh = \*FH; | |
316 } | |
317 else { | |
318 | |
319 # neither filehandle nor filename, default to STDOUT | |
320 $fh = \*STDOUT; | |
321 } | |
322 | |
323 if ( $self->{_header} ) { | |
324 my $header = join '', @{ $self->{_header} }; | |
325 print $fh "$header\n"; | |
326 } | |
327 | |
328 foreach my $result ( @{ $self->{_all_results} } ) { | |
329 $self->{_hmmresultIO} | |
330 ->write_ascii_out( $result, $fh, $self, $e_seq, $e_dom, $b_seq, $b_dom ); | |
331 } | |
332 close $fh; | |
333 } | |
334 | |
335 #------------------------------------------------------------------------------- | |
336 | |
337 =head2 results | |
338 | |
339 Returns the search results. | |
340 | |
341 =cut | |
342 | |
343 sub results { | |
344 my ( $self, $e_value ) = @_; | |
345 | |
346 unless ( defined $self->{_all_results} ) { | |
347 carp "WARNING: call search() before trying to retrieve results"; | |
348 return; | |
349 } | |
350 | |
351 my @search_results = (); | |
352 | |
353 foreach my $hmm_result ( @{ $self->{_all_results} } ) { | |
354 push @search_results, @{ $hmm_result->results( $self, $e_value ) }; | |
355 } | |
356 | |
357 return \@search_results; | |
358 } | |
359 | |
360 #------------------------------------------------------------------------------- | |
361 #- private methods ------------------------------------------------------------- | |
362 #------------------------------------------------------------------------------- | |
363 | |
364 =head1 PRIVATE METHODS | |
365 | |
366 =head2 _process_args | |
367 | |
368 Handles the input arguments. | |
369 | |
370 =cut | |
371 | |
372 sub _process_args { | |
373 my ( $self, @args ) = @_; | |
374 | |
375 # accept both a hash and a hash ref | |
376 my $args = ( ref $args[0] eq 'HASH' ) ? shift @args : {@args}; | |
377 | |
378 # make sure we get a sequence | |
379 if ( $args->{-fasta} and $args->{-sequence} ) { | |
380 croak qq(FATAL: "-fasta" and "-sequence" are mutually exclusive); | |
381 } | |
382 elsif ( $args->{-fasta} ) { | |
383 croak qq(FATAL: fasta file "$args->{-fasta}" doesn\'t exist) | |
384 unless -s $args->{-fasta}; | |
385 } | |
386 elsif ( $args->{-sequence} ) { | |
387 croak qq(FATAL: no sequence given) | |
388 unless length( $args->{-sequence} ); | |
389 } | |
390 else { | |
391 croak qq(FATAL: must specify either "-fasta" or "-sequence"); | |
392 } | |
393 | |
394 # check the cut off | |
395 if ( ( $args->{-e_seq} and ( $args->{-b_seq} || $args->{-b_dom} ) ) | |
396 or ( $args->{-b_seq} and ( $args->{-e_seq} || $args->{-e_dom} ) ) | |
397 or ( $args->{-b_dom} and $args->{-e_dom} ) ) | |
398 { | |
399 croak qq(FATAL: can\'t use e value and bit score threshold together); | |
400 } | |
401 | |
402 $self->{_hmmscan_cutoff} = (); | |
403 if ( $args->{-e_seq} ) { | |
404 croak qq(FATAL: the E-value sequence cut-off "$args->{-e_seq}" must be a positive non-zero number) | |
405 unless $args->{-e_seq} > 0; | |
406 | |
407 push @{ $self->{_hmmscan_cutoff} }, '-E', $args->{-e_seq}; | |
408 } | |
409 | |
410 if ( $args->{-e_dom} ) { | |
411 croak q(FATAL: if you supply "-e_dom" you must also supply "-e_seq") | |
412 unless $args->{-e_seq}; | |
413 | |
414 croak qq(FATAL: the E-value domain cut-off "$args->{-e_dom}" must be positive non-zero number) | |
415 unless $args->{-e_dom} > 0; | |
416 | |
417 push @{ $self->{_hmmscan_cutoff} }, '--domE', $args->{-e_dom}; | |
418 } | |
419 | |
420 if ( $args->{-b_seq} ) { | |
421 push @{ $self->{_hmmscan_cutoff} }, '-T', $args->{-b_seq}; | |
422 } | |
423 | |
424 if ( $args->{-b_dom} ) { | |
425 croak q(FATAL: if you supply "-b_dom" you must also supply "-b_seq") | |
426 unless $args->{-b_seq}; | |
427 | |
428 push @{ $self->{_hmmscan_cutoff} }, '--domT', $args->{-b_dom}; | |
429 } | |
430 | |
431 unless ( $self->{_hmmscan_cutoff} ) { | |
432 push @{ $self->{_hmmscan_cutoff} }, '--cut_ga'; | |
433 } | |
434 | |
435 # make sure we have a valid directory for the HMM data files | |
436 croak qq(FATAL: directory "$args->{-dir}" does not exist) | |
437 unless -d $args->{-dir}; | |
438 | |
439 # populate the object | |
440 $self->{_cut_off} = $args->{-cut_off}; | |
441 $self->{_dir} = $args->{-dir}; | |
442 $self->{_clan_overlap} = $args->{-clan_overlap}; | |
443 $self->{_fasta} = $args->{-fasta}; | |
444 $self->{_align} = $args->{-align}; | |
445 $self->{_as} = $args->{-as}; | |
446 $self->{_sequence} = $args->{-sequence}; | |
447 $self->{_cpu} = $args->{-cpu}; | |
448 $self->{_translate} = $args->{-translate}; | |
449 | |
450 $self->{_hmmlib} = []; | |
451 if ( $args->{-hmmlib} ) { | |
452 if ( ref $args->{-hmmlib} eq 'ARRAY' ) { | |
453 push @{ $self->{_hmmlib} }, @{ $args->{-hmmlib} }; | |
454 } | |
455 else { | |
456 push @{ $self->{_hmmlib} }, $args->{-hmmlib}; | |
457 } | |
458 } | |
459 else { | |
460 push @{ $self->{_hmmlib} }, "Pfam-A.hmm"; | |
461 } | |
462 | |
463 # Now check that the library exists in the data dir! | |
464 foreach my $hmmlib ( @{ $self->{_hmmlib} } ) { | |
465 | |
466 croak qq(FATAL: can't find $hmmlib and/or $hmmlib binaries in "$args->{-dir}") | |
467 unless ( | |
468 -s $self->{_dir}, | |
469 "/$hmmlib" | |
470 and -s $self->{_dir} . "/$hmmlib.h3f" | |
471 and -s $self->{_dir} . "/$hmmlib.h3i" | |
472 and -s $self->{_dir} . "/$hmmlib.h3m" | |
473 and -s $self->{_dir} . "/$hmmlib.h3p" | |
474 and -s $self->{_dir} . "/$hmmlib.dat" | |
475 ); | |
476 | |
477 # read the necessary data, if it's not been read already | |
478 $self->_read_pfam_data; | |
479 } | |
480 | |
481 $self->{_max_seqname} = 0; | |
482 | |
483 # if there's nothing in "_sequence" try to load a fasta file | |
484 $self->_read_fasta | |
485 unless $self->{_sequence}; | |
486 | |
487 # check again for a sequence. If we don't have one at this point, bail with | |
488 # an error | |
489 croak qq(FATAL: no sequence given) | |
490 unless $self->{_sequence}; | |
491 | |
492 # read fasta file, store maximum sequence name and store sequences for active | |
493 # sites prediction | |
494 $self->_parse_sequence | |
495 unless $self->{_max_seqname}; | |
496 | |
497 if ( $self->{_as} ) { | |
498 $self->_parse_act_site_data | |
499 unless $self->{_read_read_act_site_data}; | |
500 } | |
501 | |
502 if ( $self->{_translate} ) { | |
503 $self->_translate_fasta; | |
504 } | |
505 | |
506 # see if a version number was specified | |
507 $self->{_version} = $args->{version}; | |
508 | |
509 } | |
510 | |
511 #------------------------------------------------------------------------------- | |
512 | |
513 =head2 _build_header | |
514 | |
515 Adds version to the header object | |
516 | |
517 =cut | |
518 | |
519 sub _build_header { | |
520 my ( $self, $version ) = @_; | |
521 | |
522 unshift @{ $self->{_header} }, | |
523 '# query sequence file: ' . $self->{_fasta} . "\n"; | |
524 | |
525 unshift @{ $self->{_header} }, <<EOF_license; | |
526 # Copyright (c) 2009 Genome Research Ltd\n# Freely distributed under the GNU | |
527 # General Public License | |
528 # | |
529 # Authors: Jaina Mistry (jaina\@ebi.ac.uk), | |
530 # Rob Finn (rdf\@ebi.ac.uk) | |
531 # | |
532 # This is free software; you can redistribute it and/or modify it under | |
533 # the terms of the GNU General Public License as published by the Free Software | |
534 # Foundation; either version 2 of the License, or (at your option) any later version. | |
535 # This program is distributed in the hope that it will be useful, but WITHOUT | |
536 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
537 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | |
538 # details. | |
539 # | |
540 # You should have received a copy of the GNU General Public License along with | |
541 # this program. If not, see <http://www.gnu.org/licenses/>. | |
542 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | |
543 EOF_license | |
544 | |
545 my $v = | |
546 ( defined $self->{_version} ) | |
547 ? "version $version, " | |
548 : ''; | |
549 | |
550 unshift @{ $self->{_header} }, | |
551 "# pfam_scan.pl, $v run at " . scalar(localtime) . "\n#\n"; | |
552 } | |
553 | |
554 #------------------------------------------------------------------------------- | |
555 | |
556 =head2 _read_fasta | |
557 | |
558 Reads a sequence from the fasta-format file that was specified in the | |
559 parameters. | |
560 | |
561 =cut | |
562 | |
563 sub _read_fasta { | |
564 my $self = shift; | |
565 | |
566 open( FASTA, $self->{_fasta} ) | |
567 or croak qq(FATAL: Couldn't open fasta file "$self->{_fasta}" $!\n); | |
568 my @rows = <FASTA>; | |
569 close FASTA; | |
570 | |
571 $self->{_sequence_rows} = \@rows; | |
572 | |
573 $self->{_sequence} = join '', @rows; | |
574 } | |
575 | |
576 #------------------------------------------------------------------------------- | |
577 | |
578 =head2 _resolve_clan_overlap | |
579 | |
580 Resolves overlaps between clans. | |
581 | |
582 =cut | |
583 | |
584 sub _resolve_clan_overlap { | |
585 my $self = shift; | |
586 | |
587 my @no_clan_overlap = (); | |
588 foreach my $result ( @{ $self->{_all_results} } ) { | |
589 my $new = | |
590 $result->remove_overlaps_by_clan( $self->{_clanmap}, $self->{_nested} ); | |
591 | |
592 push @no_clan_overlap, $new; | |
593 } | |
594 | |
595 $self->{_all_results} = \@no_clan_overlap; | |
596 } | |
597 | |
598 #------------------------------------------------------------------------------- | |
599 | |
600 =head2 _pred_act_sites | |
601 | |
602 Predicts active sites. Takes no arguments. Populates the "act_site" field on | |
603 each results object. | |
604 | |
605 =cut | |
606 | |
607 sub _pred_act_sites { | |
608 my $self = shift; | |
609 | |
610 # print STDERR "predicting active sites...\n"; | |
611 | |
612 my $hmm_file = $self->{_dir} . '/Pfam-A.hmm'; | |
613 | |
614 RESULT: foreach my $result ( @{ $self->{_all_results} } ) { | |
615 | |
616 # print STDERR "result: |" . $result->seqName . "|\n"; | |
617 | |
618 UNIT: foreach my $unit ( @{ $result->units } ) { | |
619 | |
620 # print STDERR "family: |" . $unit->name . "|\n"; | |
621 | |
622 next UNIT | |
623 unless ( $self->{_act_site_data}->{ $unit->name }->{'alignment'} ); | |
624 | |
625 my $seq_region = substr( | |
626 $self->{_seq_hash}->{ $result->seqName }, | |
627 $unit->seqFrom - 1, | |
628 $unit->seqTo - $unit->seqFrom + 1 | |
629 ); | |
630 | |
631 my $seq_se = $unit->seqFrom . '-' . $unit->seqTo; | |
632 | |
633 # print STDERR "seq_id: |" . $result->seqName . "|\n"; | |
634 # print STDERR "seq_se: |" . $seq_se . "|\n"; | |
635 # print STDERR "seq_region: |" . $seq_region . "|\n"; | |
636 # print STDERR "family: |" . $unit->name . "|\n"; | |
637 # print STDERR "hmm_file: |" . $hmm_file . "|\n"; | |
638 # print STDERR "dir: |" . $self->{_dir} . "|\n"; | |
639 | |
640 $unit->{act_site} = Bio::Pfam::Active_site::as_search::find_as( | |
641 $self->{_act_site_data}->{ $unit->name }->{'alignment'}, | |
642 $self->{_act_site_data}->{ $unit->name }->{'residues'}, | |
643 $result->seqName, | |
644 $seq_se, | |
645 $seq_region, | |
646 $unit->name, | |
647 $hmm_file | |
648 ); | |
649 } | |
650 } | |
651 } | |
652 | |
653 #------------------------------------------------------------------------------- | |
654 | |
655 =head2 _read_pfam_data | |
656 | |
657 Reads the Pfam data file ("Pfam-A.scan.dat") and populates the C<accmap>, | |
658 C<nested> and C<clanmap> hashes on the object. | |
659 | |
660 =cut | |
661 | |
662 sub _read_pfam_data { | |
663 my $self = shift; | |
664 | |
665 #print STDERR "reading " . $self->{_hmmlib} . ".dat\n" if($ENV{DEBUG}); | |
666 $self->{_accmap} = {}; | |
667 $self->{_nested} = {}; | |
668 $self->{_clanmap} = {}; | |
669 $self->{_desc} = {}; | |
670 $self->{_seqGA} = {}; | |
671 $self->{_domGA} = {}; | |
672 $self->{_type} = {}; | |
673 $self->{_model_len} = {}; | |
674 | |
675 foreach my $hmmlib ( @{ $self->{_hmmlib} } ) { | |
676 my $scandat = $self->{_dir} . '/' . $hmmlib . '.dat'; | |
677 open( SCANDAT, $scandat ) | |
678 or croak qq(FATAL: Couldn't open "$scandat" data file: $!); | |
679 my $id; | |
680 while (<SCANDAT>) { | |
681 if (m/^\#=GF ID\s+(\S+)/) { | |
682 $id = $1; | |
683 } | |
684 elsif (m/^\#=GF\s+AC\s+(\S+)/) { | |
685 $self->{_accmap}->{$id} = $1; | |
686 } | |
687 elsif (m/^\#=GF\s+DE\s+(.+)/) { | |
688 $self->{_desc}->{$id} = $1; | |
689 } | |
690 elsif (m/^\#=GF\s+GA\s+(\S+)\;\s+(\S+)\;/) { | |
691 $self->{_seqGA}->{$id} = $1; | |
692 $self->{_domGA}->{$id} = $2; | |
693 } | |
694 elsif (m/^\#=GF\s+TP\s+(\S+)/) { | |
695 $self->{_type}->{$id} = $1; | |
696 } | |
697 elsif (m/^\#=GF\s+ML\s+(\d+)/) { | |
698 $self->{_model_len}->{$id} = $1; | |
699 } | |
700 elsif (/^\#=GF\s+NE\s+(\S+)/) { | |
701 $self->{_nested}->{$id}->{$1} = 1; | |
702 $self->{_nested}->{$1}->{$id} = 1; | |
703 } | |
704 elsif (/^\#=GF\s+CL\s+(\S+)/) { | |
705 $self->{_clanmap}->{$id} = $1; | |
706 } | |
707 } | |
708 | |
709 close SCANDAT; | |
710 | |
711 # set a flag to show that we've read the data files already | |
712 $self->{ '_read_' . $hmmlib } = 1; | |
713 } | |
714 | |
715 } | |
716 | |
717 #------------------------------------------------------------------------------- | |
718 | |
719 =head2 _read_act_site_data | |
720 | |
721 Reads the Pfam active site data file ("active_site.dat") and populates | |
722 the C<act_site_data> hashes on the object. | |
723 | |
724 =cut | |
725 | |
726 sub _parse_act_site_data { | |
727 my $self = shift; | |
728 my $as_dat = $self->{_dir} . '/active_site.dat'; | |
729 | |
730 $self->{_act_site_data} = {}; | |
731 | |
732 open( AS, $as_dat ) | |
733 or croak qq(FATAL: Couldn\'t open "$as_dat" data file: $!); | |
734 | |
735 my ( $fam_id, $aln ); | |
736 | |
737 while (<AS>) { | |
738 if (/^ID\s+(\S+)/) { | |
739 $fam_id = $1; | |
740 $aln = new Bio::SimpleAlign; | |
741 } | |
742 elsif (/^AL\s+(\S+)\/(\d+)\-(\d+)\s+(\S+)/) { | |
743 my ( $seq_id, $st, $en, $seq ) = ( $1, $2, $3, $4 ); | |
744 | |
745 $aln->add_seq( | |
746 Bio::Pfam::Scan::Seq->new( | |
747 '-seq' => $seq, | |
748 '-id' => $seq_id, | |
749 '-start' => $st, | |
750 '-end' => $en, | |
751 '-type' => 'aligned' | |
752 ) | |
753 ); | |
754 } | |
755 elsif (/^RE\s+(\S+)\s+(\d+)/) { | |
756 my ( $seq_id, $res ) = ( $1, $2 ); | |
757 push( | |
758 @{ $self->{_act_site_data}->{$fam_id}->{'residues'}->{$seq_id} }, | |
759 $res | |
760 ); | |
761 | |
762 } | |
763 elsif (/^\/\//) { | |
764 | |
765 $self->{_act_site_data}->{$fam_id}->{'alignment'} = $aln; | |
766 | |
767 $fam_id = ""; | |
768 $aln = ""; | |
769 | |
770 } | |
771 else { | |
772 warn "Ignoring line:\n[$_]"; | |
773 } | |
774 } | |
775 close AS; | |
776 $self->{_read_read_act_site_data} = 1; | |
777 } | |
778 | |
779 #------------------------------------------------------------------------------- | |
780 | |
781 =head2 _parse_sequence | |
782 | |
783 This method is used to parse the sequence and hash it on sequence | |
784 identifier. It also stores the length of the longest sequence id | |
785 | |
786 =cut | |
787 | |
788 sub _parse_sequence { | |
789 my $self = shift; | |
790 | |
791 my $seq_hash = {}; | |
792 my $seq_id; | |
793 foreach ( @{ $self->{_sequence_rows} } ) { | |
794 | |
795 next if m/^\s*$/; #Ignore blank lines | |
796 | |
797 if (m/^>(\S+)/) { | |
798 $seq_id = $1; | |
799 | |
800 if ( exists( $seq_hash->{$seq_id} ) ) { | |
801 croak "FATAL: Sequence identifiers must be unique. Your fasta file contains two sequences with the same id ($seq_id)"; | |
802 } | |
803 | |
804 #Store the max length of seq name, use this later when printing in ascii | |
805 $self->{_max_seqname} = length($seq_id) | |
806 if ( !$self->{_max_seqname} | |
807 or length($seq_id) > $self->{_max_seqname} ); | |
808 } | |
809 else { | |
810 croak "FATAL: Unrecognised format of fasta file. Each sequence must have a header line in the format '>identifier <optional description>'" | |
811 unless defined $seq_id; | |
812 chomp; | |
813 $seq_hash->{$seq_id} .= $_; | |
814 } | |
815 } | |
816 | |
817 $self->{_seq_hash} = $seq_hash; | |
818 } | |
819 | |
820 #------------------------------------------------------------------------------- | |
821 | |
822 =head2 _translate_fasta | |
823 | |
824 Uses the HMMER v2.3.2 progam "translate" to perform a six-frame translation of | |
825 the input sequence. Checks the parameter "-translate". | |
826 | |
827 Accepted arguments are "all" and "orf", where "all" means (from the "translate" | |
828 help text) "translate in full, with stops; no individual ORFs" and "orf" means | |
829 "report only ORFs greater than minlen" where minlen is set to the default of | |
830 20. | |
831 | |
832 =cut | |
833 | |
834 sub _translate_fasta { | |
835 my ($self) = @_; | |
836 my $translatedFasta = $self->{_fasta} . ".translated"; | |
837 | |
838 my @params = ( 'translate', '-q', ); | |
839 if ( $self->{_translate} eq 'all' ) { | |
840 push( @params, '-a' ); | |
841 } | |
842 elsif ( $self->{_translate} eq 'orf' ) { | |
843 push( @params, '-l', '20' ); | |
844 } | |
845 else { | |
846 croak qq(Unexpected parameter '$self->{_translate}'); | |
847 } | |
848 push( @params, '-o', $translatedFasta, $self->{_fasta} ); | |
849 | |
850 print STDERR "PfamScan::translate_fasta: translate command: |@params|\n" | |
851 if $ENV{DEBUG}; | |
852 | |
853 my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR | |
854 or croak qq(FATAL: error running translate; IPC::Run returned '$?'); | |
855 | |
856 close IN; | |
857 close OUT; | |
858 | |
859 my $err; | |
860 while (<ERR>) { | |
861 $err .= $_; | |
862 } | |
863 close ERR; | |
864 | |
865 finish $run | |
866 or croak qq|FATAL: error running translate ($err); ipc returned '$?'|; | |
867 open( F, "<", $translatedFasta ) | |
868 or croak qw(Could not open $translatedFasta '$!'); | |
869 if ( $self->{_translate} eq 'orf' ) { | |
870 while (<F>) { | |
871 if (/^>\s?(\S+).*nt (\d+)\.+(\d+)/) { | |
872 $self->{_orf}->{$1}->{start} = $2; | |
873 $self->{_orf}->{$1}->{end} = $3; | |
874 $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? '+' : '-'; | |
875 } | |
876 } | |
877 } | |
878 else { | |
879 my $currentSeq; | |
880 my $currentFrame; | |
881 my $currentLen = 0; | |
882 my $maxEnd = 0; | |
883 while (<F>) { | |
884 chomp; | |
885 if (/^>\s?(\S+\:)(\d+)/) { | |
886 if ( $currentLen > 0 ) { | |
887 my $seqName = $currentSeq . $currentFrame; | |
888 if ( $currentFrame < 3 ) { | |
889 my $start = 1 + $currentFrame; | |
890 my $end = $start + $currentLen - 1; | |
891 $self->{_orf}->{$seqName}->{strand} = '+'; | |
892 $self->{_orf}->{$seqName}->{start} = $start; | |
893 $self->{_orf}->{$seqName}->{end} = $end; | |
894 $maxEnd = $end if ( $end > $maxEnd ); | |
895 } | |
896 else { | |
897 my $start = $maxEnd - ( $currentFrame - 3 ); | |
898 my $end = $start - $currentLen + 1; | |
899 $self->{_orf}->{$seqName}->{strand} = '-'; | |
900 $self->{_orf}->{$seqName}->{start} = $start; | |
901 $self->{_orf}->{$seqName}->{end} = $end; | |
902 } | |
903 } | |
904 $currentLen = 0; | |
905 $currentSeq = $1; | |
906 $currentFrame = $2; | |
907 } | |
908 else { | |
909 $currentLen += length($_) * 3; | |
910 } | |
911 } | |
912 my $seqName = $currentSeq . $currentFrame; | |
913 if ( $currentFrame < 3 ) { | |
914 my $start = 1 + $currentFrame; | |
915 my $end = $start + $currentLen - 1; | |
916 $self->{_orf}->{$seqName}->{strand} = '+'; | |
917 $self->{_orf}->{$seqName}->{start} = $start; | |
918 $self->{_orf}->{$seqName}->{end} = $end; | |
919 $maxEnd = $end if ( $end > $maxEnd ); | |
920 } | |
921 else { | |
922 my $start = $maxEnd - ( $currentFrame - 3 ); | |
923 my $end = $start - $currentLen + 1; | |
924 $self->{_orf}->{$seqName}->{strand} = '-'; | |
925 $self->{_orf}->{$seqName}->{start} = $start; | |
926 $self->{_orf}->{$seqName}->{end} = $end; | |
927 } | |
928 } | |
929 $self->{_fasta} = $translatedFasta; | |
930 } | |
931 #------------------------------------------------------------------------------- | |
932 | |
933 =head1 COPYRIGHT | |
934 | |
935 Copyright (c) 2009: Genome Research Ltd. | |
936 | |
937 Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org) | |
938 | |
939 This is free software; you can redistribute it and/or | |
940 modify it under the terms of the GNU General Public License | |
941 as published by the Free Software Foundation; either version 2 | |
942 of the License, or (at your option) any later version. | |
943 | |
944 This program is distributed in the hope that it will be useful, | |
945 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
946 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
947 GNU General Public License for more details. | |
948 | |
949 You should have received a copy of the GNU General Public License | |
950 along with this program; if not, write to the Free Software | |
951 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
952 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt | |
953 | |
954 =cut | |
955 | |
956 1; | |
957 |