comparison pfamScan/Bio/Pfam/Scan/PfamScan.pm @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:68a3648c7d91
1
2 =head1 NAME
3
4 Bio::Pfam::Scan::PfamScan
5
6 =cut
7
8 package Bio::Pfam::Scan::PfamScan;
9
10 =head1 SYNOPSIS
11
12 my $ps = Bio::Pfam::Scan::PfamScan->new(
13 -cut_off => $hmmscan_cut_off,
14 -dir => $dir,
15 -clan_overlap => $clan_overlap,
16 -fasta => $fasta,
17 -align => $align,
18 -as => $as
19 );
20
21 $ps->search;
22 $ps->write_results;
23
24 =head1 DESCRIPTION
25
26 $Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $
27
28 =cut
29
30 use strict;
31 use warnings;
32
33 use Bio::Pfam::HMM::HMMResultsIO;
34 use Bio::Pfam::Active_site::as_search;
35 use Bio::SimpleAlign;
36 use Bio::Pfam::Scan::Seq;
37
38 use Carp;
39 use IPC::Run qw( start finish );
40
41 #-------------------------------------------------------------------------------
42 #- constructor -----------------------------------------------------------------
43 #-------------------------------------------------------------------------------
44
45 =head1 METHODS
46
47 =head2 new
48
49 The only constructor for the object. Accepts a set of arguments that specify
50 the parameters for the search:
51
52 =over
53
54 =item -cut_off
55
56 =item -dir
57
58 =item -clan_overlap
59
60 =item -fasta
61
62 =item -sequence
63
64 =item -align
65
66 =item -hmm
67
68 =item -as
69
70 =back
71
72 =cut
73
74 sub new {
75 my ( $class, @args ) = @_;
76
77 my $self = {};
78 bless $self, $class;
79
80 # To avoid hard coding the location for the binary, we assume it will be on the path.....
81 $self->{_HMMSCAN} = 'hmmscan';
82
83 # handle arguments, if we were given any here
84 $self->_process_args(@args) if @args;
85
86 return $self;
87 }
88
89 #-------------------------------------------------------------------------------
90 #- public methods --------------------------------------------------------------
91 #-------------------------------------------------------------------------------
92
93 =head2 search
94
95 The main method on the object. Performs a C<hmmscan> search using the supplied
96 sequence and the specified HMM library.
97
98 =cut
99
100 sub search {
101 my ( $self, @args ) = @_;
102
103 # handle the arguments, if we were handed any here
104 $self->_process_args(@args) if @args;
105
106 # set up the output header
107 $self->_build_header;
108
109 croak qq(FATAL: no sequence given; set the search parameters before calling "search")
110 unless defined $self->{_sequence};
111
112 my ( %AllResults, $pfamB, $firstResult );
113
114 foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
115
116 my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue );
117 if ( $hmmlib !~ /Pfam\-B/ ) {
118 @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} };
119 }
120 else {
121 $pfamB = 1;
122 $seq_evalue = 0.001;
123 $dom_evalue = 0.001;
124
125 # It's a pfamB search so use some default cut off values
126 push @hmmscan_cut_off, '-E', $seq_evalue, '--domE', $dom_evalue;
127 }
128
129 push @{ $self->{_header} },
130 "# cpu number specified: " . $self->{_cpu} . "\n"
131 if ( $hmmlib !~ /Pfam\-B/ and $self->{_cpu} );
132
133 push @{ $self->{_header} },
134 "# searching against: "
135 . $self->{_dir}
136 . "/$hmmlib, with cut off "
137 . join( " ", @hmmscan_cut_off ) . "\n";
138 my @params;
139 if ( $self->{_cpu} ) {
140 @params = (
141 'hmmscan', '--notextw', '--cpu', $self->{_cpu}, @hmmscan_cut_off,
142 $self->{_dir} . '/' . $hmmlib,
143 $self->{_fasta}
144 );
145 }
146 else {
147 @params = (
148 'hmmscan', '--notextw', @hmmscan_cut_off, $self->{_dir} . '/' . $hmmlib,
149 $self->{_fasta}
150 );
151
152 }
153
154 print STDERR "PfamScan::search: hmmscan command: |@params|\n"
155 if $ENV{DEBUG};
156 print STDERR 'PfamScan::search: sequence: |' . $self->{_sequence} . "|\n"
157 if $ENV{DEBUG};
158
159 my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
160 or croak qq(FATAL: error running hmmscan; IPC::Run returned '$?');
161
162 # print IN $self->{_sequence}; ;
163 close IN;
164
165 $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->new;
166 $self->{_all_results} = $self->{_hmmresultIO}->parseMultiHMMER3( \*OUT );
167 close OUT;
168
169 my $err;
170 while (<ERR>) {
171 $err .= $_;
172 }
173 close ERR;
174
175 finish $run
176 or croak qq|FATAL: error running hmmscan ($err); ipc returned '$?'|;
177
178 unless ( $hmmlib =~ /Pfam\-B/ ) {
179
180 if ( $self->{_clan_overlap} ) {
181 push( @{ $self->{_header} }, "# resolve clan overlaps: off\n" );
182 }
183 else {
184 push( @{ $self->{_header} }, "# resolve clan overlaps: on\n" );
185 $self->_resolve_clan_overlap;
186 }
187
188 if ( $self->{_as} ) {
189 push( @{ $self->{_header} }, "# predict active sites: on\n" );
190 $self->_pred_act_sites;
191 }
192 else {
193 push( @{ $self->{_header} }, "# predict active sites: off\n" );
194 }
195
196 if ( $self->{_translate} ) {
197 push @{ $self->{_header} }, "# translate DNA sequence: " . $self->{_translate} . "\n";
198 }
199 }
200
201 # Determine which hits are significant
202 foreach my $result ( @{ $self->{_all_results} } ) {
203 foreach
204 my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $result->units } )
205 {
206
207 unless ($pfamB) {
208
209 $unit->sig(0);
210 if ( $result->seqs->{ $unit->name }->bits >=
211 $self->{_seqGA}->{ $unit->name } )
212 {
213 if ( $unit->bits >= $self->{_domGA}->{ $unit->name } ) {
214 $unit->sig(1);
215 }
216 }
217 }
218 }
219 }
220
221 if ($firstResult) {
222 $AllResults{ $self->{_all_results} } = $self->{_all_results};
223 }
224 else {
225 $firstResult = $self->{_all_results};
226 }
227
228 } # end of "foreach $hmmlib"
229
230 # If more than one search, merge results into one object
231 if ( keys %AllResults ) {
232
233 foreach my $AllResult ( keys %AllResults ) {
234
235 foreach my $seq_id ( keys %{ $self->{_seq_hash} } ) {
236
237 my $flag;
238
239 #If seq exists in both, add all units from $AllResult to $firstResult
240 foreach my $result ( @{$firstResult} ) {
241
242 if ( $result->seqName eq $seq_id ) {
243 $flag = 1;
244
245 foreach my $result2 ( @{ $AllResults{$AllResult} } ) {
246
247 if ( $result2->seqName eq $seq_id ) {
248 foreach my $hmmname ( keys %{ $result2->seqs } ) {
249 $result->addHMMSeq( $result2->seqs->{$hmmname} );
250 }
251 foreach my $unit ( @{ $result2->units } ) {
252 $result->addHMMUnit($unit);
253 }
254 }
255 }
256 }
257 }
258
259 #If seq doesn't exist in $firstResult, need to add both sequence and units to $firstResult
260 unless ($flag) {
261 foreach my $result2 ( @{ $AllResults{$AllResult} } ) {
262 if ( $result2->seqName eq $seq_id ) {
263 push @{$firstResult}, $result2;
264 }
265 }
266 }
267 }
268 }
269 $self->{_all_results} = $firstResult;
270
271 } # end of "if keys %AllResults"
272
273 push @{ $self->{_header} }, "# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n#\n";
274
275 if ( $self->{_as} ) {
276 push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>";
277 }
278 else {
279 push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>";
280 }
281
282 if ( $self->{_translate} ) {
283 push @{ $self->{_header} }, " <strand> <nt start> <nt end>";
284 }
285 push @{ $self->{_header} }, "\n";
286 }
287
288 #-------------------------------------------------------------------------------
289
290 =head2 write_results
291
292 Writes the results of the C<hmmscan> search. Takes a single argument, which can
293 be an open filehandle or a filename. A fatal error is generated if a file of the
294 given name already exists.
295
296 =cut
297
298 sub write_results {
299 my ( $self, $out, $e_seq, $e_dom, $b_seq, $b_dom ) = @_;
300
301 my $fh;
302
303 if ( ref $out eq 'GLOB' ) {
304
305 # we were handed a filehandle
306 $fh = $out;
307 }
308 elsif ( $out and not ref $out ) {
309
310 # we were handed a filename
311 croak qq(FATAL: output file "$out" already exists) if -f $out;
312
313 open( FH, ">$out" )
314 or croak qq(FATAL: Can\'t write to your output file "$out": $!);
315 $fh = \*FH;
316 }
317 else {
318
319 # neither filehandle nor filename, default to STDOUT
320 $fh = \*STDOUT;
321 }
322
323 if ( $self->{_header} ) {
324 my $header = join '', @{ $self->{_header} };
325 print $fh "$header\n";
326 }
327
328 foreach my $result ( @{ $self->{_all_results} } ) {
329 $self->{_hmmresultIO}
330 ->write_ascii_out( $result, $fh, $self, $e_seq, $e_dom, $b_seq, $b_dom );
331 }
332 close $fh;
333 }
334
335 #-------------------------------------------------------------------------------
336
337 =head2 results
338
339 Returns the search results.
340
341 =cut
342
343 sub results {
344 my ( $self, $e_value ) = @_;
345
346 unless ( defined $self->{_all_results} ) {
347 carp "WARNING: call search() before trying to retrieve results";
348 return;
349 }
350
351 my @search_results = ();
352
353 foreach my $hmm_result ( @{ $self->{_all_results} } ) {
354 push @search_results, @{ $hmm_result->results( $self, $e_value ) };
355 }
356
357 return \@search_results;
358 }
359
360 #-------------------------------------------------------------------------------
361 #- private methods -------------------------------------------------------------
362 #-------------------------------------------------------------------------------
363
364 =head1 PRIVATE METHODS
365
366 =head2 _process_args
367
368 Handles the input arguments.
369
370 =cut
371
372 sub _process_args {
373 my ( $self, @args ) = @_;
374
375 # accept both a hash and a hash ref
376 my $args = ( ref $args[0] eq 'HASH' ) ? shift @args : {@args};
377
378 # make sure we get a sequence
379 if ( $args->{-fasta} and $args->{-sequence} ) {
380 croak qq(FATAL: "-fasta" and "-sequence" are mutually exclusive);
381 }
382 elsif ( $args->{-fasta} ) {
383 croak qq(FATAL: fasta file "$args->{-fasta}" doesn\'t exist)
384 unless -s $args->{-fasta};
385 }
386 elsif ( $args->{-sequence} ) {
387 croak qq(FATAL: no sequence given)
388 unless length( $args->{-sequence} );
389 }
390 else {
391 croak qq(FATAL: must specify either "-fasta" or "-sequence");
392 }
393
394 # check the cut off
395 if ( ( $args->{-e_seq} and ( $args->{-b_seq} || $args->{-b_dom} ) )
396 or ( $args->{-b_seq} and ( $args->{-e_seq} || $args->{-e_dom} ) )
397 or ( $args->{-b_dom} and $args->{-e_dom} ) )
398 {
399 croak qq(FATAL: can\'t use e value and bit score threshold together);
400 }
401
402 $self->{_hmmscan_cutoff} = ();
403 if ( $args->{-e_seq} ) {
404 croak qq(FATAL: the E-value sequence cut-off "$args->{-e_seq}" must be a positive non-zero number)
405 unless $args->{-e_seq} > 0;
406
407 push @{ $self->{_hmmscan_cutoff} }, '-E', $args->{-e_seq};
408 }
409
410 if ( $args->{-e_dom} ) {
411 croak q(FATAL: if you supply "-e_dom" you must also supply "-e_seq")
412 unless $args->{-e_seq};
413
414 croak qq(FATAL: the E-value domain cut-off "$args->{-e_dom}" must be positive non-zero number)
415 unless $args->{-e_dom} > 0;
416
417 push @{ $self->{_hmmscan_cutoff} }, '--domE', $args->{-e_dom};
418 }
419
420 if ( $args->{-b_seq} ) {
421 push @{ $self->{_hmmscan_cutoff} }, '-T', $args->{-b_seq};
422 }
423
424 if ( $args->{-b_dom} ) {
425 croak q(FATAL: if you supply "-b_dom" you must also supply "-b_seq")
426 unless $args->{-b_seq};
427
428 push @{ $self->{_hmmscan_cutoff} }, '--domT', $args->{-b_dom};
429 }
430
431 unless ( $self->{_hmmscan_cutoff} ) {
432 push @{ $self->{_hmmscan_cutoff} }, '--cut_ga';
433 }
434
435 # make sure we have a valid directory for the HMM data files
436 croak qq(FATAL: directory "$args->{-dir}" does not exist)
437 unless -d $args->{-dir};
438
439 # populate the object
440 $self->{_cut_off} = $args->{-cut_off};
441 $self->{_dir} = $args->{-dir};
442 $self->{_clan_overlap} = $args->{-clan_overlap};
443 $self->{_fasta} = $args->{-fasta};
444 $self->{_align} = $args->{-align};
445 $self->{_as} = $args->{-as};
446 $self->{_sequence} = $args->{-sequence};
447 $self->{_cpu} = $args->{-cpu};
448 $self->{_translate} = $args->{-translate};
449
450 $self->{_hmmlib} = [];
451 if ( $args->{-hmmlib} ) {
452 if ( ref $args->{-hmmlib} eq 'ARRAY' ) {
453 push @{ $self->{_hmmlib} }, @{ $args->{-hmmlib} };
454 }
455 else {
456 push @{ $self->{_hmmlib} }, $args->{-hmmlib};
457 }
458 }
459 else {
460 push @{ $self->{_hmmlib} }, "Pfam-A.hmm";
461 }
462
463 # Now check that the library exists in the data dir!
464 foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
465
466 croak qq(FATAL: can't find $hmmlib and/or $hmmlib binaries in "$args->{-dir}")
467 unless (
468 -s $self->{_dir},
469 "/$hmmlib"
470 and -s $self->{_dir} . "/$hmmlib.h3f"
471 and -s $self->{_dir} . "/$hmmlib.h3i"
472 and -s $self->{_dir} . "/$hmmlib.h3m"
473 and -s $self->{_dir} . "/$hmmlib.h3p"
474 and -s $self->{_dir} . "/$hmmlib.dat"
475 );
476
477 # read the necessary data, if it's not been read already
478 $self->_read_pfam_data;
479 }
480
481 $self->{_max_seqname} = 0;
482
483 # if there's nothing in "_sequence" try to load a fasta file
484 $self->_read_fasta
485 unless $self->{_sequence};
486
487 # check again for a sequence. If we don't have one at this point, bail with
488 # an error
489 croak qq(FATAL: no sequence given)
490 unless $self->{_sequence};
491
492 # read fasta file, store maximum sequence name and store sequences for active
493 # sites prediction
494 $self->_parse_sequence
495 unless $self->{_max_seqname};
496
497 if ( $self->{_as} ) {
498 $self->_parse_act_site_data
499 unless $self->{_read_read_act_site_data};
500 }
501
502 if ( $self->{_translate} ) {
503 $self->_translate_fasta;
504 }
505
506 # see if a version number was specified
507 $self->{_version} = $args->{version};
508
509 }
510
511 #-------------------------------------------------------------------------------
512
513 =head2 _build_header
514
515 Adds version to the header object
516
517 =cut
518
519 sub _build_header {
520 my ( $self, $version ) = @_;
521
522 unshift @{ $self->{_header} },
523 '# query sequence file: ' . $self->{_fasta} . "\n";
524
525 unshift @{ $self->{_header} }, <<EOF_license;
526 # Copyright (c) 2009 Genome Research Ltd\n# Freely distributed under the GNU
527 # General Public License
528 #
529 # Authors: Jaina Mistry (jaina\@ebi.ac.uk),
530 # Rob Finn (rdf\@ebi.ac.uk)
531 #
532 # This is free software; you can redistribute it and/or modify it under
533 # the terms of the GNU General Public License as published by the Free Software
534 # Foundation; either version 2 of the License, or (at your option) any later version.
535 # This program is distributed in the hope that it will be useful, but WITHOUT
536 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
537 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
538 # details.
539 #
540 # You should have received a copy of the GNU General Public License along with
541 # this program. If not, see <http://www.gnu.org/licenses/>.
542 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
543 EOF_license
544
545 my $v =
546 ( defined $self->{_version} )
547 ? "version $version, "
548 : '';
549
550 unshift @{ $self->{_header} },
551 "# pfam_scan.pl, $v run at " . scalar(localtime) . "\n#\n";
552 }
553
554 #-------------------------------------------------------------------------------
555
556 =head2 _read_fasta
557
558 Reads a sequence from the fasta-format file that was specified in the
559 parameters.
560
561 =cut
562
563 sub _read_fasta {
564 my $self = shift;
565
566 open( FASTA, $self->{_fasta} )
567 or croak qq(FATAL: Couldn't open fasta file "$self->{_fasta}" $!\n);
568 my @rows = <FASTA>;
569 close FASTA;
570
571 $self->{_sequence_rows} = \@rows;
572
573 $self->{_sequence} = join '', @rows;
574 }
575
576 #-------------------------------------------------------------------------------
577
578 =head2 _resolve_clan_overlap
579
580 Resolves overlaps between clans.
581
582 =cut
583
584 sub _resolve_clan_overlap {
585 my $self = shift;
586
587 my @no_clan_overlap = ();
588 foreach my $result ( @{ $self->{_all_results} } ) {
589 my $new =
590 $result->remove_overlaps_by_clan( $self->{_clanmap}, $self->{_nested} );
591
592 push @no_clan_overlap, $new;
593 }
594
595 $self->{_all_results} = \@no_clan_overlap;
596 }
597
598 #-------------------------------------------------------------------------------
599
600 =head2 _pred_act_sites
601
602 Predicts active sites. Takes no arguments. Populates the "act_site" field on
603 each results object.
604
605 =cut
606
607 sub _pred_act_sites {
608 my $self = shift;
609
610 # print STDERR "predicting active sites...\n";
611
612 my $hmm_file = $self->{_dir} . '/Pfam-A.hmm';
613
614 RESULT: foreach my $result ( @{ $self->{_all_results} } ) {
615
616 # print STDERR "result: |" . $result->seqName . "|\n";
617
618 UNIT: foreach my $unit ( @{ $result->units } ) {
619
620 # print STDERR "family: |" . $unit->name . "|\n";
621
622 next UNIT
623 unless ( $self->{_act_site_data}->{ $unit->name }->{'alignment'} );
624
625 my $seq_region = substr(
626 $self->{_seq_hash}->{ $result->seqName },
627 $unit->seqFrom - 1,
628 $unit->seqTo - $unit->seqFrom + 1
629 );
630
631 my $seq_se = $unit->seqFrom . '-' . $unit->seqTo;
632
633 # print STDERR "seq_id: |" . $result->seqName . "|\n";
634 # print STDERR "seq_se: |" . $seq_se . "|\n";
635 # print STDERR "seq_region: |" . $seq_region . "|\n";
636 # print STDERR "family: |" . $unit->name . "|\n";
637 # print STDERR "hmm_file: |" . $hmm_file . "|\n";
638 # print STDERR "dir: |" . $self->{_dir} . "|\n";
639
640 $unit->{act_site} = Bio::Pfam::Active_site::as_search::find_as(
641 $self->{_act_site_data}->{ $unit->name }->{'alignment'},
642 $self->{_act_site_data}->{ $unit->name }->{'residues'},
643 $result->seqName,
644 $seq_se,
645 $seq_region,
646 $unit->name,
647 $hmm_file
648 );
649 }
650 }
651 }
652
653 #-------------------------------------------------------------------------------
654
655 =head2 _read_pfam_data
656
657 Reads the Pfam data file ("Pfam-A.scan.dat") and populates the C<accmap>,
658 C<nested> and C<clanmap> hashes on the object.
659
660 =cut
661
662 sub _read_pfam_data {
663 my $self = shift;
664
665 #print STDERR "reading " . $self->{_hmmlib} . ".dat\n" if($ENV{DEBUG});
666 $self->{_accmap} = {};
667 $self->{_nested} = {};
668 $self->{_clanmap} = {};
669 $self->{_desc} = {};
670 $self->{_seqGA} = {};
671 $self->{_domGA} = {};
672 $self->{_type} = {};
673 $self->{_model_len} = {};
674
675 foreach my $hmmlib ( @{ $self->{_hmmlib} } ) {
676 my $scandat = $self->{_dir} . '/' . $hmmlib . '.dat';
677 open( SCANDAT, $scandat )
678 or croak qq(FATAL: Couldn't open "$scandat" data file: $!);
679 my $id;
680 while (<SCANDAT>) {
681 if (m/^\#=GF ID\s+(\S+)/) {
682 $id = $1;
683 }
684 elsif (m/^\#=GF\s+AC\s+(\S+)/) {
685 $self->{_accmap}->{$id} = $1;
686 }
687 elsif (m/^\#=GF\s+DE\s+(.+)/) {
688 $self->{_desc}->{$id} = $1;
689 }
690 elsif (m/^\#=GF\s+GA\s+(\S+)\;\s+(\S+)\;/) {
691 $self->{_seqGA}->{$id} = $1;
692 $self->{_domGA}->{$id} = $2;
693 }
694 elsif (m/^\#=GF\s+TP\s+(\S+)/) {
695 $self->{_type}->{$id} = $1;
696 }
697 elsif (m/^\#=GF\s+ML\s+(\d+)/) {
698 $self->{_model_len}->{$id} = $1;
699 }
700 elsif (/^\#=GF\s+NE\s+(\S+)/) {
701 $self->{_nested}->{$id}->{$1} = 1;
702 $self->{_nested}->{$1}->{$id} = 1;
703 }
704 elsif (/^\#=GF\s+CL\s+(\S+)/) {
705 $self->{_clanmap}->{$id} = $1;
706 }
707 }
708
709 close SCANDAT;
710
711 # set a flag to show that we've read the data files already
712 $self->{ '_read_' . $hmmlib } = 1;
713 }
714
715 }
716
717 #-------------------------------------------------------------------------------
718
719 =head2 _read_act_site_data
720
721 Reads the Pfam active site data file ("active_site.dat") and populates
722 the C<act_site_data> hashes on the object.
723
724 =cut
725
726 sub _parse_act_site_data {
727 my $self = shift;
728 my $as_dat = $self->{_dir} . '/active_site.dat';
729
730 $self->{_act_site_data} = {};
731
732 open( AS, $as_dat )
733 or croak qq(FATAL: Couldn\'t open "$as_dat" data file: $!);
734
735 my ( $fam_id, $aln );
736
737 while (<AS>) {
738 if (/^ID\s+(\S+)/) {
739 $fam_id = $1;
740 $aln = new Bio::SimpleAlign;
741 }
742 elsif (/^AL\s+(\S+)\/(\d+)\-(\d+)\s+(\S+)/) {
743 my ( $seq_id, $st, $en, $seq ) = ( $1, $2, $3, $4 );
744
745 $aln->add_seq(
746 Bio::Pfam::Scan::Seq->new(
747 '-seq' => $seq,
748 '-id' => $seq_id,
749 '-start' => $st,
750 '-end' => $en,
751 '-type' => 'aligned'
752 )
753 );
754 }
755 elsif (/^RE\s+(\S+)\s+(\d+)/) {
756 my ( $seq_id, $res ) = ( $1, $2 );
757 push(
758 @{ $self->{_act_site_data}->{$fam_id}->{'residues'}->{$seq_id} },
759 $res
760 );
761
762 }
763 elsif (/^\/\//) {
764
765 $self->{_act_site_data}->{$fam_id}->{'alignment'} = $aln;
766
767 $fam_id = "";
768 $aln = "";
769
770 }
771 else {
772 warn "Ignoring line:\n[$_]";
773 }
774 }
775 close AS;
776 $self->{_read_read_act_site_data} = 1;
777 }
778
779 #-------------------------------------------------------------------------------
780
781 =head2 _parse_sequence
782
783 This method is used to parse the sequence and hash it on sequence
784 identifier. It also stores the length of the longest sequence id
785
786 =cut
787
788 sub _parse_sequence {
789 my $self = shift;
790
791 my $seq_hash = {};
792 my $seq_id;
793 foreach ( @{ $self->{_sequence_rows} } ) {
794
795 next if m/^\s*$/; #Ignore blank lines
796
797 if (m/^>(\S+)/) {
798 $seq_id = $1;
799
800 if ( exists( $seq_hash->{$seq_id} ) ) {
801 croak "FATAL: Sequence identifiers must be unique. Your fasta file contains two sequences with the same id ($seq_id)";
802 }
803
804 #Store the max length of seq name, use this later when printing in ascii
805 $self->{_max_seqname} = length($seq_id)
806 if ( !$self->{_max_seqname}
807 or length($seq_id) > $self->{_max_seqname} );
808 }
809 else {
810 croak "FATAL: Unrecognised format of fasta file. Each sequence must have a header line in the format '>identifier <optional description>'"
811 unless defined $seq_id;
812 chomp;
813 $seq_hash->{$seq_id} .= $_;
814 }
815 }
816
817 $self->{_seq_hash} = $seq_hash;
818 }
819
820 #-------------------------------------------------------------------------------
821
822 =head2 _translate_fasta
823
824 Uses the HMMER v2.3.2 progam "translate" to perform a six-frame translation of
825 the input sequence. Checks the parameter "-translate".
826
827 Accepted arguments are "all" and "orf", where "all" means (from the "translate"
828 help text) "translate in full, with stops; no individual ORFs" and "orf" means
829 "report only ORFs greater than minlen" where minlen is set to the default of
830 20.
831
832 =cut
833
834 sub _translate_fasta {
835 my ($self) = @_;
836 my $translatedFasta = $self->{_fasta} . ".translated";
837
838 my @params = ( 'translate', '-q', );
839 if ( $self->{_translate} eq 'all' ) {
840 push( @params, '-a' );
841 }
842 elsif ( $self->{_translate} eq 'orf' ) {
843 push( @params, '-l', '20' );
844 }
845 else {
846 croak qq(Unexpected parameter '$self->{_translate}');
847 }
848 push( @params, '-o', $translatedFasta, $self->{_fasta} );
849
850 print STDERR "PfamScan::translate_fasta: translate command: |@params|\n"
851 if $ENV{DEBUG};
852
853 my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR
854 or croak qq(FATAL: error running translate; IPC::Run returned '$?');
855
856 close IN;
857 close OUT;
858
859 my $err;
860 while (<ERR>) {
861 $err .= $_;
862 }
863 close ERR;
864
865 finish $run
866 or croak qq|FATAL: error running translate ($err); ipc returned '$?'|;
867 open( F, "<", $translatedFasta )
868 or croak qw(Could not open $translatedFasta '$!');
869 if ( $self->{_translate} eq 'orf' ) {
870 while (<F>) {
871 if (/^>\s?(\S+).*nt (\d+)\.+(\d+)/) {
872 $self->{_orf}->{$1}->{start} = $2;
873 $self->{_orf}->{$1}->{end} = $3;
874 $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? '+' : '-';
875 }
876 }
877 }
878 else {
879 my $currentSeq;
880 my $currentFrame;
881 my $currentLen = 0;
882 my $maxEnd = 0;
883 while (<F>) {
884 chomp;
885 if (/^>\s?(\S+\:)(\d+)/) {
886 if ( $currentLen > 0 ) {
887 my $seqName = $currentSeq . $currentFrame;
888 if ( $currentFrame < 3 ) {
889 my $start = 1 + $currentFrame;
890 my $end = $start + $currentLen - 1;
891 $self->{_orf}->{$seqName}->{strand} = '+';
892 $self->{_orf}->{$seqName}->{start} = $start;
893 $self->{_orf}->{$seqName}->{end} = $end;
894 $maxEnd = $end if ( $end > $maxEnd );
895 }
896 else {
897 my $start = $maxEnd - ( $currentFrame - 3 );
898 my $end = $start - $currentLen + 1;
899 $self->{_orf}->{$seqName}->{strand} = '-';
900 $self->{_orf}->{$seqName}->{start} = $start;
901 $self->{_orf}->{$seqName}->{end} = $end;
902 }
903 }
904 $currentLen = 0;
905 $currentSeq = $1;
906 $currentFrame = $2;
907 }
908 else {
909 $currentLen += length($_) * 3;
910 }
911 }
912 my $seqName = $currentSeq . $currentFrame;
913 if ( $currentFrame < 3 ) {
914 my $start = 1 + $currentFrame;
915 my $end = $start + $currentLen - 1;
916 $self->{_orf}->{$seqName}->{strand} = '+';
917 $self->{_orf}->{$seqName}->{start} = $start;
918 $self->{_orf}->{$seqName}->{end} = $end;
919 $maxEnd = $end if ( $end > $maxEnd );
920 }
921 else {
922 my $start = $maxEnd - ( $currentFrame - 3 );
923 my $end = $start - $currentLen + 1;
924 $self->{_orf}->{$seqName}->{strand} = '-';
925 $self->{_orf}->{$seqName}->{start} = $start;
926 $self->{_orf}->{$seqName}->{end} = $end;
927 }
928 }
929 $self->{_fasta} = $translatedFasta;
930 }
931 #-------------------------------------------------------------------------------
932
933 =head1 COPYRIGHT
934
935 Copyright (c) 2009: Genome Research Ltd.
936
937 Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org)
938
939 This is free software; you can redistribute it and/or
940 modify it under the terms of the GNU General Public License
941 as published by the Free Software Foundation; either version 2
942 of the License, or (at your option) any later version.
943
944 This program is distributed in the hope that it will be useful,
945 but WITHOUT ANY WARRANTY; without even the implied warranty of
946 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
947 GNU General Public License for more details.
948
949 You should have received a copy of the GNU General Public License
950 along with this program; if not, write to the Free Software
951 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
952 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
953
954 =cut
955
956 1;
957