comparison pfamScan/Bio/Pfam/HMM/HMMResultsIO.pm @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:68a3648c7d91
1 # HMMResultsIO.pm
2 #
3 # Author: rdf
4 # Maintainer: $Id: HMMResultsIO.pm,v 1.2 2009-12-01 15:42:20 jt6 Exp $
5 # Version: $Revision: 1.2 $
6 # Created: Nov 16, 2008
7 # Last Modified: $Date: 2009-12-01 15:42:20 $
8
9 =head1 NAME
10
11 Template - a short description of the class
12
13 =cut
14
15 package Bio::Pfam::HMM::HMMResultsIO;
16
17 =head1 DESCRIPTION
18
19 A more detailed description of what this class does and how it does it.
20
21 $Id: HMMResultsIO.pm,v 1.2 2009-12-01 15:42:20 jt6 Exp $
22
23 =head1 COPYRIGHT
24
25 File: HMMResultsIO.pm
26
27 Copyright (c) 2007: Genome Research Ltd.
28
29 Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk)
30
31 This is free software; you can redistribute it and/or
32 modify it under the terms of the GNU General Public License
33 as published by the Free Software Foundation; either version 2
34 of the License, or (at your option) any later version.
35
36 This program is distributed in the hope that it will be useful,
37 but WITHOUT ANY WARRANTY; without even the implied warranty of
38 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39 GNU General Public License for more details.
40
41 You should have received a copy of the GNU General Public License
42 along with this program; if not, write to the Free Software
43 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
44 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
45
46 =cut
47
48 use strict;
49 use warnings;
50 use Moose;
51 use Carp;
52
53 #All the things we need to objectfy the search results
54 use Bio::Pfam::HMM::HMMResults;
55 use Bio::Pfam::HMM::HMMSequence;
56 use Bio::Pfam::HMM::HMMUnit;
57
58 #-------------------------------------------------------------------------------
59
60 =head1 ATTRIBUTES
61
62
63
64 =cut
65
66 has 'align' => (
67 isa => 'Int',
68 is => 'rw',
69 default => 0
70 );
71
72 has 'outfile' => (
73 isa => 'Str',
74 is => 'rw',
75 default => 'OUTPUT'
76 );
77
78 has 'pfamout' => (
79 isa => 'Str',
80 is => 'rw',
81 default => 'PFAMOUT'
82 );
83
84 has 'scores' => (
85 isa => 'Str',
86 is => 'rw',
87 default => 'scores'
88 );
89
90 #-------------------------------------------------------------------------------
91
92 =head1 METHODS
93
94 =head2 parseHMMER3
95
96 Title : parseHMMER
97 Usage : $hmmResIO->parseHMMSearch( filename )
98 Function : Parse the output from a HMMER3 search results
99 Args : Filename containing the search
100 Returns : A Bio::Pfam::HMM::HMMResults object
101
102 =cut
103
104 sub parseHMMER3 {
105 my ( $self, $filename ) = @_;
106 my $fh;
107
108 if(ref($filename) eq 'GLOB'){
109 $fh = $filename;
110 }else{
111 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n";
112 }
113
114 # open( $fh, $filename ) or confess "Could not open $filename:[$!]\n";
115 my $hmmRes = Bio::Pfam::HMM::HMMResults->new;
116 $self->_readHeader( $fh, $hmmRes );
117 $self->_readSeqHits( $fh, $hmmRes );
118 $self->_readUnitHits( $fh, $hmmRes );
119 $self->_readFooter($fh, $hmmRes);
120 return ($hmmRes);
121 }
122
123
124
125 sub parseMultiHMMER3 {
126 my ( $self, $filename ) = @_;
127 my $fh;
128
129 if(ref($filename) eq 'GLOB'){
130 $fh = $filename;
131 }elsif( ref($filename) and $filename->isa('IO::File') ) {
132 $fh = $filename;
133 }else{
134 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n";
135 }
136
137 my @hmmResAll;
138 my $program;
139 while(!eof($fh)){
140 my $hmmRes = Bio::Pfam::HMM::HMMResults->new;
141 my $eof = $self->_readHeader( $fh, $hmmRes );
142 last if($eof);
143 push(@hmmResAll, $hmmRes);
144 if($hmmRes->program) {
145 $program = $hmmRes->program;
146 }
147 else {
148 $hmmRes->program($program);
149 }
150 $self->_readSeqHits( $fh, $hmmRes );
151 $self->_readUnitHits( $fh, $hmmRes );
152 $self->_readFooter($fh, $hmmRes);
153 }
154 return (\@hmmResAll);
155 }
156
157 sub parseSplitHMMER3 {
158 my($self, $files ) = @_;
159
160 my $hmmRes = Bio::Pfam::HMM::HMMResults->new;
161
162 foreach my $filename (@{$files}){
163 my ($fh);
164 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n";
165 $self->_readHeader( $fh, $hmmRes );
166 $self->_readSeqHits( $fh, $hmmRes );
167 $self->_readUnitHits( $fh, $hmmRes );
168 $self->_readFooter($fh, $hmmRes);
169 }
170
171 return ( $hmmRes );
172
173 }
174
175
176 #-------------------------------------------------------------------------------
177
178 =head2 convertHMMSearch
179
180 Title : convertHMMSearch
181 Usage : $hmmResIO->convertHMMSearch('SEARCHFILE')
182 Function : This wraps up a couple of methods to convert the more complex hmmsearch
183 : results in to nice clean format that we Pfam-ers are used to.
184 Args : The filename of the hmmsearch output file
185 Returns : Nothing
186
187 =cut
188
189 sub convertHMMSearch {
190 my ( $self, $filename ) = @_;
191
192 unless ($filename) {
193 confess "No filename passed in to convertHMMSearch\n";
194 }
195 unless ( -s $filename ) {
196 confess "$filename does not exists\n";
197 }
198
199 #Now parse in the raw HMM output and write out the results as a PFAMOUT
200 my $hmmRes = $self->parseHMMER3($filename);
201 $self->writePFAMOUT($hmmRes);
202 return $hmmRes;
203 }
204
205 #-------------------------------------------------------------------------------
206
207 =head2 writePFAMOUT
208
209 Title : writePFAMOUT
210 Usage : $hmmResIO->writePFAMOUT( $hmmRes )
211 Function : Writes a Bio::Pfam::HMM:HMMResults object in to a PFAMOUT file.
212 Args : A Bio::Pfam::HMM:HMMResults
213 Returns : Nothing
214
215 =cut
216
217 sub writePFAMOUT {
218 my ( $self, $hmmRes ) = @_;
219
220 unless ($hmmRes) {
221 confess "A Bio::Pfam::HMM::HMMResults object was not parsed in\n";
222 }
223 unless ( $hmmRes->isa("Bio::Pfam::HMM::HMMResults") ) {
224 confess("Variable passed in is not a Bio::Pfam::HMM::Results object");
225 }
226
227 my $fh;
228 open( $fh, ">" . $self->pfamout )
229 or confess "Could not open " . $self->pfamout . ":[$!]\n";
230
231 print $fh <<HEAD;
232 # ===========
233 # Pfam output
234 # ===========
235 #
236 # Sequence scores
237 # ---------------
238 #
239 # name description bits evalue n exp bias
240
241 HEAD
242
243 foreach
244 my $seq ( sort { $b->bits <=> $a->bits } ( @{ $hmmRes->eachHMMSeq } ) )
245 {
246 $_ = $seq->desc;
247 my ($desc) = /^(.{1,42})/;
248 $desc = uc($desc);
249 printf $fh (
250 "%-15s %-42s %8.1f %9s %3d %5.1f %5.1f\n",
251 $seq->name,
252 $desc,
253 $seq->bits,
254 $seq->evalue,
255 scalar( @{ $seq->hmmUnits } ),
256 defined( $seq->exp ) ? $seq->exp : "-",
257 defined( $seq->bias ) ? $seq->bias : "-"
258 );
259 }
260
261 print $fh <<HEAD;
262 #
263 # Domain scores
264 # -------------
265 #
266 # name env-st env-en ali-st ali-en hmm-st hmm-en bits evalue hit bias
267 #
268
269 HEAD
270
271 foreach my $dom ( sort { $b->bits <=> $a->bits } @{ $hmmRes->units } ) {
272
273
274 printf $fh (
275 "%-15s %6d %6d %6d %6d %6s %6s %6.1f %9s %6d %6.1f\n",
276 $dom->name,
277 $dom->envFrom,
278 $dom->envTo,
279 $dom->seqFrom,
280 $dom->seqTo,
281 $dom->hmmFrom,
282 $dom->hmmTo,
283 $dom->bits,
284 $dom->evalue,
285 $dom->domain,
286 defined( $dom->bias ) ? $dom->bias : "-",
287
288 );
289 }
290 }
291
292 #-------------------------------------------------------------------------------
293
294 =head2 parsePFAMOUT
295
296 Title : parsePFAMOUT
297 Usage : $self->parsePFAMOUT($filename)
298 Function : Reads in a PFAMOUT file. This file contains the minimal amount of information
299 : require to constrcut a pfam ALIGN file.
300 Args : A filename. Normally this is filename
301 Returns : A Bio::Pfam::HMM::HMMResults object
302
303 =cut
304
305 sub parsePFAMOUT {
306 my $self = shift;
307 my $filename = shift;
308
309 unless ($filename) {
310 confess('No filename or filehandle passed to parsePFAMOUT');
311 }
312
313 my $fh;
314 if ( ref($filename) eq 'GLOB' ) {
315 $fh = $filename;
316 }
317 else {
318 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n";
319 }
320 my $hmmRes = Bio::Pfam::HMM::HMMResults->new;
321
322 while (<$fh>) {
323 /^# Domain scores/ && last;
324
325 #if (/^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\d+)\s*$/) {
326 if (/^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\d+)\s+\S+\s+(\S+)\s*$/) {
327
328 $hmmRes->addHMMSeq(
329 Bio::Pfam::HMM::HMMSequence->new(
330 {
331 name => $1,
332 desc => $2,
333 bits => $3,
334 evalue => $4,
335 numberHits => $5,
336 bias => $6
337 }
338 )
339 );
340 }
341 elsif (/^#|^\s+$/) {
342 next;
343 }
344 else {
345 warn "Did not parse|$_|\n";
346 }
347 }
348 while (<$fh>) {
349
350 #if (/^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s*$/) {
351 if (
352 /^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)/)
353 {
354 $hmmRes->addHMMUnit(
355 Bio::Pfam::HMM::HMMUnit->new(
356 {
357 name => $1,
358 envFrom => $2,
359 envTo => $3,
360 seqFrom => $4,
361 seqTo => $5,
362 hmmFrom => $6,
363 hmmTo => $7,
364 bits => $8,
365 evalue => $9,
366 bias => $10
367 }
368 )
369 );
370 }
371 elsif (/^#|^\s+$/) {
372 next;
373 }
374 elsif (/^$/) {
375 next;
376 }
377 else {
378 warn "Did not parse: |$_|";
379 }
380 }
381 close($fh);
382 return ($hmmRes);
383 }
384
385 #-------------------------------------------------------------------------------
386
387 =head2 _readHeader
388
389 Title : _readHeader
390 Usage : Private method. $self->_readHeader(\*FH, $hmmResults)
391 Function : Reads the header section from a HMMER3 hmmsearch
392 Args : The file handle to hmmsearch output, a Bio::Pfam::HMM::HMMResults object
393 Returns : Nothing
394
395 =cut
396
397 #Parse the header part of the output first;
398 sub _readHeader {
399 my ( $self, $hs, $hmmRes ) = @_;
400
401 #Check the $hs is defined and a GLOB
402
403 while (<$hs>) {
404 if (/^Scores for complete/) {
405 last;
406 }
407 elsif (/^# query HMM file:\s+(\S+)/) {
408 $hmmRes->hmmName($1);
409 }
410 elsif (/^# target sequence database:\s+(\S+)/) {
411 $hmmRes->seqDB($1);
412 }
413 elsif (/^output directed to file:\s+(\S+)/) {
414 $hmmRes->thisFile($1);
415 }
416 elsif (/^Query:\s+(\S+)\s+\[M\=(\d+)\]/) {
417 $hmmRes->seedName($1);
418 $hmmRes->hmmLength($2);
419 }elsif(/^Query:\s+(\S+)\s+\[L\=(\d+)\]/) {
420 $hmmRes->seqName($1);
421 $hmmRes->seqLength($2);
422 }elsif (/^sequence E-value threshold: <= (\d+)/) {
423 $hmmRes->evalueThr($1);
424 }
425 elsif (/^# Random generator seed: (\d+)/) {
426 $hmmRes->randSeedNum($1);
427 }elsif(/^Description:\s+(.*)/){
428 $hmmRes->description($1);
429 }elsif(/^# (phmmer|hmmsearch|hmmscan|jackhmmer)/){
430 $hmmRes->program($1);
431 }elsif (/(^#)|(^$)/) {
432 next;
433 }elsif(/^Accession/){
434 next;
435 } elsif(/^\[ok\]/) {
436 return(1);
437 } else {
438 die "Failed to parse hmmsearch results |$_| in header section\n";
439 }
440 }
441 }
442
443 #-------------------------------------------------------------------------------
444
445 =head2 _readSeqHits
446
447 Title : _readSeqHits
448 Usage : Private method. $self->_readSeqHits(\*FH, $hmmResults)
449 Function : Reads the sequence hits from a HMMER3 hmmsearch
450 Args : The file handle to hmmsearch output, a Bio::Pfam::HMM::HMMResults object
451 Returns : Nothing
452
453 =cut
454
455 sub _readSeqHits {
456 my ( $self, $hs, $hmmRes ) = @_;
457 while (<$hs>) {
458
459 #Match a line like this
460 # E-value score bias E-value score bias exp N Sequence Description
461 # ------- ------ ----- ------- ------ ----- ---- -- -------- -----------
462 # 4e-83 285.8 10.0 5.3e-83 285.5 7.0 1.1 1 Q14SN3.1 Q14SN3_9HEPC Polyprotein (Fragment).
463 if (/^Domain annotation for each [sequence|model]/) { # This is the format for HMMER3b3
464 last;
465 }
466 elsif (/^Domain and alignment annotation for each [sequence|model]/) { #This is the format for HMMER3b2 - can be removed later
467 last;
468 }
469 elsif (/^\s+(E-value|---)/) {
470 next;
471 }
472 elsif (/^$/) {
473 next;
474 }
475 else {
476 next if(/No hits detected that satisfy reporting thresholds/);
477
478 #Assume that we have a sequence match
479 my @sMatch = split( /\s+/, $_ );
480 unless ( scalar(@sMatch) >= 10 ) {
481 die "Expected at least 10 pieces of data: $_;\n";
482 }
483 my $desc;
484 if ( scalar(@sMatch) >= 11 ) {
485 $desc = join( " ", @sMatch[ 10 .. $#sMatch ] );
486 }
487
488 $hmmRes->addHMMSeq(
489 Bio::Pfam::HMM::HMMSequence->new(
490 {
491 evalue => $sMatch[1],
492 bits => $sMatch[2],
493 bias => $sMatch[3],
494 exp => $sMatch[7],
495 numberHits => $sMatch[8],
496 name => $sMatch[9],
497 desc => defined($desc) ? $desc : "-",
498 }
499 )
500 );
501
502 next;
503 }
504 die "Failed to parse $_ in sequence section\n";
505 }
506
507 }
508
509 #------------------------------------------------------------------------------
510
511 =head2 _readUnitHits
512
513 Title : _readUnitHits
514 Usage : Private method. $self->_readUnitHits(\*FH, $hmmResults)
515 Function : Reads the unit (domain) hits from a HMMER3 hmmsearch
516 Args : The file handle to hmmsearch output, a Bio::Pfam::HMM::HMMResults object
517 Returns : Nothing
518
519 =cut
520
521 no warnings 'recursion';
522
523 sub _readUnitHits {
524 my ( $self, $hs, $hmmRes ) = @_;
525
526 if($hmmRes->eof){
527 return;
528 }
529
530 #Parse the domain hits section
531 #>> P37935.1 MAAY4_SCHCO Mating-type protein A-alpha Y4.
532 # # bit score bias E-value ind Evalue hmm from hmm to ali from ali to env from env to ali-acc
533 # --- --------- ------- ---------- ---------- -------- -------- -------- -------- -------- -------- -------
534 # 1 244.0 0.5 9.5e-76 1.7e-70 1 146 [. 1 145 [. 1 146 [. 0.99
535 #
536 # Alignments for each domain:
537 # == domain 1 score: 244.0 bits; conditional E-value: 9.5e-76
538 # SEED 1 medrlallkaisasakdlvalaasrGaksipspvkttavkfdplptPdldalrtrlkeaklPakaiksalsayekaCarWrsdleeafdktaksvsPanlhllealrirlyteqvekWlvqvlevaerWkaemekqrahiaatmgp 146
539 # m+++la+l++isa+akd++ala+srGa+++ +p++tt+++fd+l++P+ld++rtrl+ea+lP+kaik++lsaye+aCarW++dleeafd+ta+s+sP+n+++l++lr+rly+eqv+kWl++vl+v+erWkaemekqrahi+atmgp
540 # P37935.1 1 MAELLACLQSISAHAKDMMALARSRGATGS-RPTPTTLPHFDELLPPNLDFVRTRLQEARLPPKAIKGTLSAYESACARWKHDLEEAFDRTAHSISPHNFQRLAQLRTRLYVEQVQKWLYEVLQVPERWKAEMEKQRAHINATMGP 145
541 # 899***************************.******************************************************************************************************************8 PP
542
543 while (<$hs>) {
544 if (/^Internal/) {
545 last;
546 }
547 elsif (/\>\>\s+(\S+)/) {
548 my $seqId = $1;
549 $self->_readUnitData( $seqId, $hs, $hmmRes );
550 if($hmmRes->eof){
551 return;
552 }
553 }
554 }
555 }
556
557 sub _readUnitData {
558 my ( $self, $id, $hs, $hmmRes ) = @_;
559
560 if($hmmRes->eof){
561 return;
562 }
563 my $hmmName = $hmmRes->seedName();
564
565 my $seqName = $hmmRes->seqName;
566
567 # bit score bias E-value ind Evalue hmm from hmm to ali from ali to env from env to ali-acc
568 # --- --------- ------- ---------- ---------- -------- -------- -------- -------- -------- -------- -------
569 # 1 244.0 0.5 9.5e-76 1.7e-70 1 146 [. 1 145 [. 1 146 [. 0.99
570 #
571 # Alignments for each domain:
572
573 my @units;
574 my $align = 1;
575 my $recurse = 0;
576 my $eof = 0;
577 my ($nextSeqId);
578 while (<$hs>) {
579 if (/^[(\/\/|Internal)]/ ) {
580 $align = 0;
581 $recurse = 0;
582 $eof = 1;
583 last;
584 }
585 elsif (/^\>\>\s+(\S+)/) {
586 $nextSeqId = $1;
587 $align = 0;
588 $recurse = 1;
589 last;
590 }
591 elsif (/^\s+Alignments for each domain:/) {
592 $align = 1;
593 $recurse = 0;
594 last;
595 }
596 elsif (/^\s+(#\s+score|---)/){
597
598 #Two human readable lines
599 next;
600 }
601 elsif (/^$/) {
602
603 #blank line
604 next;
605 }
606 elsif (/^\s+\d+\s+/) {
607 my @dMatch = split( /\s+/, $_ );
608 unless ( scalar(@dMatch) == 17 ) {
609 die "Expected 16 elements of data: $_\n";
610 }
611
612 push(
613 @units,
614 Bio::Pfam::HMM::HMMUnit->new(
615 {
616 name => $id,
617 domain => $dMatch[1],
618 bits => $dMatch[3],
619 bias => $dMatch[4],
620 domEvalue => $dMatch[5],
621 evalue => $dMatch[6],
622 hmmFrom => $dMatch[7],
623 hmmTo => $dMatch[8],
624 seqFrom => $dMatch[10],
625 seqTo => $dMatch[11],
626 envFrom => $dMatch[13],
627 envTo => $dMatch[14],
628 aliAcc => $dMatch[16]
629 }
630 )
631 );
632
633 next;
634 }
635 elsif(/^\s+\[No individual domains/) {
636 $align=0;
637 next;
638 }
639 else {
640 confess("Did not parse line: $_");
641 }
642 }
643
644 # == domain 1 score: 244.0 bits; conditional E-value: 9.5e-76
645 # SEED 1 medrlallkaisasakdlvalaasrGaksipspvkttavkfdplptPdldalrtrlkeaklPakaiksalsayekaCarWrsdleeafdktaksvsPanlhllealrirlyteqvekWlvqvlevaerWkaemekqrahiaatmgp 146
646 # m+++la+l++isa+akd++ala+srGa+++ +p++tt+++fd+l++P+ld++rtrl+ea+lP+kaik++lsaye+aCarW++dleeafd+ta+s+sP+n+++l++lr+rly+eqv+kWl++vl+v+erWkaemekqrahi+atmgp
647 # P37935.1 1 MAELLACLQSISAHAKDMMALARSRGATGS-RPTPTTLPHFDELLPPNLDFVRTRLQEARLPPKAIKGTLSAYESACARWKHDLEEAFDRTAHSISPHNFQRLAQLRTRLYVEQVQKWLYEVLQVPERWKAEMEKQRAHINATMGP 145
648 # 899***************************.******************************************************************************************************************8 PP
649 #
650 # OR....
651 #
652 # == domain 1 score: 27.6 bits; conditional E-value: 7.4e-10
653 # PF00018 17 LsfkkGdvitvleksee.eWwkaelkdg.keGlvPsnYvep 55
654 # L++++Gd+++++++++e++Ww++++++++++G++P+n+v+p
655 # P15498.4 617 LRLNPGDIVELTKAEAEqNWWEGRNTSTnEIGWFPCNRVKP 657
656 # 7899**********9999*******************9987 PP
657
658
659 if ($align) {
660 my ($pattern1, $pattern2);
661
662 if($hmmName and $hmmRes->program eq 'hmmsearch'){
663 $pattern1 = qr/^\s+$hmmName\s+\d+\s+(\S+)\s+\d+/;
664 $id =~ s/(\W)/\\$1/g; # escape any non-word character
665 # $id =~ s/\|/\\|/g; #Escape '|', '[' and ']' characters
666 # $id =~ s/\[/\\[/g;
667 # $id =~ s/\]/\\]/g;
668 $pattern2 = qr/^\s+$id\s+\d+\s+(\S+)\s+\d+/;
669 }elsif($seqName and $hmmRes->program eq 'hmmscan'){
670 my $tmpSeqName = $seqName;
671 $tmpSeqName =~ s/(\W)/\\$1/g; # escape any non-word character
672 # $tmpSeqName =~ s/\|/\\|/g; #Escape '|', '[' and ']' characters
673 # $tmpSeqName =~ s/\[/\\[/g;
674 # $tmpSeqName =~ s/\]/\\]/g;
675 $pattern1 = qr/^\s+$id\s+\d+\s+(\S+)\s+\d+/;
676 $pattern2 = qr/^\s+$tmpSeqName\s+\d+\s+(\S+)\s+\d+/;
677 }elsif($seqName and ($hmmRes->program eq 'phmmer' or $hmmRes->program eq 'jackhmmer') ){
678 $seqName =~ s/(\W)/\\$1/g; # escape any non-word character
679 # $seqName =~ s/\|/\|/g; #Escape '|', '[' and ']' characters
680 # $seqName =~ s/\[/\\[/g;
681 # $seqName =~ s/\]/\\]/g;
682 $pattern1 = qr/^\s+$seqName\s+\d+\s+(\S+)\s+\d+/;
683 $pattern2 = qr/^\s+$id\s+\d+\s+(\S+)\s+\d+/;
684 }
685
686
687 $recurse = 0;
688 my $matchNo;
689 my $hmmlen = 0;
690 while (<$hs>) {
691 if (/$pattern1/) {
692 $units[ $matchNo - 1 ]->hmmalign->{hmm} .= $1;
693 $hmmlen = length($1);
694 }
695 elsif (/$pattern2/) {
696 $units[ $matchNo - 1 ]->hmmalign->{seq} .= $1;
697 }
698 elsif (/^\s+([x\.]+)\s+RF$/) {
699 my $rf = $1;
700 $units[ $matchNo - 1 ]->hmmalign->{rf} .= $rf;
701 }
702 elsif (/^\s+([0-9\*\.]+)\s+PP$/) {
703 my $pp = $1;
704 $units[ $matchNo - 1 ]->hmmalign->{pp} .= $pp;
705 }elsif (/^\s+(\S+)\s+CS$/) {
706 my $cs = $1;
707 $units[ $matchNo - 1 ]->hmmalign->{cs} .= $cs;
708 }elsif (/^\s+==\s+domain\s+(\d+)/) {
709 $matchNo = $1;
710 }
711 elsif (/^\s+(.*)\s+$/) {
712 # $1 is *not* the match - this fails if there are prepended
713 # or appended spaces
714 # $units[ $matchNo - 1 ]->hmmalign->{match} .= $1;
715 # Let's get a right substring based on the HMM length
716 chomp;
717 my $m1 = substr($_,-$hmmlen);
718 $units[ $matchNo - 1 ]->hmmalign->{match} .= $m1;
719 }elsif (/^$/) {
720 next;
721 }
722 elsif (/^[(\/\/|Internal)]/) {
723 $align = 0;
724 $recurse = 0;
725 $eof = 1;
726 last;
727 }
728 elsif (/^\>\>\s+(\S+)/) {
729 $nextSeqId = $1;
730 $recurse = 1;
731 last;
732 }
733
734 else {
735 confess("Did not parse |$_| in units");
736 }
737 }
738 }
739
740 foreach my $u (@units) {
741 $hmmRes->addHMMUnit($u);
742 }
743
744 $hmmRes->eof($eof);
745
746 if ($recurse and $nextSeqId) {
747 $self->_readUnitData( $nextSeqId, $hs, $hmmRes );
748 }
749 return;
750 }
751 use warnings 'recursion';
752
753 #-------------------------------------------------------------------------------
754
755 =head2 parseHMMER2
756
757 Title : parseHMMER2
758 Usage : $self->parseHMMER2(\*FH )
759 Function : This is a minimal parser for reading in the output of HMMER2 hmmsearch
760 Args : The file handle to hmmsearch output
761 Returns : A Bio::Pfam::HMM::HMMResults object
762
763 =cut
764
765 sub parseHMMER2 {
766 my $self = shift;
767 my $file = shift;
768
769 my $hmmRes = Bio::Pfam::HMM::HMMResults->new;
770
771 my %seqh;
772 my $count = 0;
773
774 while (<$file>) {
775 /^Scores for complete sequences/ && last;
776 }
777
778 while (<$file>) {
779 /^Parsed for domains/ && last;
780 if ( my ( $id, $de, $sc, $ev, $hits ) =
781 /^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\d+)\s*$/ )
782 {
783 $hmmRes->addHMMSeq(
784 Bio::Pfam::HMM::HMMSequence->new(
785 {
786 bits => $sc,
787 evalue => $ev,
788 name => $id,
789 desc => $de,
790 numberHits => $hits
791 }
792 )
793 );
794
795 $seqh{$id} = $sc;
796 }
797 }
798
799 while (<$file>) {
800 /^Histogram of all scores/ && last;
801 if ( my ( $id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev ) =
802 /^(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/ )
803 {
804 $hmmRes->addHMMUnit(
805 Bio::Pfam::HMM::HMMUnit->new(
806 {
807 name => $id,
808 seqFrom => $sqfrom,
809 seqTo => $sqto,
810 hmmFrom => $hmmf,
811 hmmTo => $hmmt,
812 bits => $sc,
813 evalue => $ev
814 }
815 )
816 );
817
818 }
819 }
820
821 return $hmmRes;
822 }
823
824 #-------------------------------------------------------------------------------
825
826 =head2 parseHMMER1
827
828 Title : parseHMMER1
829 Usage : $self->parseHMMER1(\*FH )
830 Function : This is a minimal parser for reading in the output of HMMER1 hmmsearch.
831 : There are a few hacks to get round some of them requirements
832 Args : The file handle to hmmsearch output
833 Returns : A Bio::Pfam::HMM::HMMResults object
834
835 =cut
836
837 sub parseHMMER1 {
838 my $self = shift;
839 my $file = shift;
840
841 my $hmmRes = Bio::Pfam::HMM::HMMResults->new;
842
843 my %seqh;
844 my $count = 0;
845
846 while (<$file>) {
847 if ( my ( $bits, $s, $e, $id, $de ) =
848 /^(-?\d+\.?\d*)\s+\(bits\)\s+f:\s+(\d+)\s+t:\s+(\d+)\s+Target:\s+(\S+)\s+(.*)/
849 )
850 {
851 if ( $id =~ /(\S+)\/(\d+)-(\d+)/ ) {
852 $id = $1;
853 $s = $2 + $s - 1;
854 $e = $2 + $e - 1;
855 }
856
857 if ( !$hmmRes->seqs->{$id} ) {
858 $hmmRes->addHMMSeq(
859 Bio::Pfam::HMM::HMMSequence->new(
860 {
861 bits => $bits,
862 evalue => 1,
863 name => $id,
864 desc => $de,
865 numberHits => 1
866 }
867 )
868 );
869 }
870 $hmmRes->addHMMUnit(
871 Bio::Pfam::HMM::HMMUnit->new(
872 {
873 name => $id,
874 seqFrom => $s,
875 seqTo => $e,
876 hmmFrom => "1",
877 hmmTo => "1",
878 bits => $bits,
879 evalue => "1"
880 }
881 )
882 );
883 if ( $bits > $hmmRes->seqs->{$id}->bits ) {
884 $hmmRes->seqs->{$id}->bits($bits);
885 }
886 }
887 }
888 return $hmmRes;
889 }
890
891 #-------------------------------------------------------------------------------
892
893 =head2 writeScoresFile
894
895 Title : writeScoresFile
896 Usage : $hmmResIO->writeScoresFile( $hmmRes)
897 Function : Writes a scores file for a Bio::Pfam::HMM::HMMResults object.
898 Args : Bio::Pfam::HMM::HMMResults
899 Returns : Nothing
900
901 =cut
902
903 sub writeScoresFile {
904 my ( $self, $hmmRes ) = @_;
905
906 unless ($hmmRes) {
907 confess "A Bio::Pfam::HMM::HMMResults object was not parsed in\n";
908 }
909 unless ( $hmmRes->isa("Bio::Pfam::HMM::HMMResults") ) {
910 confess("Variable passed in is not a Bio::Pfam::HMM::Results object");
911 }
912
913 my $fh;
914 open( $fh, ">" . $self->scores )
915 or confess "Could not open " . $self->scores . ":[$!]\n";
916
917 my ( $lowSeq, $lowDom, $highSeq, $highDom );
918 $lowSeq = $lowDom = 999999.99;
919 $highSeq = $highDom = -999999.99;
920 unless ( defined $hmmRes->domThr and defined $hmmRes->seqThr ) {
921 warn "No threshold set, setting to 25.0 bits\n";
922 $hmmRes->domThr("25.0");
923 $hmmRes->seqThr("25.0");
924 }
925
926 my @sigUnits;
927
928 foreach my $seqId ( keys %{ $hmmRes->seqs } ) {
929
930 #Does this sequence score above or equal to the sequence threshold?
931 if ( $hmmRes->seqs->{$seqId}->bits >= $hmmRes->seqThr ) {
932
933 #Is this the lowest sequence thresh
934 if ( $hmmRes->seqs->{$seqId}->bits < $lowSeq ) {
935 $lowSeq = $hmmRes->seqs->{$seqId}->bits;
936 }
937
938 #For each of the regions found on the sequence, look to see if the match is great
939 #than the domain threshold. If it is, is it lower than we we have seen previously
940 foreach my $unit ( @{ $hmmRes->seqs->{$seqId}->hmmUnits } ) {
941 if ( $unit->bits >= $hmmRes->domThr ) {
942 push( @sigUnits, $unit );
943 if ( $unit->bits < $lowDom ) {
944 $lowDom = $unit->bits();
945 }
946 }
947 elsif ( $unit->bits > $highDom ) {
948 $highDom = $unit->bits;
949 }
950 }
951 }
952 else {
953
954 #Is this the highest sequence thres below the cut-off
955 if ( $hmmRes->seqs->{$seqId}->bits > $highSeq ) {
956 $highSeq = $hmmRes->seqs->{$seqId}->bits;
957 }
958
959 #For each of the regions found on the sequence, look to see if the match is great
960 #than the domain threshold. If it is, is it lower than we we have seen previously
961 foreach my $unit ( @{ $hmmRes->seqs->{$seqId}->hmmUnits } ) {
962 if ( $unit->bits < $hmmRes->domThr && $unit->bits > $highDom ) {
963 $highDom = $unit->bits;
964 }
965 }
966 }
967 }
968
969 $hmmRes->domTC($lowDom);
970 $hmmRes->seqTC($lowSeq);
971 $hmmRes->domNC($highDom);
972 $hmmRes->seqNC($highSeq);
973
974 #Print the domains to the scores file
975 foreach my $u ( sort { $b->bits <=> $a->bits } @sigUnits ) {
976 print $fh
977 sprintf( "%.1f %s/%s-%s %s-%s %s\n", $u->bits, $u->name, $u->envFrom, $u->envTo, $u->seqFrom, $u->seqTo, $u->evalue );
978 }
979 close($fh);
980
981 }
982
983 #-------------------------------------------------------------------------------
984
985 #TODO - write _readAlign
986
987 =head2 _readAlign
988
989 Title :
990 Usage :
991 Function :
992 Args :
993 Returns :
994
995 =cut
996
997 sub _readAlign {
998 my ( $self, $fh, $hmmRes ) = @_;
999
1000 }
1001
1002 #Parse the alignment section
1003 #if($pp){
1004
1005 #}else{
1006 # while(<HS>){
1007 # last if(/^\/\//)
1008 # }
1009 #}
1010
1011
1012
1013 sub _readFooter {
1014 my($self, $fh, $hmmRes ) = @_;
1015
1016 # We are going to parse something like this!
1017
1018 # Internal pipeline statistics summary:
1019 #-------------------------------------
1020 #Query sequence(s): 1 (360 residues)
1021 #Target model(s): 7 (836 nodes)
1022 #Passed MSV filter: 2 (0.285714); expected 0.1 (0.02)
1023 #Passed Vit filter: 1 (0.142857); expected 0.0 (0.001)
1024 #Passed Fwd filter: 1 (0.142857); expected 0.0 (1e-05)
1025 #Initial search space (Z): 7 [actual number of targets]
1026 #Domain search space (domZ): 1 [number of targets reported over threshold]
1027 ## CPU time: 0.00u 0.00s 00:00:00.00 Elapsed: 00:00:00
1028 ## Mc/sec: inf
1029 #//
1030
1031 while(<$fh>){
1032 if(/\/\//){
1033 last;
1034 }
1035 }
1036 }
1037
1038
1039 #Parse the internal summary section
1040 #Internal statistics summary:
1041 #----------------------------
1042 #Query HMM(s): 1 (0 nodes)
1043 #Target sequences: 5323441 (0 residues)
1044 #Passed MSV filter: 116519 (-37389918065567040729448769671768824784852036328367855636063687997915136.000; expected 19991592792512146725679052970637918208.000)
1045 #Passed Vit filter: 7579 (-0.0000; expected -35982214160587876085407389642471051723332987952235753317595472501307733302049608744822636544.0000)
1046 #Passed Fwd filter: 1687 (8.3e+165; expected -7.5e-266)
1047 #Mc/sec: 828.85
1048 # CPU time: 115.36u 4.45s 00:01:59.81 Elapsed: 00:03:01
1049
1050 #sub writeHMMSearch {
1051 # my ( $self, $hmmRes ) = @_;
1052 # my $fh;
1053 # open($fh, ">".$self->outfile."\n");
1054 #
1055 # $self->_writeHeader($fh, $hmmRes);
1056 # $self->_writeSeqHits( $fh, $hmmRes);
1057 # $self->_writeDomHits( $fh, $hmmRes);
1058 # $self->_writeAlign( $fh, $hmmRes) if($self->align);
1059 # $self->_writeInternalSummary( $fh, $hmmRes);
1060 #}
1061 #sub mergeHMMSearch {
1062 # my ( $self, $filenames ) = @_;
1063 #}
1064
1065
1066 sub write_ascii_out {
1067
1068 my ($self, $HMMResults, $fh, $scanData, $e_seq, $e_dom, $b_seq, $b_dom) = @_;
1069
1070
1071 $scanData->{_max_seqname} = 20 unless($scanData->{_max_seqname} or $scanData->{_max_seqname} < 1);
1072
1073 my $ga;
1074
1075 if($e_seq or $e_dom) {
1076 $e_seq = $e_dom unless($e_seq);
1077 $e_dom = "10" unless($e_dom);
1078 }
1079 elsif($b_seq or $b_dom) {
1080 $b_seq = $b_dom unless($b_seq);
1081 $b_dom = "0" unless($b_dom);
1082 }
1083 else {
1084 $ga = 1;
1085 }
1086
1087
1088 foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $HMMResults->units } ) {
1089
1090 if($unit->name =~ /Pfam\-B/) {
1091
1092 next unless($HMMResults->seqs->{$unit->name}->evalue <= "0.001" and $unit->evalue <= "0.001");
1093
1094
1095 printf $fh "%-".$scanData->{_max_seqname}."s %6d %6d %6d %6d %-11s %-16s %7s %5d %5d %5d %8s %9s %3s %-8s\n",
1096 $HMMResults->seqName,
1097 $unit->seqFrom,
1098 $unit->seqTo,
1099 $unit->envFrom,
1100 $unit->envTo,
1101 $scanData->{_accmap}->{ $unit->name },
1102 $unit->name,
1103 "Pfam-B",
1104 $unit->hmmFrom,
1105 $unit->hmmTo,
1106 $scanData->{_model_len}->{ $unit->name },
1107 $unit->bits,
1108 $unit->evalue,
1109 "NA",
1110 "NA";
1111
1112
1113 }
1114 else {
1115
1116 #Filter results based on thresholds
1117 if($ga) {
1118 next unless($unit->sig);
1119 }
1120 if($e_seq) {
1121 next unless($HMMResults->seqs->{$unit->name}->evalue <= $e_seq and $unit->evalue <= $e_dom);
1122 }
1123 if($b_seq) {
1124
1125 next unless($HMMResults->seqs->{$unit->name}->bits >= $b_seq and $unit->bits >= $b_dom);
1126 }
1127
1128 my $clan = $scanData->{_clanmap}->{ $unit->name } || "No_clan";
1129
1130
1131 printf $fh "%-".$scanData->{_max_seqname}."s %6d %6d %6d %6d %-11s %-16s %7s %5d %5d %5d %8s %9s %3d %-8s ",
1132 $HMMResults->seqName,
1133 $unit->seqFrom,
1134 $unit->seqTo,
1135 $unit->envFrom,
1136 $unit->envTo,
1137 $scanData->{_accmap}->{ $unit->name },
1138 $unit->name,
1139 $scanData->{_type}->{ $unit->name },
1140 $unit->hmmFrom,
1141 $unit->hmmTo,
1142 $scanData->{_model_len}->{ $unit->name },
1143 $unit->bits,
1144 $unit->evalue,
1145 $unit->sig,
1146 $clan;
1147
1148
1149 if($unit->{'act_site'}) {
1150 local $" = ",";
1151 print $fh "predicted_active_site[@{$unit->{'act_site'}}]";
1152 }
1153
1154 if($scanData->{_translate}){
1155 my $strand = '?';
1156 my $start = '-';
1157 my $end = '-';
1158 if(exists($scanData->{_orf}->{$HMMResults->seqName})){
1159 $strand = $scanData->{_orf}->{$HMMResults->seqName}->{strand};
1160 if($strand eq '+'){
1161 $start = $scanData->{_orf}->{$HMMResults->seqName}->{start} + ($unit->envFrom * 3) - 3;
1162 $end = $scanData->{_orf}->{$HMMResults->seqName}->{start} + ($unit->envTo * 3) - 3;
1163 }elsif($strand eq '-'){
1164 $start = $scanData->{_orf}->{$HMMResults->seqName}->{start} - ($unit->envFrom * 3) + 3;
1165 $end = $scanData->{_orf}->{$HMMResults->seqName}->{start} - ($unit->envTo * 3) + 3;
1166 }
1167 }
1168 print $fh "$strand $start $end";
1169 }
1170
1171 print $fh "\n";
1172 }
1173
1174 if($scanData->{_align}){
1175 print $fh sprintf( "%-10s %s\n", "#HMM", $unit->hmmalign->{hmm} );
1176 print $fh sprintf( "%-10s %s\n", "#MATCH", $unit->hmmalign->{match} );
1177 print $fh sprintf( "%-10s %s\n", "#PP", $unit->hmmalign->{pp});
1178 print $fh sprintf( "%-10s %s\n", "#SEQ", $unit->hmmalign->{seq});
1179 print $fh sprintf( "%-10s %s\n", "#CS", $unit->hmmalign->{cs}) if($unit->hmmalign->{cs});
1180 }
1181
1182 }
1183
1184 }
1185
1186 1;