Mercurial > repos > matteoc > agame_custom_tools
comparison pfamScan/Bio/Pfam/HMM/HMMResultsIO.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68a3648c7d91 |
---|---|
1 # HMMResultsIO.pm | |
2 # | |
3 # Author: rdf | |
4 # Maintainer: $Id: HMMResultsIO.pm,v 1.2 2009-12-01 15:42:20 jt6 Exp $ | |
5 # Version: $Revision: 1.2 $ | |
6 # Created: Nov 16, 2008 | |
7 # Last Modified: $Date: 2009-12-01 15:42:20 $ | |
8 | |
9 =head1 NAME | |
10 | |
11 Template - a short description of the class | |
12 | |
13 =cut | |
14 | |
15 package Bio::Pfam::HMM::HMMResultsIO; | |
16 | |
17 =head1 DESCRIPTION | |
18 | |
19 A more detailed description of what this class does and how it does it. | |
20 | |
21 $Id: HMMResultsIO.pm,v 1.2 2009-12-01 15:42:20 jt6 Exp $ | |
22 | |
23 =head1 COPYRIGHT | |
24 | |
25 File: HMMResultsIO.pm | |
26 | |
27 Copyright (c) 2007: Genome Research Ltd. | |
28 | |
29 Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk) | |
30 | |
31 This is free software; you can redistribute it and/or | |
32 modify it under the terms of the GNU General Public License | |
33 as published by the Free Software Foundation; either version 2 | |
34 of the License, or (at your option) any later version. | |
35 | |
36 This program is distributed in the hope that it will be useful, | |
37 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
38 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
39 GNU General Public License for more details. | |
40 | |
41 You should have received a copy of the GNU General Public License | |
42 along with this program; if not, write to the Free Software | |
43 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
44 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt | |
45 | |
46 =cut | |
47 | |
48 use strict; | |
49 use warnings; | |
50 use Moose; | |
51 use Carp; | |
52 | |
53 #All the things we need to objectfy the search results | |
54 use Bio::Pfam::HMM::HMMResults; | |
55 use Bio::Pfam::HMM::HMMSequence; | |
56 use Bio::Pfam::HMM::HMMUnit; | |
57 | |
58 #------------------------------------------------------------------------------- | |
59 | |
60 =head1 ATTRIBUTES | |
61 | |
62 | |
63 | |
64 =cut | |
65 | |
66 has 'align' => ( | |
67 isa => 'Int', | |
68 is => 'rw', | |
69 default => 0 | |
70 ); | |
71 | |
72 has 'outfile' => ( | |
73 isa => 'Str', | |
74 is => 'rw', | |
75 default => 'OUTPUT' | |
76 ); | |
77 | |
78 has 'pfamout' => ( | |
79 isa => 'Str', | |
80 is => 'rw', | |
81 default => 'PFAMOUT' | |
82 ); | |
83 | |
84 has 'scores' => ( | |
85 isa => 'Str', | |
86 is => 'rw', | |
87 default => 'scores' | |
88 ); | |
89 | |
90 #------------------------------------------------------------------------------- | |
91 | |
92 =head1 METHODS | |
93 | |
94 =head2 parseHMMER3 | |
95 | |
96 Title : parseHMMER | |
97 Usage : $hmmResIO->parseHMMSearch( filename ) | |
98 Function : Parse the output from a HMMER3 search results | |
99 Args : Filename containing the search | |
100 Returns : A Bio::Pfam::HMM::HMMResults object | |
101 | |
102 =cut | |
103 | |
104 sub parseHMMER3 { | |
105 my ( $self, $filename ) = @_; | |
106 my $fh; | |
107 | |
108 if(ref($filename) eq 'GLOB'){ | |
109 $fh = $filename; | |
110 }else{ | |
111 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n"; | |
112 } | |
113 | |
114 # open( $fh, $filename ) or confess "Could not open $filename:[$!]\n"; | |
115 my $hmmRes = Bio::Pfam::HMM::HMMResults->new; | |
116 $self->_readHeader( $fh, $hmmRes ); | |
117 $self->_readSeqHits( $fh, $hmmRes ); | |
118 $self->_readUnitHits( $fh, $hmmRes ); | |
119 $self->_readFooter($fh, $hmmRes); | |
120 return ($hmmRes); | |
121 } | |
122 | |
123 | |
124 | |
125 sub parseMultiHMMER3 { | |
126 my ( $self, $filename ) = @_; | |
127 my $fh; | |
128 | |
129 if(ref($filename) eq 'GLOB'){ | |
130 $fh = $filename; | |
131 }elsif( ref($filename) and $filename->isa('IO::File') ) { | |
132 $fh = $filename; | |
133 }else{ | |
134 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n"; | |
135 } | |
136 | |
137 my @hmmResAll; | |
138 my $program; | |
139 while(!eof($fh)){ | |
140 my $hmmRes = Bio::Pfam::HMM::HMMResults->new; | |
141 my $eof = $self->_readHeader( $fh, $hmmRes ); | |
142 last if($eof); | |
143 push(@hmmResAll, $hmmRes); | |
144 if($hmmRes->program) { | |
145 $program = $hmmRes->program; | |
146 } | |
147 else { | |
148 $hmmRes->program($program); | |
149 } | |
150 $self->_readSeqHits( $fh, $hmmRes ); | |
151 $self->_readUnitHits( $fh, $hmmRes ); | |
152 $self->_readFooter($fh, $hmmRes); | |
153 } | |
154 return (\@hmmResAll); | |
155 } | |
156 | |
157 sub parseSplitHMMER3 { | |
158 my($self, $files ) = @_; | |
159 | |
160 my $hmmRes = Bio::Pfam::HMM::HMMResults->new; | |
161 | |
162 foreach my $filename (@{$files}){ | |
163 my ($fh); | |
164 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n"; | |
165 $self->_readHeader( $fh, $hmmRes ); | |
166 $self->_readSeqHits( $fh, $hmmRes ); | |
167 $self->_readUnitHits( $fh, $hmmRes ); | |
168 $self->_readFooter($fh, $hmmRes); | |
169 } | |
170 | |
171 return ( $hmmRes ); | |
172 | |
173 } | |
174 | |
175 | |
176 #------------------------------------------------------------------------------- | |
177 | |
178 =head2 convertHMMSearch | |
179 | |
180 Title : convertHMMSearch | |
181 Usage : $hmmResIO->convertHMMSearch('SEARCHFILE') | |
182 Function : This wraps up a couple of methods to convert the more complex hmmsearch | |
183 : results in to nice clean format that we Pfam-ers are used to. | |
184 Args : The filename of the hmmsearch output file | |
185 Returns : Nothing | |
186 | |
187 =cut | |
188 | |
189 sub convertHMMSearch { | |
190 my ( $self, $filename ) = @_; | |
191 | |
192 unless ($filename) { | |
193 confess "No filename passed in to convertHMMSearch\n"; | |
194 } | |
195 unless ( -s $filename ) { | |
196 confess "$filename does not exists\n"; | |
197 } | |
198 | |
199 #Now parse in the raw HMM output and write out the results as a PFAMOUT | |
200 my $hmmRes = $self->parseHMMER3($filename); | |
201 $self->writePFAMOUT($hmmRes); | |
202 return $hmmRes; | |
203 } | |
204 | |
205 #------------------------------------------------------------------------------- | |
206 | |
207 =head2 writePFAMOUT | |
208 | |
209 Title : writePFAMOUT | |
210 Usage : $hmmResIO->writePFAMOUT( $hmmRes ) | |
211 Function : Writes a Bio::Pfam::HMM:HMMResults object in to a PFAMOUT file. | |
212 Args : A Bio::Pfam::HMM:HMMResults | |
213 Returns : Nothing | |
214 | |
215 =cut | |
216 | |
217 sub writePFAMOUT { | |
218 my ( $self, $hmmRes ) = @_; | |
219 | |
220 unless ($hmmRes) { | |
221 confess "A Bio::Pfam::HMM::HMMResults object was not parsed in\n"; | |
222 } | |
223 unless ( $hmmRes->isa("Bio::Pfam::HMM::HMMResults") ) { | |
224 confess("Variable passed in is not a Bio::Pfam::HMM::Results object"); | |
225 } | |
226 | |
227 my $fh; | |
228 open( $fh, ">" . $self->pfamout ) | |
229 or confess "Could not open " . $self->pfamout . ":[$!]\n"; | |
230 | |
231 print $fh <<HEAD; | |
232 # =========== | |
233 # Pfam output | |
234 # =========== | |
235 # | |
236 # Sequence scores | |
237 # --------------- | |
238 # | |
239 # name description bits evalue n exp bias | |
240 | |
241 HEAD | |
242 | |
243 foreach | |
244 my $seq ( sort { $b->bits <=> $a->bits } ( @{ $hmmRes->eachHMMSeq } ) ) | |
245 { | |
246 $_ = $seq->desc; | |
247 my ($desc) = /^(.{1,42})/; | |
248 $desc = uc($desc); | |
249 printf $fh ( | |
250 "%-15s %-42s %8.1f %9s %3d %5.1f %5.1f\n", | |
251 $seq->name, | |
252 $desc, | |
253 $seq->bits, | |
254 $seq->evalue, | |
255 scalar( @{ $seq->hmmUnits } ), | |
256 defined( $seq->exp ) ? $seq->exp : "-", | |
257 defined( $seq->bias ) ? $seq->bias : "-" | |
258 ); | |
259 } | |
260 | |
261 print $fh <<HEAD; | |
262 # | |
263 # Domain scores | |
264 # ------------- | |
265 # | |
266 # name env-st env-en ali-st ali-en hmm-st hmm-en bits evalue hit bias | |
267 # | |
268 | |
269 HEAD | |
270 | |
271 foreach my $dom ( sort { $b->bits <=> $a->bits } @{ $hmmRes->units } ) { | |
272 | |
273 | |
274 printf $fh ( | |
275 "%-15s %6d %6d %6d %6d %6s %6s %6.1f %9s %6d %6.1f\n", | |
276 $dom->name, | |
277 $dom->envFrom, | |
278 $dom->envTo, | |
279 $dom->seqFrom, | |
280 $dom->seqTo, | |
281 $dom->hmmFrom, | |
282 $dom->hmmTo, | |
283 $dom->bits, | |
284 $dom->evalue, | |
285 $dom->domain, | |
286 defined( $dom->bias ) ? $dom->bias : "-", | |
287 | |
288 ); | |
289 } | |
290 } | |
291 | |
292 #------------------------------------------------------------------------------- | |
293 | |
294 =head2 parsePFAMOUT | |
295 | |
296 Title : parsePFAMOUT | |
297 Usage : $self->parsePFAMOUT($filename) | |
298 Function : Reads in a PFAMOUT file. This file contains the minimal amount of information | |
299 : require to constrcut a pfam ALIGN file. | |
300 Args : A filename. Normally this is filename | |
301 Returns : A Bio::Pfam::HMM::HMMResults object | |
302 | |
303 =cut | |
304 | |
305 sub parsePFAMOUT { | |
306 my $self = shift; | |
307 my $filename = shift; | |
308 | |
309 unless ($filename) { | |
310 confess('No filename or filehandle passed to parsePFAMOUT'); | |
311 } | |
312 | |
313 my $fh; | |
314 if ( ref($filename) eq 'GLOB' ) { | |
315 $fh = $filename; | |
316 } | |
317 else { | |
318 open( $fh, $filename ) or confess "Could not open $filename:[$!]\n"; | |
319 } | |
320 my $hmmRes = Bio::Pfam::HMM::HMMResults->new; | |
321 | |
322 while (<$fh>) { | |
323 /^# Domain scores/ && last; | |
324 | |
325 #if (/^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\d+)\s*$/) { | |
326 if (/^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\d+)\s+\S+\s+(\S+)\s*$/) { | |
327 | |
328 $hmmRes->addHMMSeq( | |
329 Bio::Pfam::HMM::HMMSequence->new( | |
330 { | |
331 name => $1, | |
332 desc => $2, | |
333 bits => $3, | |
334 evalue => $4, | |
335 numberHits => $5, | |
336 bias => $6 | |
337 } | |
338 ) | |
339 ); | |
340 } | |
341 elsif (/^#|^\s+$/) { | |
342 next; | |
343 } | |
344 else { | |
345 warn "Did not parse|$_|\n"; | |
346 } | |
347 } | |
348 while (<$fh>) { | |
349 | |
350 #if (/^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s*$/) { | |
351 if ( | |
352 /^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)/) | |
353 { | |
354 $hmmRes->addHMMUnit( | |
355 Bio::Pfam::HMM::HMMUnit->new( | |
356 { | |
357 name => $1, | |
358 envFrom => $2, | |
359 envTo => $3, | |
360 seqFrom => $4, | |
361 seqTo => $5, | |
362 hmmFrom => $6, | |
363 hmmTo => $7, | |
364 bits => $8, | |
365 evalue => $9, | |
366 bias => $10 | |
367 } | |
368 ) | |
369 ); | |
370 } | |
371 elsif (/^#|^\s+$/) { | |
372 next; | |
373 } | |
374 elsif (/^$/) { | |
375 next; | |
376 } | |
377 else { | |
378 warn "Did not parse: |$_|"; | |
379 } | |
380 } | |
381 close($fh); | |
382 return ($hmmRes); | |
383 } | |
384 | |
385 #------------------------------------------------------------------------------- | |
386 | |
387 =head2 _readHeader | |
388 | |
389 Title : _readHeader | |
390 Usage : Private method. $self->_readHeader(\*FH, $hmmResults) | |
391 Function : Reads the header section from a HMMER3 hmmsearch | |
392 Args : The file handle to hmmsearch output, a Bio::Pfam::HMM::HMMResults object | |
393 Returns : Nothing | |
394 | |
395 =cut | |
396 | |
397 #Parse the header part of the output first; | |
398 sub _readHeader { | |
399 my ( $self, $hs, $hmmRes ) = @_; | |
400 | |
401 #Check the $hs is defined and a GLOB | |
402 | |
403 while (<$hs>) { | |
404 if (/^Scores for complete/) { | |
405 last; | |
406 } | |
407 elsif (/^# query HMM file:\s+(\S+)/) { | |
408 $hmmRes->hmmName($1); | |
409 } | |
410 elsif (/^# target sequence database:\s+(\S+)/) { | |
411 $hmmRes->seqDB($1); | |
412 } | |
413 elsif (/^output directed to file:\s+(\S+)/) { | |
414 $hmmRes->thisFile($1); | |
415 } | |
416 elsif (/^Query:\s+(\S+)\s+\[M\=(\d+)\]/) { | |
417 $hmmRes->seedName($1); | |
418 $hmmRes->hmmLength($2); | |
419 }elsif(/^Query:\s+(\S+)\s+\[L\=(\d+)\]/) { | |
420 $hmmRes->seqName($1); | |
421 $hmmRes->seqLength($2); | |
422 }elsif (/^sequence E-value threshold: <= (\d+)/) { | |
423 $hmmRes->evalueThr($1); | |
424 } | |
425 elsif (/^# Random generator seed: (\d+)/) { | |
426 $hmmRes->randSeedNum($1); | |
427 }elsif(/^Description:\s+(.*)/){ | |
428 $hmmRes->description($1); | |
429 }elsif(/^# (phmmer|hmmsearch|hmmscan|jackhmmer)/){ | |
430 $hmmRes->program($1); | |
431 }elsif (/(^#)|(^$)/) { | |
432 next; | |
433 }elsif(/^Accession/){ | |
434 next; | |
435 } elsif(/^\[ok\]/) { | |
436 return(1); | |
437 } else { | |
438 die "Failed to parse hmmsearch results |$_| in header section\n"; | |
439 } | |
440 } | |
441 } | |
442 | |
443 #------------------------------------------------------------------------------- | |
444 | |
445 =head2 _readSeqHits | |
446 | |
447 Title : _readSeqHits | |
448 Usage : Private method. $self->_readSeqHits(\*FH, $hmmResults) | |
449 Function : Reads the sequence hits from a HMMER3 hmmsearch | |
450 Args : The file handle to hmmsearch output, a Bio::Pfam::HMM::HMMResults object | |
451 Returns : Nothing | |
452 | |
453 =cut | |
454 | |
455 sub _readSeqHits { | |
456 my ( $self, $hs, $hmmRes ) = @_; | |
457 while (<$hs>) { | |
458 | |
459 #Match a line like this | |
460 # E-value score bias E-value score bias exp N Sequence Description | |
461 # ------- ------ ----- ------- ------ ----- ---- -- -------- ----------- | |
462 # 4e-83 285.8 10.0 5.3e-83 285.5 7.0 1.1 1 Q14SN3.1 Q14SN3_9HEPC Polyprotein (Fragment). | |
463 if (/^Domain annotation for each [sequence|model]/) { # This is the format for HMMER3b3 | |
464 last; | |
465 } | |
466 elsif (/^Domain and alignment annotation for each [sequence|model]/) { #This is the format for HMMER3b2 - can be removed later | |
467 last; | |
468 } | |
469 elsif (/^\s+(E-value|---)/) { | |
470 next; | |
471 } | |
472 elsif (/^$/) { | |
473 next; | |
474 } | |
475 else { | |
476 next if(/No hits detected that satisfy reporting thresholds/); | |
477 | |
478 #Assume that we have a sequence match | |
479 my @sMatch = split( /\s+/, $_ ); | |
480 unless ( scalar(@sMatch) >= 10 ) { | |
481 die "Expected at least 10 pieces of data: $_;\n"; | |
482 } | |
483 my $desc; | |
484 if ( scalar(@sMatch) >= 11 ) { | |
485 $desc = join( " ", @sMatch[ 10 .. $#sMatch ] ); | |
486 } | |
487 | |
488 $hmmRes->addHMMSeq( | |
489 Bio::Pfam::HMM::HMMSequence->new( | |
490 { | |
491 evalue => $sMatch[1], | |
492 bits => $sMatch[2], | |
493 bias => $sMatch[3], | |
494 exp => $sMatch[7], | |
495 numberHits => $sMatch[8], | |
496 name => $sMatch[9], | |
497 desc => defined($desc) ? $desc : "-", | |
498 } | |
499 ) | |
500 ); | |
501 | |
502 next; | |
503 } | |
504 die "Failed to parse $_ in sequence section\n"; | |
505 } | |
506 | |
507 } | |
508 | |
509 #------------------------------------------------------------------------------ | |
510 | |
511 =head2 _readUnitHits | |
512 | |
513 Title : _readUnitHits | |
514 Usage : Private method. $self->_readUnitHits(\*FH, $hmmResults) | |
515 Function : Reads the unit (domain) hits from a HMMER3 hmmsearch | |
516 Args : The file handle to hmmsearch output, a Bio::Pfam::HMM::HMMResults object | |
517 Returns : Nothing | |
518 | |
519 =cut | |
520 | |
521 no warnings 'recursion'; | |
522 | |
523 sub _readUnitHits { | |
524 my ( $self, $hs, $hmmRes ) = @_; | |
525 | |
526 if($hmmRes->eof){ | |
527 return; | |
528 } | |
529 | |
530 #Parse the domain hits section | |
531 #>> P37935.1 MAAY4_SCHCO Mating-type protein A-alpha Y4. | |
532 # # bit score bias E-value ind Evalue hmm from hmm to ali from ali to env from env to ali-acc | |
533 # --- --------- ------- ---------- ---------- -------- -------- -------- -------- -------- -------- ------- | |
534 # 1 244.0 0.5 9.5e-76 1.7e-70 1 146 [. 1 145 [. 1 146 [. 0.99 | |
535 # | |
536 # Alignments for each domain: | |
537 # == domain 1 score: 244.0 bits; conditional E-value: 9.5e-76 | |
538 # SEED 1 medrlallkaisasakdlvalaasrGaksipspvkttavkfdplptPdldalrtrlkeaklPakaiksalsayekaCarWrsdleeafdktaksvsPanlhllealrirlyteqvekWlvqvlevaerWkaemekqrahiaatmgp 146 | |
539 # m+++la+l++isa+akd++ala+srGa+++ +p++tt+++fd+l++P+ld++rtrl+ea+lP+kaik++lsaye+aCarW++dleeafd+ta+s+sP+n+++l++lr+rly+eqv+kWl++vl+v+erWkaemekqrahi+atmgp | |
540 # P37935.1 1 MAELLACLQSISAHAKDMMALARSRGATGS-RPTPTTLPHFDELLPPNLDFVRTRLQEARLPPKAIKGTLSAYESACARWKHDLEEAFDRTAHSISPHNFQRLAQLRTRLYVEQVQKWLYEVLQVPERWKAEMEKQRAHINATMGP 145 | |
541 # 899***************************.******************************************************************************************************************8 PP | |
542 | |
543 while (<$hs>) { | |
544 if (/^Internal/) { | |
545 last; | |
546 } | |
547 elsif (/\>\>\s+(\S+)/) { | |
548 my $seqId = $1; | |
549 $self->_readUnitData( $seqId, $hs, $hmmRes ); | |
550 if($hmmRes->eof){ | |
551 return; | |
552 } | |
553 } | |
554 } | |
555 } | |
556 | |
557 sub _readUnitData { | |
558 my ( $self, $id, $hs, $hmmRes ) = @_; | |
559 | |
560 if($hmmRes->eof){ | |
561 return; | |
562 } | |
563 my $hmmName = $hmmRes->seedName(); | |
564 | |
565 my $seqName = $hmmRes->seqName; | |
566 | |
567 # bit score bias E-value ind Evalue hmm from hmm to ali from ali to env from env to ali-acc | |
568 # --- --------- ------- ---------- ---------- -------- -------- -------- -------- -------- -------- ------- | |
569 # 1 244.0 0.5 9.5e-76 1.7e-70 1 146 [. 1 145 [. 1 146 [. 0.99 | |
570 # | |
571 # Alignments for each domain: | |
572 | |
573 my @units; | |
574 my $align = 1; | |
575 my $recurse = 0; | |
576 my $eof = 0; | |
577 my ($nextSeqId); | |
578 while (<$hs>) { | |
579 if (/^[(\/\/|Internal)]/ ) { | |
580 $align = 0; | |
581 $recurse = 0; | |
582 $eof = 1; | |
583 last; | |
584 } | |
585 elsif (/^\>\>\s+(\S+)/) { | |
586 $nextSeqId = $1; | |
587 $align = 0; | |
588 $recurse = 1; | |
589 last; | |
590 } | |
591 elsif (/^\s+Alignments for each domain:/) { | |
592 $align = 1; | |
593 $recurse = 0; | |
594 last; | |
595 } | |
596 elsif (/^\s+(#\s+score|---)/){ | |
597 | |
598 #Two human readable lines | |
599 next; | |
600 } | |
601 elsif (/^$/) { | |
602 | |
603 #blank line | |
604 next; | |
605 } | |
606 elsif (/^\s+\d+\s+/) { | |
607 my @dMatch = split( /\s+/, $_ ); | |
608 unless ( scalar(@dMatch) == 17 ) { | |
609 die "Expected 16 elements of data: $_\n"; | |
610 } | |
611 | |
612 push( | |
613 @units, | |
614 Bio::Pfam::HMM::HMMUnit->new( | |
615 { | |
616 name => $id, | |
617 domain => $dMatch[1], | |
618 bits => $dMatch[3], | |
619 bias => $dMatch[4], | |
620 domEvalue => $dMatch[5], | |
621 evalue => $dMatch[6], | |
622 hmmFrom => $dMatch[7], | |
623 hmmTo => $dMatch[8], | |
624 seqFrom => $dMatch[10], | |
625 seqTo => $dMatch[11], | |
626 envFrom => $dMatch[13], | |
627 envTo => $dMatch[14], | |
628 aliAcc => $dMatch[16] | |
629 } | |
630 ) | |
631 ); | |
632 | |
633 next; | |
634 } | |
635 elsif(/^\s+\[No individual domains/) { | |
636 $align=0; | |
637 next; | |
638 } | |
639 else { | |
640 confess("Did not parse line: $_"); | |
641 } | |
642 } | |
643 | |
644 # == domain 1 score: 244.0 bits; conditional E-value: 9.5e-76 | |
645 # SEED 1 medrlallkaisasakdlvalaasrGaksipspvkttavkfdplptPdldalrtrlkeaklPakaiksalsayekaCarWrsdleeafdktaksvsPanlhllealrirlyteqvekWlvqvlevaerWkaemekqrahiaatmgp 146 | |
646 # m+++la+l++isa+akd++ala+srGa+++ +p++tt+++fd+l++P+ld++rtrl+ea+lP+kaik++lsaye+aCarW++dleeafd+ta+s+sP+n+++l++lr+rly+eqv+kWl++vl+v+erWkaemekqrahi+atmgp | |
647 # P37935.1 1 MAELLACLQSISAHAKDMMALARSRGATGS-RPTPTTLPHFDELLPPNLDFVRTRLQEARLPPKAIKGTLSAYESACARWKHDLEEAFDRTAHSISPHNFQRLAQLRTRLYVEQVQKWLYEVLQVPERWKAEMEKQRAHINATMGP 145 | |
648 # 899***************************.******************************************************************************************************************8 PP | |
649 # | |
650 # OR.... | |
651 # | |
652 # == domain 1 score: 27.6 bits; conditional E-value: 7.4e-10 | |
653 # PF00018 17 LsfkkGdvitvleksee.eWwkaelkdg.keGlvPsnYvep 55 | |
654 # L++++Gd+++++++++e++Ww++++++++++G++P+n+v+p | |
655 # P15498.4 617 LRLNPGDIVELTKAEAEqNWWEGRNTSTnEIGWFPCNRVKP 657 | |
656 # 7899**********9999*******************9987 PP | |
657 | |
658 | |
659 if ($align) { | |
660 my ($pattern1, $pattern2); | |
661 | |
662 if($hmmName and $hmmRes->program eq 'hmmsearch'){ | |
663 $pattern1 = qr/^\s+$hmmName\s+\d+\s+(\S+)\s+\d+/; | |
664 $id =~ s/(\W)/\\$1/g; # escape any non-word character | |
665 # $id =~ s/\|/\\|/g; #Escape '|', '[' and ']' characters | |
666 # $id =~ s/\[/\\[/g; | |
667 # $id =~ s/\]/\\]/g; | |
668 $pattern2 = qr/^\s+$id\s+\d+\s+(\S+)\s+\d+/; | |
669 }elsif($seqName and $hmmRes->program eq 'hmmscan'){ | |
670 my $tmpSeqName = $seqName; | |
671 $tmpSeqName =~ s/(\W)/\\$1/g; # escape any non-word character | |
672 # $tmpSeqName =~ s/\|/\\|/g; #Escape '|', '[' and ']' characters | |
673 # $tmpSeqName =~ s/\[/\\[/g; | |
674 # $tmpSeqName =~ s/\]/\\]/g; | |
675 $pattern1 = qr/^\s+$id\s+\d+\s+(\S+)\s+\d+/; | |
676 $pattern2 = qr/^\s+$tmpSeqName\s+\d+\s+(\S+)\s+\d+/; | |
677 }elsif($seqName and ($hmmRes->program eq 'phmmer' or $hmmRes->program eq 'jackhmmer') ){ | |
678 $seqName =~ s/(\W)/\\$1/g; # escape any non-word character | |
679 # $seqName =~ s/\|/\|/g; #Escape '|', '[' and ']' characters | |
680 # $seqName =~ s/\[/\\[/g; | |
681 # $seqName =~ s/\]/\\]/g; | |
682 $pattern1 = qr/^\s+$seqName\s+\d+\s+(\S+)\s+\d+/; | |
683 $pattern2 = qr/^\s+$id\s+\d+\s+(\S+)\s+\d+/; | |
684 } | |
685 | |
686 | |
687 $recurse = 0; | |
688 my $matchNo; | |
689 my $hmmlen = 0; | |
690 while (<$hs>) { | |
691 if (/$pattern1/) { | |
692 $units[ $matchNo - 1 ]->hmmalign->{hmm} .= $1; | |
693 $hmmlen = length($1); | |
694 } | |
695 elsif (/$pattern2/) { | |
696 $units[ $matchNo - 1 ]->hmmalign->{seq} .= $1; | |
697 } | |
698 elsif (/^\s+([x\.]+)\s+RF$/) { | |
699 my $rf = $1; | |
700 $units[ $matchNo - 1 ]->hmmalign->{rf} .= $rf; | |
701 } | |
702 elsif (/^\s+([0-9\*\.]+)\s+PP$/) { | |
703 my $pp = $1; | |
704 $units[ $matchNo - 1 ]->hmmalign->{pp} .= $pp; | |
705 }elsif (/^\s+(\S+)\s+CS$/) { | |
706 my $cs = $1; | |
707 $units[ $matchNo - 1 ]->hmmalign->{cs} .= $cs; | |
708 }elsif (/^\s+==\s+domain\s+(\d+)/) { | |
709 $matchNo = $1; | |
710 } | |
711 elsif (/^\s+(.*)\s+$/) { | |
712 # $1 is *not* the match - this fails if there are prepended | |
713 # or appended spaces | |
714 # $units[ $matchNo - 1 ]->hmmalign->{match} .= $1; | |
715 # Let's get a right substring based on the HMM length | |
716 chomp; | |
717 my $m1 = substr($_,-$hmmlen); | |
718 $units[ $matchNo - 1 ]->hmmalign->{match} .= $m1; | |
719 }elsif (/^$/) { | |
720 next; | |
721 } | |
722 elsif (/^[(\/\/|Internal)]/) { | |
723 $align = 0; | |
724 $recurse = 0; | |
725 $eof = 1; | |
726 last; | |
727 } | |
728 elsif (/^\>\>\s+(\S+)/) { | |
729 $nextSeqId = $1; | |
730 $recurse = 1; | |
731 last; | |
732 } | |
733 | |
734 else { | |
735 confess("Did not parse |$_| in units"); | |
736 } | |
737 } | |
738 } | |
739 | |
740 foreach my $u (@units) { | |
741 $hmmRes->addHMMUnit($u); | |
742 } | |
743 | |
744 $hmmRes->eof($eof); | |
745 | |
746 if ($recurse and $nextSeqId) { | |
747 $self->_readUnitData( $nextSeqId, $hs, $hmmRes ); | |
748 } | |
749 return; | |
750 } | |
751 use warnings 'recursion'; | |
752 | |
753 #------------------------------------------------------------------------------- | |
754 | |
755 =head2 parseHMMER2 | |
756 | |
757 Title : parseHMMER2 | |
758 Usage : $self->parseHMMER2(\*FH ) | |
759 Function : This is a minimal parser for reading in the output of HMMER2 hmmsearch | |
760 Args : The file handle to hmmsearch output | |
761 Returns : A Bio::Pfam::HMM::HMMResults object | |
762 | |
763 =cut | |
764 | |
765 sub parseHMMER2 { | |
766 my $self = shift; | |
767 my $file = shift; | |
768 | |
769 my $hmmRes = Bio::Pfam::HMM::HMMResults->new; | |
770 | |
771 my %seqh; | |
772 my $count = 0; | |
773 | |
774 while (<$file>) { | |
775 /^Scores for complete sequences/ && last; | |
776 } | |
777 | |
778 while (<$file>) { | |
779 /^Parsed for domains/ && last; | |
780 if ( my ( $id, $de, $sc, $ev, $hits ) = | |
781 /^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\d+)\s*$/ ) | |
782 { | |
783 $hmmRes->addHMMSeq( | |
784 Bio::Pfam::HMM::HMMSequence->new( | |
785 { | |
786 bits => $sc, | |
787 evalue => $ev, | |
788 name => $id, | |
789 desc => $de, | |
790 numberHits => $hits | |
791 } | |
792 ) | |
793 ); | |
794 | |
795 $seqh{$id} = $sc; | |
796 } | |
797 } | |
798 | |
799 while (<$file>) { | |
800 /^Histogram of all scores/ && last; | |
801 if ( my ( $id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev ) = | |
802 /^(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/ ) | |
803 { | |
804 $hmmRes->addHMMUnit( | |
805 Bio::Pfam::HMM::HMMUnit->new( | |
806 { | |
807 name => $id, | |
808 seqFrom => $sqfrom, | |
809 seqTo => $sqto, | |
810 hmmFrom => $hmmf, | |
811 hmmTo => $hmmt, | |
812 bits => $sc, | |
813 evalue => $ev | |
814 } | |
815 ) | |
816 ); | |
817 | |
818 } | |
819 } | |
820 | |
821 return $hmmRes; | |
822 } | |
823 | |
824 #------------------------------------------------------------------------------- | |
825 | |
826 =head2 parseHMMER1 | |
827 | |
828 Title : parseHMMER1 | |
829 Usage : $self->parseHMMER1(\*FH ) | |
830 Function : This is a minimal parser for reading in the output of HMMER1 hmmsearch. | |
831 : There are a few hacks to get round some of them requirements | |
832 Args : The file handle to hmmsearch output | |
833 Returns : A Bio::Pfam::HMM::HMMResults object | |
834 | |
835 =cut | |
836 | |
837 sub parseHMMER1 { | |
838 my $self = shift; | |
839 my $file = shift; | |
840 | |
841 my $hmmRes = Bio::Pfam::HMM::HMMResults->new; | |
842 | |
843 my %seqh; | |
844 my $count = 0; | |
845 | |
846 while (<$file>) { | |
847 if ( my ( $bits, $s, $e, $id, $de ) = | |
848 /^(-?\d+\.?\d*)\s+\(bits\)\s+f:\s+(\d+)\s+t:\s+(\d+)\s+Target:\s+(\S+)\s+(.*)/ | |
849 ) | |
850 { | |
851 if ( $id =~ /(\S+)\/(\d+)-(\d+)/ ) { | |
852 $id = $1; | |
853 $s = $2 + $s - 1; | |
854 $e = $2 + $e - 1; | |
855 } | |
856 | |
857 if ( !$hmmRes->seqs->{$id} ) { | |
858 $hmmRes->addHMMSeq( | |
859 Bio::Pfam::HMM::HMMSequence->new( | |
860 { | |
861 bits => $bits, | |
862 evalue => 1, | |
863 name => $id, | |
864 desc => $de, | |
865 numberHits => 1 | |
866 } | |
867 ) | |
868 ); | |
869 } | |
870 $hmmRes->addHMMUnit( | |
871 Bio::Pfam::HMM::HMMUnit->new( | |
872 { | |
873 name => $id, | |
874 seqFrom => $s, | |
875 seqTo => $e, | |
876 hmmFrom => "1", | |
877 hmmTo => "1", | |
878 bits => $bits, | |
879 evalue => "1" | |
880 } | |
881 ) | |
882 ); | |
883 if ( $bits > $hmmRes->seqs->{$id}->bits ) { | |
884 $hmmRes->seqs->{$id}->bits($bits); | |
885 } | |
886 } | |
887 } | |
888 return $hmmRes; | |
889 } | |
890 | |
891 #------------------------------------------------------------------------------- | |
892 | |
893 =head2 writeScoresFile | |
894 | |
895 Title : writeScoresFile | |
896 Usage : $hmmResIO->writeScoresFile( $hmmRes) | |
897 Function : Writes a scores file for a Bio::Pfam::HMM::HMMResults object. | |
898 Args : Bio::Pfam::HMM::HMMResults | |
899 Returns : Nothing | |
900 | |
901 =cut | |
902 | |
903 sub writeScoresFile { | |
904 my ( $self, $hmmRes ) = @_; | |
905 | |
906 unless ($hmmRes) { | |
907 confess "A Bio::Pfam::HMM::HMMResults object was not parsed in\n"; | |
908 } | |
909 unless ( $hmmRes->isa("Bio::Pfam::HMM::HMMResults") ) { | |
910 confess("Variable passed in is not a Bio::Pfam::HMM::Results object"); | |
911 } | |
912 | |
913 my $fh; | |
914 open( $fh, ">" . $self->scores ) | |
915 or confess "Could not open " . $self->scores . ":[$!]\n"; | |
916 | |
917 my ( $lowSeq, $lowDom, $highSeq, $highDom ); | |
918 $lowSeq = $lowDom = 999999.99; | |
919 $highSeq = $highDom = -999999.99; | |
920 unless ( defined $hmmRes->domThr and defined $hmmRes->seqThr ) { | |
921 warn "No threshold set, setting to 25.0 bits\n"; | |
922 $hmmRes->domThr("25.0"); | |
923 $hmmRes->seqThr("25.0"); | |
924 } | |
925 | |
926 my @sigUnits; | |
927 | |
928 foreach my $seqId ( keys %{ $hmmRes->seqs } ) { | |
929 | |
930 #Does this sequence score above or equal to the sequence threshold? | |
931 if ( $hmmRes->seqs->{$seqId}->bits >= $hmmRes->seqThr ) { | |
932 | |
933 #Is this the lowest sequence thresh | |
934 if ( $hmmRes->seqs->{$seqId}->bits < $lowSeq ) { | |
935 $lowSeq = $hmmRes->seqs->{$seqId}->bits; | |
936 } | |
937 | |
938 #For each of the regions found on the sequence, look to see if the match is great | |
939 #than the domain threshold. If it is, is it lower than we we have seen previously | |
940 foreach my $unit ( @{ $hmmRes->seqs->{$seqId}->hmmUnits } ) { | |
941 if ( $unit->bits >= $hmmRes->domThr ) { | |
942 push( @sigUnits, $unit ); | |
943 if ( $unit->bits < $lowDom ) { | |
944 $lowDom = $unit->bits(); | |
945 } | |
946 } | |
947 elsif ( $unit->bits > $highDom ) { | |
948 $highDom = $unit->bits; | |
949 } | |
950 } | |
951 } | |
952 else { | |
953 | |
954 #Is this the highest sequence thres below the cut-off | |
955 if ( $hmmRes->seqs->{$seqId}->bits > $highSeq ) { | |
956 $highSeq = $hmmRes->seqs->{$seqId}->bits; | |
957 } | |
958 | |
959 #For each of the regions found on the sequence, look to see if the match is great | |
960 #than the domain threshold. If it is, is it lower than we we have seen previously | |
961 foreach my $unit ( @{ $hmmRes->seqs->{$seqId}->hmmUnits } ) { | |
962 if ( $unit->bits < $hmmRes->domThr && $unit->bits > $highDom ) { | |
963 $highDom = $unit->bits; | |
964 } | |
965 } | |
966 } | |
967 } | |
968 | |
969 $hmmRes->domTC($lowDom); | |
970 $hmmRes->seqTC($lowSeq); | |
971 $hmmRes->domNC($highDom); | |
972 $hmmRes->seqNC($highSeq); | |
973 | |
974 #Print the domains to the scores file | |
975 foreach my $u ( sort { $b->bits <=> $a->bits } @sigUnits ) { | |
976 print $fh | |
977 sprintf( "%.1f %s/%s-%s %s-%s %s\n", $u->bits, $u->name, $u->envFrom, $u->envTo, $u->seqFrom, $u->seqTo, $u->evalue ); | |
978 } | |
979 close($fh); | |
980 | |
981 } | |
982 | |
983 #------------------------------------------------------------------------------- | |
984 | |
985 #TODO - write _readAlign | |
986 | |
987 =head2 _readAlign | |
988 | |
989 Title : | |
990 Usage : | |
991 Function : | |
992 Args : | |
993 Returns : | |
994 | |
995 =cut | |
996 | |
997 sub _readAlign { | |
998 my ( $self, $fh, $hmmRes ) = @_; | |
999 | |
1000 } | |
1001 | |
1002 #Parse the alignment section | |
1003 #if($pp){ | |
1004 | |
1005 #}else{ | |
1006 # while(<HS>){ | |
1007 # last if(/^\/\//) | |
1008 # } | |
1009 #} | |
1010 | |
1011 | |
1012 | |
1013 sub _readFooter { | |
1014 my($self, $fh, $hmmRes ) = @_; | |
1015 | |
1016 # We are going to parse something like this! | |
1017 | |
1018 # Internal pipeline statistics summary: | |
1019 #------------------------------------- | |
1020 #Query sequence(s): 1 (360 residues) | |
1021 #Target model(s): 7 (836 nodes) | |
1022 #Passed MSV filter: 2 (0.285714); expected 0.1 (0.02) | |
1023 #Passed Vit filter: 1 (0.142857); expected 0.0 (0.001) | |
1024 #Passed Fwd filter: 1 (0.142857); expected 0.0 (1e-05) | |
1025 #Initial search space (Z): 7 [actual number of targets] | |
1026 #Domain search space (domZ): 1 [number of targets reported over threshold] | |
1027 ## CPU time: 0.00u 0.00s 00:00:00.00 Elapsed: 00:00:00 | |
1028 ## Mc/sec: inf | |
1029 #// | |
1030 | |
1031 while(<$fh>){ | |
1032 if(/\/\//){ | |
1033 last; | |
1034 } | |
1035 } | |
1036 } | |
1037 | |
1038 | |
1039 #Parse the internal summary section | |
1040 #Internal statistics summary: | |
1041 #---------------------------- | |
1042 #Query HMM(s): 1 (0 nodes) | |
1043 #Target sequences: 5323441 (0 residues) | |
1044 #Passed MSV filter: 116519 (-37389918065567040729448769671768824784852036328367855636063687997915136.000; expected 19991592792512146725679052970637918208.000) | |
1045 #Passed Vit filter: 7579 (-0.0000; expected -35982214160587876085407389642471051723332987952235753317595472501307733302049608744822636544.0000) | |
1046 #Passed Fwd filter: 1687 (8.3e+165; expected -7.5e-266) | |
1047 #Mc/sec: 828.85 | |
1048 # CPU time: 115.36u 4.45s 00:01:59.81 Elapsed: 00:03:01 | |
1049 | |
1050 #sub writeHMMSearch { | |
1051 # my ( $self, $hmmRes ) = @_; | |
1052 # my $fh; | |
1053 # open($fh, ">".$self->outfile."\n"); | |
1054 # | |
1055 # $self->_writeHeader($fh, $hmmRes); | |
1056 # $self->_writeSeqHits( $fh, $hmmRes); | |
1057 # $self->_writeDomHits( $fh, $hmmRes); | |
1058 # $self->_writeAlign( $fh, $hmmRes) if($self->align); | |
1059 # $self->_writeInternalSummary( $fh, $hmmRes); | |
1060 #} | |
1061 #sub mergeHMMSearch { | |
1062 # my ( $self, $filenames ) = @_; | |
1063 #} | |
1064 | |
1065 | |
1066 sub write_ascii_out { | |
1067 | |
1068 my ($self, $HMMResults, $fh, $scanData, $e_seq, $e_dom, $b_seq, $b_dom) = @_; | |
1069 | |
1070 | |
1071 $scanData->{_max_seqname} = 20 unless($scanData->{_max_seqname} or $scanData->{_max_seqname} < 1); | |
1072 | |
1073 my $ga; | |
1074 | |
1075 if($e_seq or $e_dom) { | |
1076 $e_seq = $e_dom unless($e_seq); | |
1077 $e_dom = "10" unless($e_dom); | |
1078 } | |
1079 elsif($b_seq or $b_dom) { | |
1080 $b_seq = $b_dom unless($b_seq); | |
1081 $b_dom = "0" unless($b_dom); | |
1082 } | |
1083 else { | |
1084 $ga = 1; | |
1085 } | |
1086 | |
1087 | |
1088 foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $HMMResults->units } ) { | |
1089 | |
1090 if($unit->name =~ /Pfam\-B/) { | |
1091 | |
1092 next unless($HMMResults->seqs->{$unit->name}->evalue <= "0.001" and $unit->evalue <= "0.001"); | |
1093 | |
1094 | |
1095 printf $fh "%-".$scanData->{_max_seqname}."s %6d %6d %6d %6d %-11s %-16s %7s %5d %5d %5d %8s %9s %3s %-8s\n", | |
1096 $HMMResults->seqName, | |
1097 $unit->seqFrom, | |
1098 $unit->seqTo, | |
1099 $unit->envFrom, | |
1100 $unit->envTo, | |
1101 $scanData->{_accmap}->{ $unit->name }, | |
1102 $unit->name, | |
1103 "Pfam-B", | |
1104 $unit->hmmFrom, | |
1105 $unit->hmmTo, | |
1106 $scanData->{_model_len}->{ $unit->name }, | |
1107 $unit->bits, | |
1108 $unit->evalue, | |
1109 "NA", | |
1110 "NA"; | |
1111 | |
1112 | |
1113 } | |
1114 else { | |
1115 | |
1116 #Filter results based on thresholds | |
1117 if($ga) { | |
1118 next unless($unit->sig); | |
1119 } | |
1120 if($e_seq) { | |
1121 next unless($HMMResults->seqs->{$unit->name}->evalue <= $e_seq and $unit->evalue <= $e_dom); | |
1122 } | |
1123 if($b_seq) { | |
1124 | |
1125 next unless($HMMResults->seqs->{$unit->name}->bits >= $b_seq and $unit->bits >= $b_dom); | |
1126 } | |
1127 | |
1128 my $clan = $scanData->{_clanmap}->{ $unit->name } || "No_clan"; | |
1129 | |
1130 | |
1131 printf $fh "%-".$scanData->{_max_seqname}."s %6d %6d %6d %6d %-11s %-16s %7s %5d %5d %5d %8s %9s %3d %-8s ", | |
1132 $HMMResults->seqName, | |
1133 $unit->seqFrom, | |
1134 $unit->seqTo, | |
1135 $unit->envFrom, | |
1136 $unit->envTo, | |
1137 $scanData->{_accmap}->{ $unit->name }, | |
1138 $unit->name, | |
1139 $scanData->{_type}->{ $unit->name }, | |
1140 $unit->hmmFrom, | |
1141 $unit->hmmTo, | |
1142 $scanData->{_model_len}->{ $unit->name }, | |
1143 $unit->bits, | |
1144 $unit->evalue, | |
1145 $unit->sig, | |
1146 $clan; | |
1147 | |
1148 | |
1149 if($unit->{'act_site'}) { | |
1150 local $" = ","; | |
1151 print $fh "predicted_active_site[@{$unit->{'act_site'}}]"; | |
1152 } | |
1153 | |
1154 if($scanData->{_translate}){ | |
1155 my $strand = '?'; | |
1156 my $start = '-'; | |
1157 my $end = '-'; | |
1158 if(exists($scanData->{_orf}->{$HMMResults->seqName})){ | |
1159 $strand = $scanData->{_orf}->{$HMMResults->seqName}->{strand}; | |
1160 if($strand eq '+'){ | |
1161 $start = $scanData->{_orf}->{$HMMResults->seqName}->{start} + ($unit->envFrom * 3) - 3; | |
1162 $end = $scanData->{_orf}->{$HMMResults->seqName}->{start} + ($unit->envTo * 3) - 3; | |
1163 }elsif($strand eq '-'){ | |
1164 $start = $scanData->{_orf}->{$HMMResults->seqName}->{start} - ($unit->envFrom * 3) + 3; | |
1165 $end = $scanData->{_orf}->{$HMMResults->seqName}->{start} - ($unit->envTo * 3) + 3; | |
1166 } | |
1167 } | |
1168 print $fh "$strand $start $end"; | |
1169 } | |
1170 | |
1171 print $fh "\n"; | |
1172 } | |
1173 | |
1174 if($scanData->{_align}){ | |
1175 print $fh sprintf( "%-10s %s\n", "#HMM", $unit->hmmalign->{hmm} ); | |
1176 print $fh sprintf( "%-10s %s\n", "#MATCH", $unit->hmmalign->{match} ); | |
1177 print $fh sprintf( "%-10s %s\n", "#PP", $unit->hmmalign->{pp}); | |
1178 print $fh sprintf( "%-10s %s\n", "#SEQ", $unit->hmmalign->{seq}); | |
1179 print $fh sprintf( "%-10s %s\n", "#CS", $unit->hmmalign->{cs}) if($unit->hmmalign->{cs}); | |
1180 } | |
1181 | |
1182 } | |
1183 | |
1184 } | |
1185 | |
1186 1; |