Mercurial > repos > matteoc > agame_custom_tools
comparison pfamScan/Bio/Pfam/HMM/HMMResults.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68a3648c7d91 |
---|---|
1 # Bio::Pfam::HMM::HMMResults.pm | |
2 # | |
3 # Author: finnr | |
4 # Maintainer: $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $ | |
5 # Version: $Revision: 1.3 $ | |
6 # Created: Nov 19, 2008 | |
7 # Last Modified: $Date: 2009-12-15 14:38:08 $ | |
8 | |
9 =head1 NAME | |
10 | |
11 Bio::Pfam::HMM::HMMResults - A object to represents the results from hmmsearch | |
12 | |
13 =cut | |
14 | |
15 package Bio::Pfam::HMM::HMMResults; | |
16 | |
17 =head1 DESCRIPTION | |
18 | |
19 A more detailed description of what this class does and how it does it. | |
20 | |
21 $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $ | |
22 | |
23 =head1 COPYRIGHT | |
24 | |
25 File: Bio::Pfam::HMM::HMMResults.pm | |
26 | |
27 Copyright (c) 2007: Genome Research Ltd. | |
28 | |
29 Authors: Rob Finn (rdf@sanger.ac.uk) | |
30 | |
31 This is free software; you can redistribute it and/or | |
32 modify it under the terms of the GNU General Public License | |
33 as published by the Free Software Foundation; either version 2 | |
34 of the License, or (at your option) any later version. | |
35 | |
36 This program is distributed in the hope that it will be useful, | |
37 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
38 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
39 GNU General Public License for more details. | |
40 | |
41 You should have received a copy of the GNU General Public License | |
42 along with this program; if not, write to the Free Software | |
43 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
44 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt | |
45 | |
46 =cut | |
47 | |
48 use strict; | |
49 use warnings; | |
50 | |
51 use Moose; | |
52 use Moose::Util::TypeConstraints; | |
53 use Bio::Pfam::HMM::HMMSequence; | |
54 use Bio::Pfam::HMM::HMMUnit; | |
55 | |
56 # | |
57 #------------------------------------------------------------------------------- | |
58 # Attributes | |
59 | |
60 has 'hmmerVersion' => ( | |
61 isa => 'Str', | |
62 is => 'rw', | |
63 ); | |
64 | |
65 has 'hmmName' => ( | |
66 isa => 'Str', | |
67 is => 'rw' | |
68 ); | |
69 | |
70 has 'seqDB' => ( | |
71 isa => 'Str', | |
72 is => 'rw' | |
73 ); | |
74 | |
75 has hmmLength => ( | |
76 isa => 'Int', | |
77 is => 'rw' | |
78 ); | |
79 | |
80 has 'thisFile' => ( | |
81 isa => 'Str', | |
82 is => 'rw' | |
83 ); | |
84 | |
85 has seedName => ( | |
86 isa => 'Str', | |
87 is => 'rw' | |
88 ); | |
89 | |
90 has 'seqs' => ( | |
91 isa => 'HashRef', | |
92 is => 'rw', | |
93 default => sub { {} }, | |
94 ); | |
95 | |
96 has 'units' => ( | |
97 isa => 'ArrayRef', | |
98 is => 'rw', | |
99 default => sub { [] }, | |
100 ); | |
101 | |
102 has 'domThr' => ( | |
103 isa => 'Num', | |
104 is => 'rw', | |
105 default => '25.0' | |
106 ); | |
107 | |
108 has 'seqThr' => ( | |
109 isa => 'Num', | |
110 is => 'rw', | |
111 default => '25.0' | |
112 ); | |
113 | |
114 has 'evalueThr' => ( | |
115 isa => 'Num', | |
116 is => 'rw' | |
117 ); | |
118 | |
119 has 'domTC' => ( | |
120 isa => 'Num', | |
121 is => 'rw' | |
122 ); | |
123 | |
124 has 'seqTC' => ( | |
125 isa => 'Num', | |
126 is => 'rw' | |
127 ); | |
128 | |
129 has 'domNC' => ( | |
130 isa => 'Num', | |
131 is => 'rw' | |
132 ); | |
133 | |
134 has 'seqNC' => ( | |
135 isa => 'Num', | |
136 is => 'rw' | |
137 ); | |
138 | |
139 has 'randSeedNum' => ( | |
140 isa => 'Int', | |
141 is => 'rw' | |
142 ); | |
143 | |
144 has 'description' => ( | |
145 isa => 'Str', | |
146 is => 'rw' | |
147 ); | |
148 | |
149 has 'seqName' => ( | |
150 isa => 'Str', | |
151 is => 'rw' | |
152 ); | |
153 | |
154 has 'seqLength' => ( | |
155 isa => 'Int', | |
156 is => 'rw' | |
157 ); | |
158 | |
159 | |
160 has 'eof' => ( | |
161 isa => 'Int', | |
162 is => 'rw', | |
163 default => 0 | |
164 ); | |
165 | |
166 has 'program' => ( | |
167 isa => 'Str', | |
168 is => 'rw' | |
169 ); | |
170 | |
171 =head1 METHODS | |
172 | |
173 =head2 addHMMSeq | |
174 | |
175 Title : addHMMSeq | |
176 Usage : $hmmRes->addHMMSeq( $hmmSeqObj ) | |
177 Function : Adds a Bio::Pfam::HMM::HMMSequence object to the results object | |
178 Args : A Bio::Pfam::HMM::HMMSequence object | |
179 Returns : nothing | |
180 | |
181 =cut | |
182 | |
183 sub addHMMSeq { | |
184 my( $self, $hmmSeq ) = @_; | |
185 | |
186 unless($hmmSeq->isa('Bio::Pfam::HMM::HMMSequence')){ | |
187 die 'Trying to add a non Bio::Pfam::HMM::HMMSequence object'; | |
188 } | |
189 | |
190 if($self->seqs){ | |
191 if($self->seqs->{$hmmSeq->name}){ | |
192 die "Trying to add the same sequence twice"; | |
193 } | |
194 } | |
195 | |
196 $self->seqs->{$hmmSeq->name} = $hmmSeq; | |
197 } | |
198 | |
199 | |
200 =head2 eachHMMSeq | |
201 | |
202 Title : eachHMMSeq | |
203 Usage : my @seqs = $hmmRes->eachHMMSeq | |
204 Function : Returns an array reference containing the references to all of the | |
205 : Bio::Pfam::HMM::HMMSequence objects stored in the HMMResults object. | |
206 Args : None | |
207 Returns : Array reference | |
208 | |
209 =cut | |
210 | |
211 sub eachHMMSeq { | |
212 my ($self ) = @_; | |
213 my @seqs; | |
214 my $seqRefs = $self->seqs; | |
215 foreach my $n (keys %{ $seqRefs }){ | |
216 push(@seqs, $seqRefs->{$n}); | |
217 } | |
218 return(\@seqs); | |
219 } | |
220 | |
221 #------------------------------------------------------------------------------- | |
222 | |
223 =head2 addHMMUnit | |
224 | |
225 Title : addHMMUnit | |
226 Usage : $hmmRes | |
227 Function : Adds HMM units (the actual region hit) to the HMMSequence in the object | |
228 : and for convenience to the the results sets. All we store are duplicates | |
229 : of the references. | |
230 Args : A Bio::Pfam::HMM:HMMUnit | |
231 Returns : Nothing | |
232 | |
233 =cut | |
234 | |
235 sub addHMMUnit { | |
236 my ($self, $hmmUnit) = @_; | |
237 | |
238 unless($hmmUnit->isa('Bio::Pfam::HMM::HMMUnit')){ | |
239 die "Trying to add an non-Bio::Pfam::HMM::HMMUnit\n"; | |
240 } | |
241 | |
242 if($self->seqs){ | |
243 if($self->seqs->{$hmmUnit->name}){ | |
244 $self->seqs->{$hmmUnit->name}->addHMMUnit($hmmUnit); | |
245 }else{ | |
246 warn "Could not add hmmUnit as the sequence has not been added\n"; | |
247 } | |
248 } | |
249 | |
250 #More conveinence we store the point to the hmmunit in an array | |
251 push(@{$self->units},$hmmUnit); | |
252 } | |
253 | |
254 | |
255 #------------------------------------------------------------------------------- | |
256 | |
257 =head2 domainBitsCutoffFromEvalue | |
258 | |
259 Title : domainBitsCutoffFromEvalue | |
260 Usage : $hmmRes->domainBitsCutoffFromEvalue(0.01) | |
261 Function : From the supplied evalue, it scans through all of the evalues in the results | |
262 : and calulates the bits score. | |
263 Args : An evalue. | |
264 Returns : A bits score. If no evalue is specified, returns nothing | |
265 | |
266 =cut | |
267 | |
268 sub domainBitsCutoffFromEvalue { | |
269 my ($self, $eval) = @_; | |
270 my ($dom,$prev,@doms,$cutoff,$sep,$seen); | |
271 | |
272 unless(defined ($eval) ){ | |
273 warn "No evalue specified\n"; | |
274 return; | |
275 } | |
276 | |
277 | |
278 $seen = 0; | |
279 foreach $_ ( sort { $b->bits <=> $a->bits } @{$self->units}, @{$self->eachHMMSeq} ) { | |
280 if( $_->evalue > $eval ) { | |
281 $seen = 1; | |
282 $dom = $_; | |
283 last; | |
284 } | |
285 $prev = $_; | |
286 } | |
287 | |
288 if( ! defined $prev || $seen == 0) { | |
289 carp("Evalue is either above or below the list..."); | |
290 return undef; | |
291 } | |
292 | |
293 $sep = $prev->bits - $dom->bits ; | |
294 | |
295 if( $sep < 1 ) { | |
296 return $prev->bits(); | |
297 } | |
298 if( $dom->bits < 25 && $prev->bits > 25 ) { | |
299 return 25; | |
300 } | |
301 | |
302 return $dom->bits + sprintf("%.1f",$sep/2); | |
303 } | |
304 | |
305 | |
306 #------------------------------------------------------------------------------- | |
307 | |
308 =head2 lowestTrue | |
309 | |
310 Title : | |
311 Usage : | |
312 Function : | |
313 Args : | |
314 Returns : | |
315 | |
316 =cut | |
317 | |
318 sub lowestTrue { | |
319 my $self = shift; | |
320 | |
321 unless($self->domTC && $self->seqTC) { | |
322 unless($self->domThr and $self->seqThr){ | |
323 die "Could not define TC as I am missing a threshold\n"; | |
324 } | |
325 #Set it wildly high! | |
326 my ($lowSeq, $lowDom); | |
327 $lowSeq = $lowDom = 999999.99; | |
328 | |
329 foreach my $seqId (keys %{$self->seqs} ){ | |
330 if($self->seqs->{$seqId}->bits >= $self->seqThr){ | |
331 #Is this the lowest sequence thresh | |
332 if($self->seqs->{$seqId}->bits < $lowSeq){ | |
333 $lowSeq = $self->seqs->{$seqId}->bits; | |
334 } | |
335 #For each of the regions found on the sequence, look to see if the match is great | |
336 #than the domain threshold. If it is, is it lower than we we have seen previously | |
337 foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){ | |
338 if( $unit->bits() >= $self->domThr && $unit->bits() < $lowDom ) { | |
339 $lowDom = $unit->bits; | |
340 } | |
341 } | |
342 } | |
343 | |
344 } | |
345 $self->domTC($lowDom); | |
346 $self->seqTC($lowSeq); | |
347 } | |
348 return($self->seqTC, $self->domTC); | |
349 } | |
350 | |
351 #------------------------------------------------------------------------------- | |
352 | |
353 =head2 highestNoise | |
354 | |
355 Title : | |
356 Usage : | |
357 Function : | |
358 Args : | |
359 Returns : | |
360 | |
361 =cut | |
362 | |
363 sub highestNoise { | |
364 my $self = shift; | |
365 | |
366 #See if it is already set | |
367 unless($self->domNC && $self->seqNC) { | |
368 unless($self->domThr and $self->seqThr){ | |
369 die "Could not define TC as I am missing a threshold\n"; | |
370 } | |
371 | |
372 #Set it wildly low | |
373 my ($highSeq, $highDom); | |
374 $highSeq = $highDom = -999999.99; | |
375 | |
376 foreach my $seqId (keys %{$self->seqs} ){ | |
377 | |
378 if($self->seqs->{$seqId}->bits < $self->seqThr){ | |
379 #Is this the highest sequence thres below the cut-off | |
380 if($self->seqs->{$seqId}->bits > $highSeq){ | |
381 $highSeq = $self->seqs->{$seqId}->bits; | |
382 } | |
383 } | |
384 | |
385 #For each of the regions found on the sequence, look to see if the match is great | |
386 #than the domain threshold. If it is, is it lower than we we have seen previously | |
387 foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){ | |
388 if( $unit->bits < $self->domThr && $unit->bits > $highDom ) { | |
389 $highDom = $unit->bits; | |
390 } | |
391 } | |
392 } | |
393 $self->domNC($highDom); | |
394 $self->seqNC($highSeq); | |
395 } | |
396 | |
397 return($self->seqNC, $self->domNC); | |
398 } | |
399 | |
400 | |
401 sub applyEdits { | |
402 my ($self, $edits, $removeBadEd) = @_; | |
403 | |
404 my @validEd; #If removeBadEd flag is on, collect all the valid ED lines in this array and return at end of sub | |
405 foreach my $e (@$edits){ | |
406 #{ seq => $1, oldFrom => $2, oldTo => $3, newFrom => $5, newTo => $6 } | |
407 if($self->seqs->{$e->{seq}}){ | |
408 my $matched = 0; | |
409 foreach my $u (@{ $self->seqs->{ $e->{seq} }->hmmUnits }){ | |
410 if($u->envFrom == $e->{oldFrom} and $u->envTo == $e->{oldTo}) { | |
411 $matched = 1; #HMM unit found | |
412 | |
413 if(defined $e->{newFrom} and $e->{newTo}){ | |
414 | |
415 #Check co-ordinates of new start and end positions are in range | |
416 if( $e->{newFrom} < $u->{envFrom} or $e->{newTo} > $u->{envTo} or $e->{newFrom} > $e->{newTo}) { | |
417 if($removeBadEd) { | |
418 print "Removing ED line due to out of range co-ordinates: " . $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}. "\n"; | |
419 } | |
420 else { | |
421 warn $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}." contains out of range co-ordinates - bad ED line\n"; | |
422 } | |
423 last; | |
424 } | |
425 | |
426 #Modify the start end positions | |
427 $u->envFrom($e->{newFrom}); | |
428 $u->envTo($e->{newTo}); | |
429 | |
430 #Check that the ali-positions are still okay | |
431 if($u->seqFrom < $e->{newFrom}){ | |
432 $u->seqFrom($e->{newFrom}); | |
433 } | |
434 if($u->seqTo > $e->{newTo}){ | |
435 $u->seqTo($e->{newTo}); | |
436 } | |
437 }else{ | |
438 #Set the score so low it will never get in the align | |
439 $u->bits(-999999.99); | |
440 } | |
441 | |
442 push(@validEd, $e) if($removeBadEd); | |
443 last; | |
444 } | |
445 } | |
446 unless($matched){ #HMM unit not found - bad ED | |
447 if($removeBadEd) { | |
448 print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n"; | |
449 } | |
450 else { | |
451 warn $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}." does not appear in the list of hmm units - bad ED line\n"; | |
452 } | |
453 } | |
454 }else{ #Sequence not found - bad ED | |
455 if($removeBadEd) { | |
456 print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n"; | |
457 } | |
458 else { | |
459 warn $e->{seq}." does not appear in the list of hmm units - bad ED line\n"; | |
460 } | |
461 } | |
462 } | |
463 return(\@validEd) if($removeBadEd); | |
464 | |
465 } | |
466 | |
467 sub remove_overlaps_by_clan { | |
468 | |
469 my ($self, $clanmap, $nested) = @_; | |
470 | |
471 my $new = Bio::Pfam::HMM::HMMResults->new; | |
472 $new->seqName($self->seqName); | |
473 | |
474 foreach my $unit ( sort { $a->evalue <=> $b->evalue } @{ $self->units } ) { | |
475 | |
476 #check if it overlaps before adding | |
477 my $o; | |
478 | |
479 foreach my $u ( @{ $new->units } ) { | |
480 | |
481 if( exists($clanmap->{$unit->name}) and exists($clanmap->{$u->name}) and ($clanmap->{$unit->name} eq $clanmap->{$u->name}) ) { | |
482 if( overlap( $unit, $u ) ) { | |
483 if(exists($$nested{$unit->name}{$u->name})) { | |
484 next; | |
485 } | |
486 else { | |
487 $o=1; | |
488 last; | |
489 } | |
490 } | |
491 | |
492 } | |
493 } | |
494 unless($o) { | |
495 if(! $new->seqs->{$unit->name}) { | |
496 | |
497 $new->addHMMSeq( Bio::Pfam::HMM::HMMSequence->new( { name => $self->seqs->{$unit->name}->name, | |
498 desc => $self->seqs->{$unit->name}->desc, | |
499 bits => $self->seqs->{$unit->name}->bits, | |
500 evalue => $self->seqs->{$unit->name}->evalue, | |
501 numberHits => $self->seqs->{$unit->name}->numberHits}) ); | |
502 | |
503 } | |
504 $new->addHMMUnit($unit); | |
505 } | |
506 | |
507 } | |
508 return $new; | |
509 } | |
510 | |
511 | |
512 | |
513 sub overlap { | |
514 # does unit1 overlap with unit2? | |
515 my $unit1 = shift; | |
516 my $unit2 = shift; | |
517 my( $u1, $u2 ) = sort { $a->seqFrom <=> $b->seqFrom } ( $unit1, $unit2 ); | |
518 | |
519 | |
520 if( $u2->seqFrom <= $u1->seqTo ) { | |
521 return 1; | |
522 } | |
523 | |
524 return 0; | |
525 } | |
526 | |
527 | |
528 sub results { | |
529 my ( $self, $pfamScanData, $e_value ) = @_; | |
530 | |
531 my @results = (); | |
532 foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $self->units } ) { | |
533 | |
534 my $pfamB = $unit->name =~ /^Pfam-B/; | |
535 | |
536 #Filter results based on thresholds | |
537 if ( $unit->name =~ /^Pfam-B/ ) { | |
538 next unless ( $self->seqs->{$unit->name}->evalue <= 0.001 and $unit->evalue <= 0.001 ); | |
539 $pfamB = 1; | |
540 } | |
541 else { | |
542 if ( $e_value ) { | |
543 next unless ( $self->seqs->{$unit->name}->evalue <= $e_value and $unit->evalue <= $e_value ) ; | |
544 } | |
545 else { | |
546 next unless $unit->sig; | |
547 } | |
548 } | |
549 | |
550 push @results, { | |
551 seq => { from => $unit->seqFrom, | |
552 to => $unit->seqTo, | |
553 name => $self->seqName }, | |
554 env => { from => $unit->envFrom, | |
555 to => $unit->envTo }, | |
556 | |
557 hmm => { from => $unit->hmmFrom, | |
558 to => $unit->hmmTo }, | |
559 | |
560 model_length => $pfamScanData->{_model_len}->{ $unit->name }, | |
561 bits => $unit->bits, | |
562 evalue => $unit->evalue, | |
563 acc => $pfamScanData->{_accmap}->{ $unit->name }, | |
564 name => $unit->name, | |
565 desc => $pfamScanData->{_desc}->{ $unit->name }, | |
566 type => $pfamB ? undef : $pfamScanData->{_type}->{ $unit->name }, | |
567 clan => $pfamB ? undef : | |
568 $pfamScanData->{_clanmap}->{ $unit->name } || 'No_clan', | |
569 | |
570 act_site => $pfamB ? undef : $unit->{act_site}, | |
571 sig => $pfamB ? "NA" : $unit->sig, | |
572 align => [ sprintf( '#HMM %s', $unit->hmmalign->{hmm} ), | |
573 sprintf( '#MATCH %s', $unit->hmmalign->{match} ), | |
574 sprintf( '#PP %s', $unit->hmmalign->{pp} ), | |
575 sprintf( '#SEQ %s', $unit->hmmalign->{seq} ) ] | |
576 }; | |
577 } | |
578 | |
579 return \@results; | |
580 } | |
581 | |
582 1; |