| 0 | 1 | 
|  | 2 =head1 NAME | 
|  | 3 | 
|  | 4 Bio::Pfam::Scan::PfamScan | 
|  | 5 | 
|  | 6 =cut | 
|  | 7 | 
|  | 8 package Bio::Pfam::Scan::PfamScan; | 
|  | 9 | 
|  | 10 =head1 SYNOPSIS | 
|  | 11 | 
|  | 12   my $ps = Bio::Pfam::Scan::PfamScan->new( | 
|  | 13              -cut_off => $hmmscan_cut_off, | 
|  | 14              -dir => $dir, | 
|  | 15              -clan_overlap => $clan_overlap, | 
|  | 16              -fasta => $fasta, | 
|  | 17              -align => $align, | 
|  | 18              -as => $as | 
|  | 19            ); | 
|  | 20 | 
|  | 21   $ps->search; | 
|  | 22   $ps->write_results; | 
|  | 23 | 
|  | 24 =head1 DESCRIPTION | 
|  | 25 | 
|  | 26 $Id: PfamScan.pm,v 1.4 2010-01-12 09:41:42 jm14 Exp $ | 
|  | 27 | 
|  | 28 =cut | 
|  | 29 | 
|  | 30 use strict; | 
|  | 31 use warnings; | 
|  | 32 | 
|  | 33 use Bio::Pfam::HMM::HMMResultsIO; | 
|  | 34 use Bio::Pfam::Active_site::as_search; | 
|  | 35 use Bio::SimpleAlign; | 
|  | 36 use Bio::Pfam::Scan::Seq; | 
|  | 37 | 
|  | 38 use Carp; | 
|  | 39 use IPC::Run qw( start finish ); | 
|  | 40 | 
|  | 41 #------------------------------------------------------------------------------- | 
|  | 42 #- constructor ----------------------------------------------------------------- | 
|  | 43 #------------------------------------------------------------------------------- | 
|  | 44 | 
|  | 45 =head1 METHODS | 
|  | 46 | 
|  | 47 =head2 new | 
|  | 48 | 
|  | 49 The only constructor for the object. Accepts a set of arguments that specify | 
|  | 50 the parameters for the search: | 
|  | 51 | 
|  | 52 =over | 
|  | 53 | 
|  | 54 =item -cut_off | 
|  | 55 | 
|  | 56 =item -dir | 
|  | 57 | 
|  | 58 =item -clan_overlap | 
|  | 59 | 
|  | 60 =item -fasta | 
|  | 61 | 
|  | 62 =item -sequence | 
|  | 63 | 
|  | 64 =item -align | 
|  | 65 | 
|  | 66 =item -hmm | 
|  | 67 | 
|  | 68 =item -as | 
|  | 69 | 
|  | 70 =back | 
|  | 71 | 
|  | 72 =cut | 
|  | 73 | 
|  | 74 sub new { | 
|  | 75   my ( $class, @args ) = @_; | 
|  | 76 | 
|  | 77   my $self = {}; | 
|  | 78   bless $self, $class; | 
|  | 79 | 
|  | 80   # To avoid hard coding the location for the binary, we assume it will be on the path..... | 
|  | 81   $self->{_HMMSCAN} = 'hmmscan'; | 
|  | 82 | 
|  | 83   # handle arguments, if we were given any here | 
|  | 84   $self->_process_args(@args) if @args; | 
|  | 85 | 
|  | 86   return $self; | 
|  | 87 } | 
|  | 88 | 
|  | 89 #------------------------------------------------------------------------------- | 
|  | 90 #- public methods -------------------------------------------------------------- | 
|  | 91 #------------------------------------------------------------------------------- | 
|  | 92 | 
|  | 93 =head2 search | 
|  | 94 | 
|  | 95 The main method on the object. Performs a C<hmmscan> search using the supplied | 
|  | 96 sequence and the specified HMM library. | 
|  | 97 | 
|  | 98 =cut | 
|  | 99 | 
|  | 100 sub search { | 
|  | 101   my ( $self, @args ) = @_; | 
|  | 102 | 
|  | 103   # handle the arguments, if we were handed any here | 
|  | 104   $self->_process_args(@args) if @args; | 
|  | 105 | 
|  | 106   # set up the output header | 
|  | 107   $self->_build_header; | 
|  | 108 | 
|  | 109   croak qq(FATAL: no sequence given; set the search parameters before calling "search") | 
|  | 110     unless defined $self->{_sequence}; | 
|  | 111 | 
|  | 112   my ( %AllResults, $pfamB, $firstResult ); | 
|  | 113 | 
|  | 114   foreach my $hmmlib ( @{ $self->{_hmmlib} } ) { | 
|  | 115 | 
|  | 116     my ( @hmmscan_cut_off, $seq_evalue, $dom_evalue ); | 
|  | 117     if ( $hmmlib !~ /Pfam\-B/ ) { | 
|  | 118       @hmmscan_cut_off = @{ $self->{_hmmscan_cutoff} }; | 
|  | 119     } | 
|  | 120     else { | 
|  | 121       $pfamB      = 1; | 
|  | 122       $seq_evalue = 0.001; | 
|  | 123       $dom_evalue = 0.001; | 
|  | 124 | 
|  | 125       # It's a pfamB search so use some default cut off values | 
|  | 126       push @hmmscan_cut_off, '-E', $seq_evalue, '--domE', $dom_evalue; | 
|  | 127     } | 
|  | 128 | 
|  | 129     push @{ $self->{_header} }, | 
|  | 130       "#     cpu number specified: " . $self->{_cpu} . "\n" | 
|  | 131       if ( $hmmlib !~ /Pfam\-B/ and $self->{_cpu} ); | 
|  | 132 | 
|  | 133     push @{ $self->{_header} }, | 
|  | 134       "#        searching against: " | 
|  | 135       . $self->{_dir} | 
|  | 136       . "/$hmmlib, with cut off " | 
|  | 137       . join( " ", @hmmscan_cut_off ) . "\n"; | 
|  | 138     my @params; | 
|  | 139     if ( $self->{_cpu} ) { | 
|  | 140       @params = ( | 
|  | 141         'hmmscan', '--notextw', '--cpu', $self->{_cpu}, @hmmscan_cut_off, | 
|  | 142         $self->{_dir} . '/' . $hmmlib, | 
|  | 143         $self->{_fasta} | 
|  | 144       ); | 
|  | 145     } | 
|  | 146     else { | 
|  | 147       @params = ( | 
|  | 148         'hmmscan', '--notextw', @hmmscan_cut_off, $self->{_dir} . '/' . $hmmlib, | 
|  | 149         $self->{_fasta} | 
|  | 150       ); | 
|  | 151 | 
|  | 152     } | 
|  | 153 | 
|  | 154     print STDERR "PfamScan::search: hmmscan command: |@params|\n" | 
|  | 155       if $ENV{DEBUG}; | 
|  | 156     print STDERR 'PfamScan::search: sequence: |' . $self->{_sequence} . "|\n" | 
|  | 157       if $ENV{DEBUG}; | 
|  | 158 | 
|  | 159     my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR | 
|  | 160       or croak qq(FATAL: error running hmmscan; IPC::Run returned '$?'); | 
|  | 161 | 
|  | 162     # print IN $self->{_sequence}; ; | 
|  | 163     close IN; | 
|  | 164 | 
|  | 165     $self->{_hmmresultIO} = Bio::Pfam::HMM::HMMResultsIO->new; | 
|  | 166     $self->{_all_results} = $self->{_hmmresultIO}->parseMultiHMMER3( \*OUT ); | 
|  | 167     close OUT; | 
|  | 168 | 
|  | 169     my $err; | 
|  | 170     while (<ERR>) { | 
|  | 171       $err .= $_; | 
|  | 172     } | 
|  | 173     close ERR; | 
|  | 174 | 
|  | 175     finish $run | 
|  | 176       or croak qq|FATAL: error running hmmscan ($err); ipc returned '$?'|; | 
|  | 177 | 
|  | 178     unless ( $hmmlib =~ /Pfam\-B/ ) { | 
|  | 179 | 
|  | 180       if ( $self->{_clan_overlap} ) { | 
|  | 181         push( @{ $self->{_header} }, "#    resolve clan overlaps: off\n" ); | 
|  | 182       } | 
|  | 183       else { | 
|  | 184         push( @{ $self->{_header} }, "#    resolve clan overlaps: on\n" ); | 
|  | 185         $self->_resolve_clan_overlap; | 
|  | 186       } | 
|  | 187 | 
|  | 188       if ( $self->{_as} ) { | 
|  | 189         push( @{ $self->{_header} }, "#     predict active sites: on\n" ); | 
|  | 190         $self->_pred_act_sites; | 
|  | 191       } | 
|  | 192       else { | 
|  | 193         push( @{ $self->{_header} }, "#     predict active sites: off\n" ); | 
|  | 194       } | 
|  | 195 | 
|  | 196       if ( $self->{_translate} ) { | 
|  | 197         push @{ $self->{_header} },  "#   translate DNA sequence: " . $self->{_translate} . "\n"; | 
|  | 198       } | 
|  | 199     } | 
|  | 200 | 
|  | 201     # Determine which hits are significant | 
|  | 202     foreach my $result ( @{ $self->{_all_results} } ) { | 
|  | 203       foreach | 
|  | 204         my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $result->units } ) | 
|  | 205       { | 
|  | 206 | 
|  | 207         unless ($pfamB) { | 
|  | 208 | 
|  | 209           $unit->sig(0); | 
|  | 210           if ( $result->seqs->{ $unit->name }->bits >= | 
|  | 211             $self->{_seqGA}->{ $unit->name } ) | 
|  | 212           { | 
|  | 213             if ( $unit->bits >= $self->{_domGA}->{ $unit->name } ) { | 
|  | 214               $unit->sig(1); | 
|  | 215             } | 
|  | 216           } | 
|  | 217         } | 
|  | 218       } | 
|  | 219     } | 
|  | 220 | 
|  | 221     if ($firstResult) { | 
|  | 222       $AllResults{ $self->{_all_results} } = $self->{_all_results}; | 
|  | 223     } | 
|  | 224     else { | 
|  | 225       $firstResult = $self->{_all_results}; | 
|  | 226     } | 
|  | 227 | 
|  | 228   }    # end of "foreach $hmmlib" | 
|  | 229 | 
|  | 230   # If more than one search, merge results into one object | 
|  | 231   if ( keys %AllResults ) { | 
|  | 232 | 
|  | 233     foreach my $AllResult ( keys %AllResults ) { | 
|  | 234 | 
|  | 235       foreach my $seq_id ( keys %{ $self->{_seq_hash} } ) { | 
|  | 236 | 
|  | 237         my $flag; | 
|  | 238 | 
|  | 239         #If seq exists in both, add all units from $AllResult to $firstResult | 
|  | 240         foreach my $result ( @{$firstResult} ) { | 
|  | 241 | 
|  | 242           if ( $result->seqName eq $seq_id ) { | 
|  | 243             $flag = 1; | 
|  | 244 | 
|  | 245             foreach my $result2 ( @{ $AllResults{$AllResult} } ) { | 
|  | 246 | 
|  | 247               if ( $result2->seqName eq $seq_id ) { | 
|  | 248                 foreach my $hmmname ( keys %{ $result2->seqs } ) { | 
|  | 249                   $result->addHMMSeq( $result2->seqs->{$hmmname} ); | 
|  | 250                 } | 
|  | 251                 foreach my $unit ( @{ $result2->units } ) { | 
|  | 252                   $result->addHMMUnit($unit); | 
|  | 253                 } | 
|  | 254               } | 
|  | 255             } | 
|  | 256           } | 
|  | 257         } | 
|  | 258 | 
|  | 259         #If seq doesn't exist in $firstResult, need to add both sequence and units to $firstResult | 
|  | 260         unless ($flag) { | 
|  | 261           foreach my $result2 ( @{ $AllResults{$AllResult} } ) { | 
|  | 262             if ( $result2->seqName eq $seq_id ) { | 
|  | 263               push @{$firstResult}, $result2; | 
|  | 264             } | 
|  | 265           } | 
|  | 266         } | 
|  | 267       } | 
|  | 268     } | 
|  | 269     $self->{_all_results} = $firstResult; | 
|  | 270 | 
|  | 271   }    # end of "if keys %AllResults" | 
|  | 272 | 
|  | 273   push @{ $self->{_header} }, "# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =\n#\n"; | 
|  | 274 | 
|  | 275   if ( $self->{_as} ) { | 
|  | 276     push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>"; | 
|  | 277   } | 
|  | 278   else { | 
|  | 279     push @{ $self->{_header} }, "# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>"; | 
|  | 280   } | 
|  | 281 | 
|  | 282   if ( $self->{_translate} ) { | 
|  | 283     push @{ $self->{_header} }, " <strand> <nt start> <nt end>"; | 
|  | 284   } | 
|  | 285   push @{ $self->{_header} }, "\n"; | 
|  | 286 } | 
|  | 287 | 
|  | 288 #------------------------------------------------------------------------------- | 
|  | 289 | 
|  | 290 =head2 write_results | 
|  | 291 | 
|  | 292 Writes the results of the C<hmmscan> search. Takes a single argument, which can | 
|  | 293 be an open filehandle or a filename. A fatal error is generated if a file of the | 
|  | 294 given name already exists. | 
|  | 295 | 
|  | 296 =cut | 
|  | 297 | 
|  | 298 sub write_results { | 
|  | 299   my ( $self, $out, $e_seq, $e_dom, $b_seq, $b_dom ) = @_; | 
|  | 300 | 
|  | 301   my $fh; | 
|  | 302 | 
|  | 303   if ( ref $out eq 'GLOB' ) { | 
|  | 304 | 
|  | 305     # we were handed a filehandle | 
|  | 306     $fh = $out; | 
|  | 307   } | 
|  | 308   elsif ( $out and not ref $out ) { | 
|  | 309 | 
|  | 310     # we were handed a filename | 
|  | 311     croak qq(FATAL: output file "$out" already exists) if -f $out; | 
|  | 312 | 
|  | 313     open( FH, ">$out" ) | 
|  | 314       or croak qq(FATAL: Can\'t write to your output file "$out": $!); | 
|  | 315     $fh = \*FH; | 
|  | 316   } | 
|  | 317   else { | 
|  | 318 | 
|  | 319     # neither filehandle nor filename, default to STDOUT | 
|  | 320     $fh = \*STDOUT; | 
|  | 321   } | 
|  | 322 | 
|  | 323   if ( $self->{_header} ) { | 
|  | 324     my $header = join '', @{ $self->{_header} }; | 
|  | 325     print $fh "$header\n"; | 
|  | 326   } | 
|  | 327 | 
|  | 328   foreach my $result ( @{ $self->{_all_results} } ) { | 
|  | 329     $self->{_hmmresultIO} | 
|  | 330       ->write_ascii_out( $result, $fh, $self, $e_seq, $e_dom, $b_seq, $b_dom ); | 
|  | 331   } | 
|  | 332   close $fh; | 
|  | 333 } | 
|  | 334 | 
|  | 335 #------------------------------------------------------------------------------- | 
|  | 336 | 
|  | 337 =head2 results | 
|  | 338 | 
|  | 339 Returns the search results. | 
|  | 340 | 
|  | 341 =cut | 
|  | 342 | 
|  | 343 sub results { | 
|  | 344   my ( $self, $e_value ) = @_; | 
|  | 345 | 
|  | 346   unless ( defined $self->{_all_results} ) { | 
|  | 347     carp "WARNING: call search() before trying to retrieve results"; | 
|  | 348     return; | 
|  | 349   } | 
|  | 350 | 
|  | 351   my @search_results = (); | 
|  | 352 | 
|  | 353   foreach my $hmm_result ( @{ $self->{_all_results} } ) { | 
|  | 354     push @search_results, @{ $hmm_result->results( $self, $e_value ) }; | 
|  | 355   } | 
|  | 356 | 
|  | 357   return \@search_results; | 
|  | 358 } | 
|  | 359 | 
|  | 360 #------------------------------------------------------------------------------- | 
|  | 361 #- private methods ------------------------------------------------------------- | 
|  | 362 #------------------------------------------------------------------------------- | 
|  | 363 | 
|  | 364 =head1 PRIVATE METHODS | 
|  | 365 | 
|  | 366 =head2 _process_args | 
|  | 367 | 
|  | 368 Handles the input arguments. | 
|  | 369 | 
|  | 370 =cut | 
|  | 371 | 
|  | 372 sub _process_args { | 
|  | 373   my ( $self, @args ) = @_; | 
|  | 374 | 
|  | 375   # accept both a hash and a hash ref | 
|  | 376   my $args = ( ref $args[0] eq 'HASH' ) ? shift @args : {@args}; | 
|  | 377 | 
|  | 378   # make sure we get a sequence | 
|  | 379   if ( $args->{-fasta} and $args->{-sequence} ) { | 
|  | 380     croak qq(FATAL: "-fasta" and "-sequence" are mutually exclusive); | 
|  | 381   } | 
|  | 382   elsif ( $args->{-fasta} ) { | 
|  | 383     croak qq(FATAL: fasta file "$args->{-fasta}" doesn\'t exist) | 
|  | 384       unless -s $args->{-fasta}; | 
|  | 385   } | 
|  | 386   elsif ( $args->{-sequence} ) { | 
|  | 387     croak qq(FATAL: no sequence given) | 
|  | 388       unless length( $args->{-sequence} ); | 
|  | 389   } | 
|  | 390   else { | 
|  | 391     croak qq(FATAL: must specify either "-fasta" or "-sequence"); | 
|  | 392   } | 
|  | 393 | 
|  | 394   # check the cut off | 
|  | 395   if ( ( $args->{-e_seq} and ( $args->{-b_seq} || $args->{-b_dom} ) ) | 
|  | 396     or ( $args->{-b_seq} and ( $args->{-e_seq} || $args->{-e_dom} ) ) | 
|  | 397     or ( $args->{-b_dom} and $args->{-e_dom} ) ) | 
|  | 398   { | 
|  | 399     croak qq(FATAL: can\'t use e value and bit score threshold together); | 
|  | 400   } | 
|  | 401 | 
|  | 402   $self->{_hmmscan_cutoff} = (); | 
|  | 403   if ( $args->{-e_seq} ) { | 
|  | 404     croak qq(FATAL: the E-value sequence cut-off "$args->{-e_seq}" must be a positive non-zero number) | 
|  | 405       unless $args->{-e_seq} > 0; | 
|  | 406 | 
|  | 407     push @{ $self->{_hmmscan_cutoff} }, '-E', $args->{-e_seq}; | 
|  | 408   } | 
|  | 409 | 
|  | 410   if ( $args->{-e_dom} ) { | 
|  | 411     croak q(FATAL: if you supply "-e_dom" you must also supply "-e_seq") | 
|  | 412       unless $args->{-e_seq}; | 
|  | 413 | 
|  | 414     croak qq(FATAL: the E-value domain cut-off "$args->{-e_dom}" must be positive non-zero number) | 
|  | 415       unless $args->{-e_dom} > 0; | 
|  | 416 | 
|  | 417     push @{ $self->{_hmmscan_cutoff} }, '--domE', $args->{-e_dom}; | 
|  | 418   } | 
|  | 419 | 
|  | 420   if ( $args->{-b_seq} ) { | 
|  | 421     push @{ $self->{_hmmscan_cutoff} }, '-T', $args->{-b_seq}; | 
|  | 422   } | 
|  | 423 | 
|  | 424   if ( $args->{-b_dom} ) { | 
|  | 425     croak q(FATAL: if you supply "-b_dom" you must also supply "-b_seq") | 
|  | 426       unless $args->{-b_seq}; | 
|  | 427 | 
|  | 428     push @{ $self->{_hmmscan_cutoff} }, '--domT', $args->{-b_dom}; | 
|  | 429   } | 
|  | 430 | 
|  | 431   unless ( $self->{_hmmscan_cutoff} ) { | 
|  | 432     push @{ $self->{_hmmscan_cutoff} }, '--cut_ga'; | 
|  | 433   } | 
|  | 434 | 
|  | 435   # make sure we have a valid directory for the HMM data files | 
|  | 436   croak qq(FATAL: directory "$args->{-dir}" does not exist) | 
|  | 437     unless -d $args->{-dir}; | 
|  | 438 | 
|  | 439   # populate the object | 
|  | 440   $self->{_cut_off}      = $args->{-cut_off}; | 
|  | 441   $self->{_dir}          = $args->{-dir}; | 
|  | 442   $self->{_clan_overlap} = $args->{-clan_overlap}; | 
|  | 443   $self->{_fasta}        = $args->{-fasta}; | 
|  | 444   $self->{_align}        = $args->{-align}; | 
|  | 445   $self->{_as}           = $args->{-as}; | 
|  | 446   $self->{_sequence}     = $args->{-sequence}; | 
|  | 447   $self->{_cpu}          = $args->{-cpu}; | 
|  | 448   $self->{_translate}    = $args->{-translate}; | 
|  | 449 | 
|  | 450   $self->{_hmmlib} = []; | 
|  | 451   if ( $args->{-hmmlib} ) { | 
|  | 452     if ( ref $args->{-hmmlib} eq 'ARRAY' ) { | 
|  | 453       push @{ $self->{_hmmlib} }, @{ $args->{-hmmlib} }; | 
|  | 454     } | 
|  | 455     else { | 
|  | 456       push @{ $self->{_hmmlib} }, $args->{-hmmlib}; | 
|  | 457     } | 
|  | 458   } | 
|  | 459   else { | 
|  | 460     push @{ $self->{_hmmlib} }, "Pfam-A.hmm"; | 
|  | 461   } | 
|  | 462 | 
|  | 463   # Now check that the library exists in the data dir! | 
|  | 464   foreach my $hmmlib ( @{ $self->{_hmmlib} } ) { | 
|  | 465 | 
|  | 466     croak qq(FATAL: can't find $hmmlib and/or $hmmlib binaries in "$args->{-dir}") | 
|  | 467       unless ( | 
|  | 468       -s $self->{_dir}, | 
|  | 469       "/$hmmlib" | 
|  | 470       and -s $self->{_dir} . "/$hmmlib.h3f" | 
|  | 471       and -s $self->{_dir} . "/$hmmlib.h3i" | 
|  | 472       and -s $self->{_dir} . "/$hmmlib.h3m" | 
|  | 473       and -s $self->{_dir} . "/$hmmlib.h3p" | 
|  | 474       and -s $self->{_dir} . "/$hmmlib.dat" | 
|  | 475       ); | 
|  | 476 | 
|  | 477     # read the necessary data, if it's not been read already | 
|  | 478     $self->_read_pfam_data; | 
|  | 479   } | 
|  | 480 | 
|  | 481   $self->{_max_seqname} = 0; | 
|  | 482 | 
|  | 483   # if there's nothing in "_sequence" try to load a fasta file | 
|  | 484   $self->_read_fasta | 
|  | 485     unless $self->{_sequence}; | 
|  | 486 | 
|  | 487   # check again for a sequence. If we don't have one at this point, bail with | 
|  | 488   # an error | 
|  | 489   croak qq(FATAL: no sequence given) | 
|  | 490     unless $self->{_sequence}; | 
|  | 491 | 
|  | 492   # read fasta file, store maximum sequence name and store sequences for active | 
|  | 493   # sites prediction | 
|  | 494   $self->_parse_sequence | 
|  | 495     unless $self->{_max_seqname}; | 
|  | 496 | 
|  | 497   if ( $self->{_as} ) { | 
|  | 498     $self->_parse_act_site_data | 
|  | 499       unless $self->{_read_read_act_site_data}; | 
|  | 500   } | 
|  | 501 | 
|  | 502   if ( $self->{_translate} ) { | 
|  | 503     $self->_translate_fasta; | 
|  | 504   } | 
|  | 505 | 
|  | 506   # see if a version number was specified | 
|  | 507   $self->{_version} = $args->{version}; | 
|  | 508 | 
|  | 509 } | 
|  | 510 | 
|  | 511 #------------------------------------------------------------------------------- | 
|  | 512 | 
|  | 513 =head2 _build_header | 
|  | 514 | 
|  | 515 Adds version to the header object | 
|  | 516 | 
|  | 517 =cut | 
|  | 518 | 
|  | 519 sub _build_header { | 
|  | 520   my ( $self, $version ) = @_; | 
|  | 521 | 
|  | 522   unshift @{ $self->{_header} }, | 
|  | 523     '#      query sequence file: ' . $self->{_fasta} . "\n"; | 
|  | 524 | 
|  | 525   unshift @{ $self->{_header} }, <<EOF_license; | 
|  | 526 # Copyright (c) 2009 Genome Research Ltd\n# Freely distributed under the GNU | 
|  | 527 # General Public License | 
|  | 528 # | 
|  | 529 # Authors: Jaina Mistry (jaina\@ebi.ac.uk), | 
|  | 530 #          Rob Finn (rdf\@ebi.ac.uk) | 
|  | 531 # | 
|  | 532 # This is free software; you can redistribute it and/or modify it under | 
|  | 533 # the terms of the GNU General Public License as published by the Free Software | 
|  | 534 # Foundation; either version 2 of the License, or (at your option) any later version. | 
|  | 535 # This program is distributed in the hope that it will be useful, but WITHOUT | 
|  | 536 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | 537 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | 
|  | 538 # details. | 
|  | 539 # | 
|  | 540 # You should have received a copy of the GNU General Public License along with | 
|  | 541 # this program. If not, see <http://www.gnu.org/licenses/>. | 
|  | 542 # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | 
|  | 543 EOF_license | 
|  | 544 | 
|  | 545   my $v = | 
|  | 546     ( defined $self->{_version} ) | 
|  | 547     ? "version $version, " | 
|  | 548     : ''; | 
|  | 549 | 
|  | 550   unshift @{ $self->{_header} }, | 
|  | 551     "# pfam_scan.pl, $v run at " . scalar(localtime) . "\n#\n"; | 
|  | 552 } | 
|  | 553 | 
|  | 554 #------------------------------------------------------------------------------- | 
|  | 555 | 
|  | 556 =head2 _read_fasta | 
|  | 557 | 
|  | 558 Reads a sequence from the fasta-format file that was specified in the | 
|  | 559 parameters. | 
|  | 560 | 
|  | 561 =cut | 
|  | 562 | 
|  | 563 sub _read_fasta { | 
|  | 564   my $self = shift; | 
|  | 565 | 
|  | 566   open( FASTA, $self->{_fasta} ) | 
|  | 567     or croak qq(FATAL: Couldn't open fasta file "$self->{_fasta}" $!\n); | 
|  | 568   my @rows = <FASTA>; | 
|  | 569   close FASTA; | 
|  | 570 | 
|  | 571   $self->{_sequence_rows} = \@rows; | 
|  | 572 | 
|  | 573   $self->{_sequence} = join '', @rows; | 
|  | 574 } | 
|  | 575 | 
|  | 576 #------------------------------------------------------------------------------- | 
|  | 577 | 
|  | 578 =head2 _resolve_clan_overlap | 
|  | 579 | 
|  | 580 Resolves overlaps between clans. | 
|  | 581 | 
|  | 582 =cut | 
|  | 583 | 
|  | 584 sub _resolve_clan_overlap { | 
|  | 585   my $self = shift; | 
|  | 586 | 
|  | 587   my @no_clan_overlap = (); | 
|  | 588   foreach my $result ( @{ $self->{_all_results} } ) { | 
|  | 589     my $new = | 
|  | 590       $result->remove_overlaps_by_clan( $self->{_clanmap}, $self->{_nested} ); | 
|  | 591 | 
|  | 592     push @no_clan_overlap, $new; | 
|  | 593   } | 
|  | 594 | 
|  | 595   $self->{_all_results} = \@no_clan_overlap; | 
|  | 596 } | 
|  | 597 | 
|  | 598 #------------------------------------------------------------------------------- | 
|  | 599 | 
|  | 600 =head2 _pred_act_sites | 
|  | 601 | 
|  | 602 Predicts active sites. Takes no arguments. Populates the "act_site" field on | 
|  | 603 each results object. | 
|  | 604 | 
|  | 605 =cut | 
|  | 606 | 
|  | 607 sub _pred_act_sites { | 
|  | 608   my $self = shift; | 
|  | 609 | 
|  | 610   # print STDERR "predicting active sites...\n"; | 
|  | 611 | 
|  | 612   my $hmm_file = $self->{_dir} . '/Pfam-A.hmm'; | 
|  | 613 | 
|  | 614 RESULT: foreach my $result ( @{ $self->{_all_results} } ) { | 
|  | 615 | 
|  | 616     # print STDERR "result: |" . $result->seqName . "|\n"; | 
|  | 617 | 
|  | 618   UNIT: foreach my $unit ( @{ $result->units } ) { | 
|  | 619 | 
|  | 620       # print STDERR "family: |" . $unit->name . "|\n"; | 
|  | 621 | 
|  | 622       next UNIT | 
|  | 623         unless ( $self->{_act_site_data}->{ $unit->name }->{'alignment'} ); | 
|  | 624 | 
|  | 625       my $seq_region = substr( | 
|  | 626         $self->{_seq_hash}->{ $result->seqName }, | 
|  | 627         $unit->seqFrom - 1, | 
|  | 628         $unit->seqTo - $unit->seqFrom + 1 | 
|  | 629       ); | 
|  | 630 | 
|  | 631       my $seq_se = $unit->seqFrom . '-' . $unit->seqTo; | 
|  | 632 | 
|  | 633       # print STDERR "seq_id:     |" . $result->seqName . "|\n"; | 
|  | 634       # print STDERR "seq_se:     |" . $seq_se . "|\n"; | 
|  | 635       # print STDERR "seq_region: |" . $seq_region . "|\n"; | 
|  | 636       # print STDERR "family:     |" . $unit->name . "|\n"; | 
|  | 637       # print STDERR "hmm_file:   |" . $hmm_file . "|\n"; | 
|  | 638       # print STDERR "dir:        |" . $self->{_dir} . "|\n"; | 
|  | 639 | 
|  | 640       $unit->{act_site} = Bio::Pfam::Active_site::as_search::find_as( | 
|  | 641         $self->{_act_site_data}->{ $unit->name }->{'alignment'}, | 
|  | 642         $self->{_act_site_data}->{ $unit->name }->{'residues'}, | 
|  | 643         $result->seqName, | 
|  | 644         $seq_se, | 
|  | 645         $seq_region, | 
|  | 646         $unit->name, | 
|  | 647         $hmm_file | 
|  | 648       ); | 
|  | 649     } | 
|  | 650   } | 
|  | 651 } | 
|  | 652 | 
|  | 653 #------------------------------------------------------------------------------- | 
|  | 654 | 
|  | 655 =head2 _read_pfam_data | 
|  | 656 | 
|  | 657 Reads the Pfam data file ("Pfam-A.scan.dat") and populates the C<accmap>, | 
|  | 658 C<nested> and C<clanmap> hashes on the object. | 
|  | 659 | 
|  | 660 =cut | 
|  | 661 | 
|  | 662 sub _read_pfam_data { | 
|  | 663   my $self = shift; | 
|  | 664 | 
|  | 665   #print STDERR "reading " . $self->{_hmmlib} . ".dat\n" if($ENV{DEBUG}); | 
|  | 666   $self->{_accmap}    = {}; | 
|  | 667   $self->{_nested}    = {}; | 
|  | 668   $self->{_clanmap}   = {}; | 
|  | 669   $self->{_desc}      = {}; | 
|  | 670   $self->{_seqGA}     = {}; | 
|  | 671   $self->{_domGA}     = {}; | 
|  | 672   $self->{_type}      = {}; | 
|  | 673   $self->{_model_len} = {}; | 
|  | 674 | 
|  | 675   foreach my $hmmlib ( @{ $self->{_hmmlib} } ) { | 
|  | 676     my $scandat = $self->{_dir} . '/' . $hmmlib . '.dat'; | 
|  | 677     open( SCANDAT, $scandat ) | 
|  | 678       or croak qq(FATAL: Couldn't open "$scandat" data file: $!); | 
|  | 679     my $id; | 
|  | 680     while (<SCANDAT>) { | 
|  | 681       if (m/^\#=GF ID\s+(\S+)/) { | 
|  | 682         $id = $1; | 
|  | 683       } | 
|  | 684       elsif (m/^\#=GF\s+AC\s+(\S+)/) { | 
|  | 685         $self->{_accmap}->{$id} = $1; | 
|  | 686       } | 
|  | 687       elsif (m/^\#=GF\s+DE\s+(.+)/) { | 
|  | 688         $self->{_desc}->{$id} = $1; | 
|  | 689       } | 
|  | 690       elsif (m/^\#=GF\s+GA\s+(\S+)\;\s+(\S+)\;/) { | 
|  | 691         $self->{_seqGA}->{$id} = $1; | 
|  | 692         $self->{_domGA}->{$id} = $2; | 
|  | 693       } | 
|  | 694       elsif (m/^\#=GF\s+TP\s+(\S+)/) { | 
|  | 695         $self->{_type}->{$id} = $1; | 
|  | 696       } | 
|  | 697       elsif (m/^\#=GF\s+ML\s+(\d+)/) { | 
|  | 698         $self->{_model_len}->{$id} = $1; | 
|  | 699       } | 
|  | 700       elsif (/^\#=GF\s+NE\s+(\S+)/) { | 
|  | 701         $self->{_nested}->{$id}->{$1} = 1; | 
|  | 702         $self->{_nested}->{$1}->{$id} = 1; | 
|  | 703       } | 
|  | 704       elsif (/^\#=GF\s+CL\s+(\S+)/) { | 
|  | 705         $self->{_clanmap}->{$id} = $1; | 
|  | 706       } | 
|  | 707     } | 
|  | 708 | 
|  | 709     close SCANDAT; | 
|  | 710 | 
|  | 711     # set a flag to show that we've read the data files already | 
|  | 712     $self->{ '_read_' . $hmmlib } = 1; | 
|  | 713   } | 
|  | 714 | 
|  | 715 } | 
|  | 716 | 
|  | 717 #------------------------------------------------------------------------------- | 
|  | 718 | 
|  | 719 =head2 _read_act_site_data | 
|  | 720 | 
|  | 721 Reads the Pfam active site data file ("active_site.dat") and populates | 
|  | 722 the C<act_site_data> hashes on the object. | 
|  | 723 | 
|  | 724 =cut | 
|  | 725 | 
|  | 726 sub _parse_act_site_data { | 
|  | 727   my $self   = shift; | 
|  | 728   my $as_dat = $self->{_dir} . '/active_site.dat'; | 
|  | 729 | 
|  | 730   $self->{_act_site_data} = {}; | 
|  | 731 | 
|  | 732   open( AS, $as_dat ) | 
|  | 733     or croak qq(FATAL: Couldn\'t open "$as_dat" data file: $!); | 
|  | 734 | 
|  | 735   my ( $fam_id, $aln ); | 
|  | 736 | 
|  | 737   while (<AS>) { | 
|  | 738     if (/^ID\s+(\S+)/) { | 
|  | 739       $fam_id = $1; | 
|  | 740       $aln    = new Bio::SimpleAlign; | 
|  | 741     } | 
|  | 742     elsif (/^AL\s+(\S+)\/(\d+)\-(\d+)\s+(\S+)/) { | 
|  | 743       my ( $seq_id, $st, $en, $seq ) = ( $1, $2, $3, $4 ); | 
|  | 744 | 
|  | 745       $aln->add_seq( | 
|  | 746         Bio::Pfam::Scan::Seq->new( | 
|  | 747           '-seq'   => $seq, | 
|  | 748           '-id'    => $seq_id, | 
|  | 749           '-start' => $st, | 
|  | 750           '-end'   => $en, | 
|  | 751           '-type'  => 'aligned' | 
|  | 752         ) | 
|  | 753       ); | 
|  | 754     } | 
|  | 755     elsif (/^RE\s+(\S+)\s+(\d+)/) { | 
|  | 756       my ( $seq_id, $res ) = ( $1, $2 ); | 
|  | 757       push( | 
|  | 758         @{ $self->{_act_site_data}->{$fam_id}->{'residues'}->{$seq_id} }, | 
|  | 759         $res | 
|  | 760       ); | 
|  | 761 | 
|  | 762     } | 
|  | 763     elsif (/^\/\//) { | 
|  | 764 | 
|  | 765       $self->{_act_site_data}->{$fam_id}->{'alignment'} = $aln; | 
|  | 766 | 
|  | 767       $fam_id = ""; | 
|  | 768       $aln    = ""; | 
|  | 769 | 
|  | 770     } | 
|  | 771     else { | 
|  | 772       warn "Ignoring line:\n[$_]"; | 
|  | 773     } | 
|  | 774   } | 
|  | 775   close AS; | 
|  | 776   $self->{_read_read_act_site_data} = 1; | 
|  | 777 } | 
|  | 778 | 
|  | 779 #------------------------------------------------------------------------------- | 
|  | 780 | 
|  | 781 =head2 _parse_sequence | 
|  | 782 | 
|  | 783 This method is used to parse the sequence and hash it on sequence | 
|  | 784 identifier. It also stores the length of the longest sequence id | 
|  | 785 | 
|  | 786 =cut | 
|  | 787 | 
|  | 788 sub _parse_sequence { | 
|  | 789   my $self = shift; | 
|  | 790 | 
|  | 791   my $seq_hash = {}; | 
|  | 792   my $seq_id; | 
|  | 793   foreach ( @{ $self->{_sequence_rows} } ) { | 
|  | 794 | 
|  | 795     next if m/^\s*$/;    #Ignore blank lines | 
|  | 796 | 
|  | 797     if (m/^>(\S+)/) { | 
|  | 798       $seq_id = $1; | 
|  | 799 | 
|  | 800       if ( exists( $seq_hash->{$seq_id} ) ) { | 
|  | 801         croak "FATAL: Sequence identifiers must be unique. Your fasta file contains two sequences with the same id ($seq_id)"; | 
|  | 802       } | 
|  | 803 | 
|  | 804       #Store the max length of seq name, use this later when printing in ascii | 
|  | 805       $self->{_max_seqname} = length($seq_id) | 
|  | 806         if ( !$self->{_max_seqname} | 
|  | 807         or length($seq_id) > $self->{_max_seqname} ); | 
|  | 808     } | 
|  | 809     else { | 
|  | 810       croak "FATAL: Unrecognised format of fasta file. Each sequence must have a header line in the format '>identifier  <optional description>'" | 
|  | 811         unless defined $seq_id; | 
|  | 812       chomp; | 
|  | 813       $seq_hash->{$seq_id} .= $_; | 
|  | 814     } | 
|  | 815   } | 
|  | 816 | 
|  | 817   $self->{_seq_hash} = $seq_hash; | 
|  | 818 } | 
|  | 819 | 
|  | 820 #------------------------------------------------------------------------------- | 
|  | 821 | 
|  | 822 =head2 _translate_fasta | 
|  | 823 | 
|  | 824 Uses the HMMER v2.3.2 progam "translate" to perform a six-frame translation of | 
|  | 825 the input sequence. Checks the parameter "-translate". | 
|  | 826 | 
|  | 827 Accepted arguments are "all" and "orf", where "all" means (from the "translate" | 
|  | 828 help text) "translate in full, with stops; no individual ORFs" and "orf" means | 
|  | 829 "report only ORFs greater than minlen" where minlen is set to the default of | 
|  | 830 20. | 
|  | 831 | 
|  | 832 =cut | 
|  | 833 | 
|  | 834 sub _translate_fasta { | 
|  | 835   my ($self) = @_; | 
|  | 836   my $translatedFasta = $self->{_fasta} . ".translated"; | 
|  | 837 | 
|  | 838   my @params = ( 'translate', '-q', ); | 
|  | 839   if ( $self->{_translate} eq 'all' ) { | 
|  | 840     push( @params, '-a' ); | 
|  | 841   } | 
|  | 842   elsif ( $self->{_translate} eq 'orf' ) { | 
|  | 843     push( @params, '-l', '20' ); | 
|  | 844   } | 
|  | 845   else { | 
|  | 846     croak qq(Unexpected parameter '$self->{_translate}'); | 
|  | 847   } | 
|  | 848   push( @params, '-o', $translatedFasta, $self->{_fasta} ); | 
|  | 849 | 
|  | 850   print STDERR "PfamScan::translate_fasta: translate command: |@params|\n" | 
|  | 851     if $ENV{DEBUG}; | 
|  | 852 | 
|  | 853   my $run = start \@params, '<pipe', \*IN, '>pipe', \*OUT, '2>pipe', \*ERR | 
|  | 854     or croak qq(FATAL: error running translate; IPC::Run returned '$?'); | 
|  | 855 | 
|  | 856   close IN; | 
|  | 857   close OUT; | 
|  | 858 | 
|  | 859   my $err; | 
|  | 860   while (<ERR>) { | 
|  | 861     $err .= $_; | 
|  | 862   } | 
|  | 863   close ERR; | 
|  | 864 | 
|  | 865   finish $run | 
|  | 866     or croak qq|FATAL: error running translate ($err); ipc returned '$?'|; | 
|  | 867   open( F, "<", $translatedFasta ) | 
|  | 868     or croak qw(Could not open $translatedFasta '$!'); | 
|  | 869   if ( $self->{_translate} eq 'orf' ) { | 
|  | 870     while (<F>) { | 
|  | 871       if (/^>\s?(\S+).*nt (\d+)\.+(\d+)/) { | 
|  | 872         $self->{_orf}->{$1}->{start}  = $2; | 
|  | 873         $self->{_orf}->{$1}->{end}    = $3; | 
|  | 874         $self->{_orf}->{$1}->{strand} = ( $2 < $3 ) ? '+' : '-'; | 
|  | 875       } | 
|  | 876     } | 
|  | 877   } | 
|  | 878   else { | 
|  | 879     my $currentSeq; | 
|  | 880     my $currentFrame; | 
|  | 881     my $currentLen = 0; | 
|  | 882     my $maxEnd = 0; | 
|  | 883     while (<F>) { | 
|  | 884       chomp; | 
|  | 885       if (/^>\s?(\S+\:)(\d+)/) { | 
|  | 886         if ( $currentLen > 0 ) { | 
|  | 887           my $seqName = $currentSeq . $currentFrame; | 
|  | 888           if ( $currentFrame < 3 ) { | 
|  | 889             my $start = 1 + $currentFrame; | 
|  | 890             my $end   = $start + $currentLen - 1; | 
|  | 891             $self->{_orf}->{$seqName}->{strand} = '+'; | 
|  | 892             $self->{_orf}->{$seqName}->{start}  = $start; | 
|  | 893             $self->{_orf}->{$seqName}->{end}    = $end; | 
|  | 894             $maxEnd = $end if ( $end > $maxEnd ); | 
|  | 895           } | 
|  | 896           else { | 
|  | 897             my $start = $maxEnd - ( $currentFrame - 3 ); | 
|  | 898             my $end = $start - $currentLen + 1; | 
|  | 899             $self->{_orf}->{$seqName}->{strand} = '-'; | 
|  | 900             $self->{_orf}->{$seqName}->{start}  = $start; | 
|  | 901             $self->{_orf}->{$seqName}->{end}    = $end; | 
|  | 902           } | 
|  | 903         } | 
|  | 904         $currentLen   = 0; | 
|  | 905         $currentSeq   = $1; | 
|  | 906         $currentFrame = $2; | 
|  | 907       } | 
|  | 908       else { | 
|  | 909         $currentLen += length($_) * 3; | 
|  | 910       } | 
|  | 911     } | 
|  | 912     my $seqName = $currentSeq . $currentFrame; | 
|  | 913     if ( $currentFrame < 3 ) { | 
|  | 914       my $start = 1 + $currentFrame; | 
|  | 915       my $end   = $start + $currentLen - 1; | 
|  | 916       $self->{_orf}->{$seqName}->{strand} = '+'; | 
|  | 917       $self->{_orf}->{$seqName}->{start}  = $start; | 
|  | 918       $self->{_orf}->{$seqName}->{end}    = $end; | 
|  | 919       $maxEnd = $end if ( $end > $maxEnd ); | 
|  | 920     } | 
|  | 921     else { | 
|  | 922       my $start = $maxEnd - ( $currentFrame - 3 ); | 
|  | 923       my $end = $start - $currentLen + 1; | 
|  | 924       $self->{_orf}->{$seqName}->{strand} = '-'; | 
|  | 925       $self->{_orf}->{$seqName}->{start}  = $start; | 
|  | 926       $self->{_orf}->{$seqName}->{end}    = $end; | 
|  | 927     } | 
|  | 928   } | 
|  | 929   $self->{_fasta} = $translatedFasta; | 
|  | 930 } | 
|  | 931 #------------------------------------------------------------------------------- | 
|  | 932 | 
|  | 933 =head1 COPYRIGHT | 
|  | 934 | 
|  | 935 Copyright (c) 2009: Genome Research Ltd. | 
|  | 936 | 
|  | 937 Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), Rob Finn (finnr@janelia.hhmi.org) | 
|  | 938 | 
|  | 939 This is free software; you can redistribute it and/or | 
|  | 940 modify it under the terms of the GNU General Public License | 
|  | 941 as published by the Free Software Foundation; either version 2 | 
|  | 942 of the License, or (at your option) any later version. | 
|  | 943 | 
|  | 944 This program is distributed in the hope that it will be useful, | 
|  | 945 but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 946 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 947 GNU General Public License for more details. | 
|  | 948 | 
|  | 949 You should have received a copy of the GNU General Public License | 
|  | 950 along with this program; if not, write to the Free Software | 
|  | 951 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. | 
|  | 952 or see the on-line version at http://www.gnu.org/copyleft/gpl.txt | 
|  | 953 | 
|  | 954 =cut | 
|  | 955 | 
|  | 956   1; | 
|  | 957 |