Mercurial > repos > matteoc > agame_custom_tools
diff pfamScan/Bio/Pfam/Active_site/as_search.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pfamScan/Bio/Pfam/Active_site/as_search.pm Thu Dec 22 04:45:31 2016 -0500 @@ -0,0 +1,399 @@ +package Bio::Pfam::Active_site::as_search; + +use strict; +use warnings; + +use Bio::SeqFeature::Generic; +use Bio::SimpleAlign; +use Bio::Pfam::Scan::Seq; + +=head2 find_as + + Title : find_as + Usage : find_as($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file) + Function: finds active sites in a query sequence which + has a match to a Pfam active site family + + Returns : An array reference of active site postions + Args : Alignment object of active site sequences, hash of arrays containing seq ids => active site positions, + start-end sequence in the format "3-50", sequence region, family, file containing all Pfam models + +=cut + +sub find_as { + my ($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file) = @_; + + + system("hmmfetch $hmm_file $family > /tmp/hmm.$$") and die "FATAL: Problem running [hmmfetch $hmm_file $family > /tmp/hmm.$$]\n"; + + $seq_id = "Query_".$seq_id; + + my $fasta; + foreach my $seq ($as_aln->each_seq) { + my $s = $seq->seq; + $s =~ s/[\-\.]//g; #Remove gaps + + $fasta .= ">" . $seq->id . "/" . $seq->start . "-" . $seq->end . "\n$s\n"; + } + $fasta .= ">$seq_id/$seq_se\n$seq_region"; + open(SEQ, ">/tmp/seqs.$$") or die "Couldn't open file seqs.$$ $!\n"; + print SEQ $fasta; + close SEQ; + + + open(OUT, "hmmalign --outformat Pfam /tmp/hmm.$$ /tmp/seqs.$$ |") or die "Couldn't open fh to hmmalign $!\n"; + + my $aln = new Bio::SimpleAlign; + my ($name, $start, $end, $seq); + while(<OUT>) { + if( /^(\S+)\/(\d+)-(\d+)\s+(\S+)\s*/ ) { + $name = $1; + $start = $2; + $end = $3; + $seq = $4; + + $aln->add_seq(Bio::Pfam::Scan::Seq->new('-seq'=>$seq, '-id'=>$name, '-start'=>$start, '-end'=>$end, '-type'=>'aligned')); + } + } + close OUT; + + + unlink "/tmp/seqs.$$"; + unlink "/tmp/hmm.$$"; + + #Locate exp as in fam + _exp_as($aln, $as_res); + + #Store as patterns + my $pattern_aln = new Bio::SimpleAlign; + _pattern_info($aln, $pattern_aln); + #find pred as + my $array_ref = _add_pred_as($aln, $pattern_aln, $seq_id); + return $array_ref; +} + +=head2 _exp_as + + Title : _exp_as + Usage : _exp_as($aln, $hash_of_arrays) + Function : Adds experimental active site data to alignment object + Returns : Nothing, populates the alignment object with active site residue info + Args : alignment object + +=cut + +sub _exp_as { + + my ($aln, $as_res) = @_; + + + foreach my $seq ($aln->each_seq) { + + foreach my $pos ( @{$as_res->{$seq->id}}) { + + if($pos >= $seq->start and $pos <= $seq->end) { #Feature is in the alignment + + #store column position for seq + my $col = $aln->column_from_residue_number($seq->id, $pos); + + #add feature to seq + my $aa .= uc substr($seq->seq(), $col-1, 1); + + my $feat = new Bio::SeqFeature::Generic ( -display_name => 'experimental', + -primary => $aa, + -start => $col); + + + + $seq->add_SeqFeature($feat); + } + + } + } +} + + +=head2 _pattern_info + + Title : _pattern_info + Usage : _pattern_info($aln_object, $aln_object) + Function : Takes an alignment and extracts active site patterns into a second alignment + Returns : Nothing, populates a second alignment object with active site seqences + Args : alignment object, empty alignment object + +=cut + + +sub _pattern_info { + my ($aln, $pattern_aln) = @_; + my (%pat_col_seq); + + foreach my $seq ( $aln->each_seq() ) { + + next unless($seq->all_SeqFeatures()); + my ($pat, $col); + foreach my $feat ( sort {$a->start <=> $b->start } $seq->all_SeqFeatures() ) { + $pat .= $feat->primary_tag(); #HEK + $col .= $feat->start() . " "; #33 44 55 + } + + unless(exists($pat_col_seq{"$pat:$col"})) { + $pattern_aln->add_seq($seq); + $pat_col_seq{"$pat:$col"}=1; + } + + } +} + + + +=head2 _add_pred_as + + Title : _add_pred_as + Usage : _add_pred_as($aln_object, $aln_object) + Function : Predicts active sites based on known active site data + Returns : array of active site pos + Args : alignment, alignment of known active sites + +=cut + + + + +sub _add_pred_as { + my ($aln, $pattern_aln, $query_seq_id) = @_; + my $num_seq=0; + my ($query_seq, @as_res); + + #locate query seq + foreach my $seq ( $aln->each_seq() ) { + if($seq->id eq $query_seq_id) { + $query_seq = $seq; + last; + } + } + die "FATAL: Can't locate query sequence [$query_seq_id] in active site alignement\n" unless($query_seq); + + + my $aligns_with = new Bio::SimpleAlign; + foreach my $seq1 ( $pattern_aln->each_seq() ) { + + + #See if all active site residues from seq1 exist in query seq + my $mismatch; + foreach my $feat ( sort {$a->start <=> $b->start } $seq1->all_SeqFeatures() ) { + + my $aa1 = $feat->primary_tag(); + my $col = $feat->start(); + + my $aa2 = uc substr($query_seq->seq, $col-1, 1); + unless($aa1 eq $aa2) { + $mismatch = 1; + last; + + } + + } + + #Store seq1 if all active site residues are present in seq1 + unless($mismatch) { + $aligns_with->add_seq($seq1); + } + } + + + + $num_seq = $aligns_with->num_sequences(); + return unless($num_seq); + my (%seq_to_remove, %seq_to_rem); #two hashes used to collect seq that need removing + + + #if query seq matches more than one pattern remove subpatterns and any patterns that overlap + + #first remove sub pat + if($num_seq>1) { + foreach my $sequence1 ($aligns_with->each_seq() ) { + foreach my $sequence2 ($aligns_with->each_seq() ) { + + next if($sequence1 eq $sequence2); + + my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger); + #collect column positions + foreach my $feat1 ($sequence1->all_SeqFeatures() ) { + $hash1{$feat1->start} =1; + $num_1++; + } + foreach my $feat2 ($sequence2->all_SeqFeatures() ) { + $hash2{$feat2->start} =1; + $num_2++; + } + + + #see if one is a subpattern of the other + my $diff=0; + unless($num_1 eq $num_2) { + + my $remove_seq; + + if($num_1 > $num_2) { + %smaller = %hash2; + %larger = %hash1; + $remove_seq = $sequence2; + + } + else { + %smaller = %hash1; + %larger = %hash2; + $remove_seq = $sequence1; + } + + + foreach my $key (keys %smaller) { + $diff = 1 unless(exists($larger{$key})); #diff is true if it is not a subpattern + } + + + $seq_to_rem{$remove_seq}= $remove_seq unless($diff) ; + next unless($diff); + } + } + + } + } + + #Now remove any patterns which need removing + foreach my $remove (keys %seq_to_rem) { + $aligns_with->remove_seq($seq_to_rem{$remove}); + } + + + unless($num_seq >=1) { + die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\n"; + } + + + + $num_seq = $aligns_with->num_sequences(); + #and then any patterns that overlap + if($num_seq>1) { + + foreach my $sequence1 ($aligns_with->each_seq() ) { + + foreach my $sequence2 ($aligns_with->each_seq() ) { + next if($sequence1 eq $sequence2); + + my ($seq1_st, $seq1_en, $seq2_st, $seq2_en); + + my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger); + + #see if patterns overlap - find pattern start ends and collect column positions + foreach my $feat1 ($sequence1->all_SeqFeatures() ) { + + $seq1_st = $feat1->start() if(!$seq1_st or $feat1->start() < $seq1_st); + $seq1_en = $feat1->start() if(!$seq1_en or $feat1->start() > $seq1_en); + } + + foreach my $feat2 ($sequence2->all_SeqFeatures() ) { + + $seq2_st = $feat2->start() if(!$seq2_st or $feat2->start() < $seq2_st); + $seq2_en = $feat2->start() if(!$seq2_en or $feat2->start() > $seq2_en); + } + + #then see if patterns overlap - remove sequence with pattern of least identity + if(($seq1_st >= $seq2_st and $seq1_st <= $seq2_en) or ($seq2_st >= $seq1_st and $seq2_st <= $seq1_en)) { + my $remove = _identity($query_seq, $sequence1, $sequence2); + $seq_to_remove{$remove}= $remove; + } + } + + } + } + + #Now remove any patterns which need removing + foreach my $remove (keys %seq_to_remove) { + $aligns_with->remove_seq($seq_to_remove{$remove}); + $num_seq = $aligns_with->num_sequences(); + last if($num_seq eq "1"); #just in case the % identities are identical + } + + + $num_seq = $aligns_with->num_sequences(); + unless($num_seq >=1) { + die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\n"; + } + + + + #Add features to seq + foreach my $sequence ($aligns_with->each_seq() ) { + foreach my $feat ($sequence->all_SeqFeatures() ) { + + my $actual_pos = $query_seq->location_from_column($feat->start); + $actual_pos = $actual_pos->start(); + + + push(@as_res, $actual_pos); + + + + } + } + return \@as_res + +} + + +=head2 _identity + + Title : _identity + Usage : _identity($sequence1 , $sequence2, $sequence3) + Function : Identifies seq with lowest % identity to sequence1 + Returns : The sequence which has the lowest % id to sequence 1 + Args : sequence1, sequence2, sequence3. + +=cut + + +sub _identity { + my $seq1 = shift; + my @aligns_with = @_; + my $lower_identity=100; + my $lower_identity_seq; + foreach my $s (@aligns_with) { + my $tmp_aln = new Bio::SimpleAlign; + $tmp_aln->add_seq($s); + $tmp_aln->add_seq($seq1); + + my $identity = $tmp_aln->percentage_identity(); + if($identity < $lower_identity) { + $lower_identity = $identity; + $lower_identity_seq = $s; + } + + } + return $lower_identity_seq; +} + +=head1 COPYRIGHT + +Copyright (c) 2007: Genome Research Ltd. + +Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), + Jaina Mistry (jm14@sanger.ac.uk) + +This is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. + +=cut + +1;