0
|
1 package Bio::Pfam::Active_site::as_search;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 use Bio::SeqFeature::Generic;
|
|
7 use Bio::SimpleAlign;
|
|
8 use Bio::Pfam::Scan::Seq;
|
|
9
|
|
10 =head2 find_as
|
|
11
|
|
12 Title : find_as
|
|
13 Usage : find_as($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file)
|
|
14 Function: finds active sites in a query sequence which
|
|
15 has a match to a Pfam active site family
|
|
16
|
|
17 Returns : An array reference of active site postions
|
|
18 Args : Alignment object of active site sequences, hash of arrays containing seq ids => active site positions,
|
|
19 start-end sequence in the format "3-50", sequence region, family, file containing all Pfam models
|
|
20
|
|
21 =cut
|
|
22
|
|
23 sub find_as {
|
|
24 my ($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file) = @_;
|
|
25
|
|
26
|
|
27 system("hmmfetch $hmm_file $family > /tmp/hmm.$$") and die "FATAL: Problem running [hmmfetch $hmm_file $family > /tmp/hmm.$$]\n";
|
|
28
|
|
29 $seq_id = "Query_".$seq_id;
|
|
30
|
|
31 my $fasta;
|
|
32 foreach my $seq ($as_aln->each_seq) {
|
|
33 my $s = $seq->seq;
|
|
34 $s =~ s/[\-\.]//g; #Remove gaps
|
|
35
|
|
36 $fasta .= ">" . $seq->id . "/" . $seq->start . "-" . $seq->end . "\n$s\n";
|
|
37 }
|
|
38 $fasta .= ">$seq_id/$seq_se\n$seq_region";
|
|
39 open(SEQ, ">/tmp/seqs.$$") or die "Couldn't open file seqs.$$ $!\n";
|
|
40 print SEQ $fasta;
|
|
41 close SEQ;
|
|
42
|
|
43
|
|
44 open(OUT, "hmmalign --outformat Pfam /tmp/hmm.$$ /tmp/seqs.$$ |") or die "Couldn't open fh to hmmalign $!\n";
|
|
45
|
|
46 my $aln = new Bio::SimpleAlign;
|
|
47 my ($name, $start, $end, $seq);
|
|
48 while(<OUT>) {
|
|
49 if( /^(\S+)\/(\d+)-(\d+)\s+(\S+)\s*/ ) {
|
|
50 $name = $1;
|
|
51 $start = $2;
|
|
52 $end = $3;
|
|
53 $seq = $4;
|
|
54
|
|
55 $aln->add_seq(Bio::Pfam::Scan::Seq->new('-seq'=>$seq, '-id'=>$name, '-start'=>$start, '-end'=>$end, '-type'=>'aligned'));
|
|
56 }
|
|
57 }
|
|
58 close OUT;
|
|
59
|
|
60
|
|
61 unlink "/tmp/seqs.$$";
|
|
62 unlink "/tmp/hmm.$$";
|
|
63
|
|
64 #Locate exp as in fam
|
|
65 _exp_as($aln, $as_res);
|
|
66
|
|
67 #Store as patterns
|
|
68 my $pattern_aln = new Bio::SimpleAlign;
|
|
69 _pattern_info($aln, $pattern_aln);
|
|
70 #find pred as
|
|
71 my $array_ref = _add_pred_as($aln, $pattern_aln, $seq_id);
|
|
72 return $array_ref;
|
|
73 }
|
|
74
|
|
75 =head2 _exp_as
|
|
76
|
|
77 Title : _exp_as
|
|
78 Usage : _exp_as($aln, $hash_of_arrays)
|
|
79 Function : Adds experimental active site data to alignment object
|
|
80 Returns : Nothing, populates the alignment object with active site residue info
|
|
81 Args : alignment object
|
|
82
|
|
83 =cut
|
|
84
|
|
85 sub _exp_as {
|
|
86
|
|
87 my ($aln, $as_res) = @_;
|
|
88
|
|
89
|
|
90 foreach my $seq ($aln->each_seq) {
|
|
91
|
|
92 foreach my $pos ( @{$as_res->{$seq->id}}) {
|
|
93
|
|
94 if($pos >= $seq->start and $pos <= $seq->end) { #Feature is in the alignment
|
|
95
|
|
96 #store column position for seq
|
|
97 my $col = $aln->column_from_residue_number($seq->id, $pos);
|
|
98
|
|
99 #add feature to seq
|
|
100 my $aa .= uc substr($seq->seq(), $col-1, 1);
|
|
101
|
|
102 my $feat = new Bio::SeqFeature::Generic ( -display_name => 'experimental',
|
|
103 -primary => $aa,
|
|
104 -start => $col);
|
|
105
|
|
106
|
|
107
|
|
108 $seq->add_SeqFeature($feat);
|
|
109 }
|
|
110
|
|
111 }
|
|
112 }
|
|
113 }
|
|
114
|
|
115
|
|
116 =head2 _pattern_info
|
|
117
|
|
118 Title : _pattern_info
|
|
119 Usage : _pattern_info($aln_object, $aln_object)
|
|
120 Function : Takes an alignment and extracts active site patterns into a second alignment
|
|
121 Returns : Nothing, populates a second alignment object with active site seqences
|
|
122 Args : alignment object, empty alignment object
|
|
123
|
|
124 =cut
|
|
125
|
|
126
|
|
127 sub _pattern_info {
|
|
128 my ($aln, $pattern_aln) = @_;
|
|
129 my (%pat_col_seq);
|
|
130
|
|
131 foreach my $seq ( $aln->each_seq() ) {
|
|
132
|
|
133 next unless($seq->all_SeqFeatures());
|
|
134 my ($pat, $col);
|
|
135 foreach my $feat ( sort {$a->start <=> $b->start } $seq->all_SeqFeatures() ) {
|
|
136 $pat .= $feat->primary_tag(); #HEK
|
|
137 $col .= $feat->start() . " "; #33 44 55
|
|
138 }
|
|
139
|
|
140 unless(exists($pat_col_seq{"$pat:$col"})) {
|
|
141 $pattern_aln->add_seq($seq);
|
|
142 $pat_col_seq{"$pat:$col"}=1;
|
|
143 }
|
|
144
|
|
145 }
|
|
146 }
|
|
147
|
|
148
|
|
149
|
|
150 =head2 _add_pred_as
|
|
151
|
|
152 Title : _add_pred_as
|
|
153 Usage : _add_pred_as($aln_object, $aln_object)
|
|
154 Function : Predicts active sites based on known active site data
|
|
155 Returns : array of active site pos
|
|
156 Args : alignment, alignment of known active sites
|
|
157
|
|
158 =cut
|
|
159
|
|
160
|
|
161
|
|
162
|
|
163 sub _add_pred_as {
|
|
164 my ($aln, $pattern_aln, $query_seq_id) = @_;
|
|
165 my $num_seq=0;
|
|
166 my ($query_seq, @as_res);
|
|
167
|
|
168 #locate query seq
|
|
169 foreach my $seq ( $aln->each_seq() ) {
|
|
170 if($seq->id eq $query_seq_id) {
|
|
171 $query_seq = $seq;
|
|
172 last;
|
|
173 }
|
|
174 }
|
|
175 die "FATAL: Can't locate query sequence [$query_seq_id] in active site alignement\n" unless($query_seq);
|
|
176
|
|
177
|
|
178 my $aligns_with = new Bio::SimpleAlign;
|
|
179 foreach my $seq1 ( $pattern_aln->each_seq() ) {
|
|
180
|
|
181
|
|
182 #See if all active site residues from seq1 exist in query seq
|
|
183 my $mismatch;
|
|
184 foreach my $feat ( sort {$a->start <=> $b->start } $seq1->all_SeqFeatures() ) {
|
|
185
|
|
186 my $aa1 = $feat->primary_tag();
|
|
187 my $col = $feat->start();
|
|
188
|
|
189 my $aa2 = uc substr($query_seq->seq, $col-1, 1);
|
|
190 unless($aa1 eq $aa2) {
|
|
191 $mismatch = 1;
|
|
192 last;
|
|
193
|
|
194 }
|
|
195
|
|
196 }
|
|
197
|
|
198 #Store seq1 if all active site residues are present in seq1
|
|
199 unless($mismatch) {
|
|
200 $aligns_with->add_seq($seq1);
|
|
201 }
|
|
202 }
|
|
203
|
|
204
|
|
205
|
|
206 $num_seq = $aligns_with->num_sequences();
|
|
207 return unless($num_seq);
|
|
208 my (%seq_to_remove, %seq_to_rem); #two hashes used to collect seq that need removing
|
|
209
|
|
210
|
|
211 #if query seq matches more than one pattern remove subpatterns and any patterns that overlap
|
|
212
|
|
213 #first remove sub pat
|
|
214 if($num_seq>1) {
|
|
215 foreach my $sequence1 ($aligns_with->each_seq() ) {
|
|
216 foreach my $sequence2 ($aligns_with->each_seq() ) {
|
|
217
|
|
218 next if($sequence1 eq $sequence2);
|
|
219
|
|
220 my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger);
|
|
221 #collect column positions
|
|
222 foreach my $feat1 ($sequence1->all_SeqFeatures() ) {
|
|
223 $hash1{$feat1->start} =1;
|
|
224 $num_1++;
|
|
225 }
|
|
226 foreach my $feat2 ($sequence2->all_SeqFeatures() ) {
|
|
227 $hash2{$feat2->start} =1;
|
|
228 $num_2++;
|
|
229 }
|
|
230
|
|
231
|
|
232 #see if one is a subpattern of the other
|
|
233 my $diff=0;
|
|
234 unless($num_1 eq $num_2) {
|
|
235
|
|
236 my $remove_seq;
|
|
237
|
|
238 if($num_1 > $num_2) {
|
|
239 %smaller = %hash2;
|
|
240 %larger = %hash1;
|
|
241 $remove_seq = $sequence2;
|
|
242
|
|
243 }
|
|
244 else {
|
|
245 %smaller = %hash1;
|
|
246 %larger = %hash2;
|
|
247 $remove_seq = $sequence1;
|
|
248 }
|
|
249
|
|
250
|
|
251 foreach my $key (keys %smaller) {
|
|
252 $diff = 1 unless(exists($larger{$key})); #diff is true if it is not a subpattern
|
|
253 }
|
|
254
|
|
255
|
|
256 $seq_to_rem{$remove_seq}= $remove_seq unless($diff) ;
|
|
257 next unless($diff);
|
|
258 }
|
|
259 }
|
|
260
|
|
261 }
|
|
262 }
|
|
263
|
|
264 #Now remove any patterns which need removing
|
|
265 foreach my $remove (keys %seq_to_rem) {
|
|
266 $aligns_with->remove_seq($seq_to_rem{$remove});
|
|
267 }
|
|
268
|
|
269
|
|
270 unless($num_seq >=1) {
|
|
271 die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\n";
|
|
272 }
|
|
273
|
|
274
|
|
275
|
|
276 $num_seq = $aligns_with->num_sequences();
|
|
277 #and then any patterns that overlap
|
|
278 if($num_seq>1) {
|
|
279
|
|
280 foreach my $sequence1 ($aligns_with->each_seq() ) {
|
|
281
|
|
282 foreach my $sequence2 ($aligns_with->each_seq() ) {
|
|
283 next if($sequence1 eq $sequence2);
|
|
284
|
|
285 my ($seq1_st, $seq1_en, $seq2_st, $seq2_en);
|
|
286
|
|
287 my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger);
|
|
288
|
|
289 #see if patterns overlap - find pattern start ends and collect column positions
|
|
290 foreach my $feat1 ($sequence1->all_SeqFeatures() ) {
|
|
291
|
|
292 $seq1_st = $feat1->start() if(!$seq1_st or $feat1->start() < $seq1_st);
|
|
293 $seq1_en = $feat1->start() if(!$seq1_en or $feat1->start() > $seq1_en);
|
|
294 }
|
|
295
|
|
296 foreach my $feat2 ($sequence2->all_SeqFeatures() ) {
|
|
297
|
|
298 $seq2_st = $feat2->start() if(!$seq2_st or $feat2->start() < $seq2_st);
|
|
299 $seq2_en = $feat2->start() if(!$seq2_en or $feat2->start() > $seq2_en);
|
|
300 }
|
|
301
|
|
302 #then see if patterns overlap - remove sequence with pattern of least identity
|
|
303 if(($seq1_st >= $seq2_st and $seq1_st <= $seq2_en) or ($seq2_st >= $seq1_st and $seq2_st <= $seq1_en)) {
|
|
304 my $remove = _identity($query_seq, $sequence1, $sequence2);
|
|
305 $seq_to_remove{$remove}= $remove;
|
|
306 }
|
|
307 }
|
|
308
|
|
309 }
|
|
310 }
|
|
311
|
|
312 #Now remove any patterns which need removing
|
|
313 foreach my $remove (keys %seq_to_remove) {
|
|
314 $aligns_with->remove_seq($seq_to_remove{$remove});
|
|
315 $num_seq = $aligns_with->num_sequences();
|
|
316 last if($num_seq eq "1"); #just in case the % identities are identical
|
|
317 }
|
|
318
|
|
319
|
|
320 $num_seq = $aligns_with->num_sequences();
|
|
321 unless($num_seq >=1) {
|
|
322 die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\n";
|
|
323 }
|
|
324
|
|
325
|
|
326
|
|
327 #Add features to seq
|
|
328 foreach my $sequence ($aligns_with->each_seq() ) {
|
|
329 foreach my $feat ($sequence->all_SeqFeatures() ) {
|
|
330
|
|
331 my $actual_pos = $query_seq->location_from_column($feat->start);
|
|
332 $actual_pos = $actual_pos->start();
|
|
333
|
|
334
|
|
335 push(@as_res, $actual_pos);
|
|
336
|
|
337
|
|
338
|
|
339 }
|
|
340 }
|
|
341 return \@as_res
|
|
342
|
|
343 }
|
|
344
|
|
345
|
|
346 =head2 _identity
|
|
347
|
|
348 Title : _identity
|
|
349 Usage : _identity($sequence1 , $sequence2, $sequence3)
|
|
350 Function : Identifies seq with lowest % identity to sequence1
|
|
351 Returns : The sequence which has the lowest % id to sequence 1
|
|
352 Args : sequence1, sequence2, sequence3.
|
|
353
|
|
354 =cut
|
|
355
|
|
356
|
|
357 sub _identity {
|
|
358 my $seq1 = shift;
|
|
359 my @aligns_with = @_;
|
|
360 my $lower_identity=100;
|
|
361 my $lower_identity_seq;
|
|
362 foreach my $s (@aligns_with) {
|
|
363 my $tmp_aln = new Bio::SimpleAlign;
|
|
364 $tmp_aln->add_seq($s);
|
|
365 $tmp_aln->add_seq($seq1);
|
|
366
|
|
367 my $identity = $tmp_aln->percentage_identity();
|
|
368 if($identity < $lower_identity) {
|
|
369 $lower_identity = $identity;
|
|
370 $lower_identity_seq = $s;
|
|
371 }
|
|
372
|
|
373 }
|
|
374 return $lower_identity_seq;
|
|
375 }
|
|
376
|
|
377 =head1 COPYRIGHT
|
|
378
|
|
379 Copyright (c) 2007: Genome Research Ltd.
|
|
380
|
|
381 Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk),
|
|
382 Jaina Mistry (jm14@sanger.ac.uk)
|
|
383
|
|
384 This is free software; you can redistribute it and/or modify it under
|
|
385 the terms of the GNU General Public License as published by the Free Software
|
|
386 Foundation; either version 2 of the License, or (at your option) any later
|
|
387 version.
|
|
388
|
|
389 This program is distributed in the hope that it will be useful, but WITHOUT
|
|
390 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
391 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
|
392 details.
|
|
393
|
|
394 You should have received a copy of the GNU General Public License along with
|
|
395 this program. If not, see <http://www.gnu.org/licenses/>.
|
|
396
|
|
397 =cut
|
|
398
|
|
399 1;
|