Mercurial > repos > matteoc > agame_custom_tools
comparison pfamScan/Bio/Pfam/Active_site/as_search.pm @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68a3648c7d91 |
---|---|
1 package Bio::Pfam::Active_site::as_search; | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 use Bio::SeqFeature::Generic; | |
7 use Bio::SimpleAlign; | |
8 use Bio::Pfam::Scan::Seq; | |
9 | |
10 =head2 find_as | |
11 | |
12 Title : find_as | |
13 Usage : find_as($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file) | |
14 Function: finds active sites in a query sequence which | |
15 has a match to a Pfam active site family | |
16 | |
17 Returns : An array reference of active site postions | |
18 Args : Alignment object of active site sequences, hash of arrays containing seq ids => active site positions, | |
19 start-end sequence in the format "3-50", sequence region, family, file containing all Pfam models | |
20 | |
21 =cut | |
22 | |
23 sub find_as { | |
24 my ($as_aln, $as_res, $seq_id, $seq_se, $seq_region, $family, $hmm_file) = @_; | |
25 | |
26 | |
27 system("hmmfetch $hmm_file $family > /tmp/hmm.$$") and die "FATAL: Problem running [hmmfetch $hmm_file $family > /tmp/hmm.$$]\n"; | |
28 | |
29 $seq_id = "Query_".$seq_id; | |
30 | |
31 my $fasta; | |
32 foreach my $seq ($as_aln->each_seq) { | |
33 my $s = $seq->seq; | |
34 $s =~ s/[\-\.]//g; #Remove gaps | |
35 | |
36 $fasta .= ">" . $seq->id . "/" . $seq->start . "-" . $seq->end . "\n$s\n"; | |
37 } | |
38 $fasta .= ">$seq_id/$seq_se\n$seq_region"; | |
39 open(SEQ, ">/tmp/seqs.$$") or die "Couldn't open file seqs.$$ $!\n"; | |
40 print SEQ $fasta; | |
41 close SEQ; | |
42 | |
43 | |
44 open(OUT, "hmmalign --outformat Pfam /tmp/hmm.$$ /tmp/seqs.$$ |") or die "Couldn't open fh to hmmalign $!\n"; | |
45 | |
46 my $aln = new Bio::SimpleAlign; | |
47 my ($name, $start, $end, $seq); | |
48 while(<OUT>) { | |
49 if( /^(\S+)\/(\d+)-(\d+)\s+(\S+)\s*/ ) { | |
50 $name = $1; | |
51 $start = $2; | |
52 $end = $3; | |
53 $seq = $4; | |
54 | |
55 $aln->add_seq(Bio::Pfam::Scan::Seq->new('-seq'=>$seq, '-id'=>$name, '-start'=>$start, '-end'=>$end, '-type'=>'aligned')); | |
56 } | |
57 } | |
58 close OUT; | |
59 | |
60 | |
61 unlink "/tmp/seqs.$$"; | |
62 unlink "/tmp/hmm.$$"; | |
63 | |
64 #Locate exp as in fam | |
65 _exp_as($aln, $as_res); | |
66 | |
67 #Store as patterns | |
68 my $pattern_aln = new Bio::SimpleAlign; | |
69 _pattern_info($aln, $pattern_aln); | |
70 #find pred as | |
71 my $array_ref = _add_pred_as($aln, $pattern_aln, $seq_id); | |
72 return $array_ref; | |
73 } | |
74 | |
75 =head2 _exp_as | |
76 | |
77 Title : _exp_as | |
78 Usage : _exp_as($aln, $hash_of_arrays) | |
79 Function : Adds experimental active site data to alignment object | |
80 Returns : Nothing, populates the alignment object with active site residue info | |
81 Args : alignment object | |
82 | |
83 =cut | |
84 | |
85 sub _exp_as { | |
86 | |
87 my ($aln, $as_res) = @_; | |
88 | |
89 | |
90 foreach my $seq ($aln->each_seq) { | |
91 | |
92 foreach my $pos ( @{$as_res->{$seq->id}}) { | |
93 | |
94 if($pos >= $seq->start and $pos <= $seq->end) { #Feature is in the alignment | |
95 | |
96 #store column position for seq | |
97 my $col = $aln->column_from_residue_number($seq->id, $pos); | |
98 | |
99 #add feature to seq | |
100 my $aa .= uc substr($seq->seq(), $col-1, 1); | |
101 | |
102 my $feat = new Bio::SeqFeature::Generic ( -display_name => 'experimental', | |
103 -primary => $aa, | |
104 -start => $col); | |
105 | |
106 | |
107 | |
108 $seq->add_SeqFeature($feat); | |
109 } | |
110 | |
111 } | |
112 } | |
113 } | |
114 | |
115 | |
116 =head2 _pattern_info | |
117 | |
118 Title : _pattern_info | |
119 Usage : _pattern_info($aln_object, $aln_object) | |
120 Function : Takes an alignment and extracts active site patterns into a second alignment | |
121 Returns : Nothing, populates a second alignment object with active site seqences | |
122 Args : alignment object, empty alignment object | |
123 | |
124 =cut | |
125 | |
126 | |
127 sub _pattern_info { | |
128 my ($aln, $pattern_aln) = @_; | |
129 my (%pat_col_seq); | |
130 | |
131 foreach my $seq ( $aln->each_seq() ) { | |
132 | |
133 next unless($seq->all_SeqFeatures()); | |
134 my ($pat, $col); | |
135 foreach my $feat ( sort {$a->start <=> $b->start } $seq->all_SeqFeatures() ) { | |
136 $pat .= $feat->primary_tag(); #HEK | |
137 $col .= $feat->start() . " "; #33 44 55 | |
138 } | |
139 | |
140 unless(exists($pat_col_seq{"$pat:$col"})) { | |
141 $pattern_aln->add_seq($seq); | |
142 $pat_col_seq{"$pat:$col"}=1; | |
143 } | |
144 | |
145 } | |
146 } | |
147 | |
148 | |
149 | |
150 =head2 _add_pred_as | |
151 | |
152 Title : _add_pred_as | |
153 Usage : _add_pred_as($aln_object, $aln_object) | |
154 Function : Predicts active sites based on known active site data | |
155 Returns : array of active site pos | |
156 Args : alignment, alignment of known active sites | |
157 | |
158 =cut | |
159 | |
160 | |
161 | |
162 | |
163 sub _add_pred_as { | |
164 my ($aln, $pattern_aln, $query_seq_id) = @_; | |
165 my $num_seq=0; | |
166 my ($query_seq, @as_res); | |
167 | |
168 #locate query seq | |
169 foreach my $seq ( $aln->each_seq() ) { | |
170 if($seq->id eq $query_seq_id) { | |
171 $query_seq = $seq; | |
172 last; | |
173 } | |
174 } | |
175 die "FATAL: Can't locate query sequence [$query_seq_id] in active site alignement\n" unless($query_seq); | |
176 | |
177 | |
178 my $aligns_with = new Bio::SimpleAlign; | |
179 foreach my $seq1 ( $pattern_aln->each_seq() ) { | |
180 | |
181 | |
182 #See if all active site residues from seq1 exist in query seq | |
183 my $mismatch; | |
184 foreach my $feat ( sort {$a->start <=> $b->start } $seq1->all_SeqFeatures() ) { | |
185 | |
186 my $aa1 = $feat->primary_tag(); | |
187 my $col = $feat->start(); | |
188 | |
189 my $aa2 = uc substr($query_seq->seq, $col-1, 1); | |
190 unless($aa1 eq $aa2) { | |
191 $mismatch = 1; | |
192 last; | |
193 | |
194 } | |
195 | |
196 } | |
197 | |
198 #Store seq1 if all active site residues are present in seq1 | |
199 unless($mismatch) { | |
200 $aligns_with->add_seq($seq1); | |
201 } | |
202 } | |
203 | |
204 | |
205 | |
206 $num_seq = $aligns_with->num_sequences(); | |
207 return unless($num_seq); | |
208 my (%seq_to_remove, %seq_to_rem); #two hashes used to collect seq that need removing | |
209 | |
210 | |
211 #if query seq matches more than one pattern remove subpatterns and any patterns that overlap | |
212 | |
213 #first remove sub pat | |
214 if($num_seq>1) { | |
215 foreach my $sequence1 ($aligns_with->each_seq() ) { | |
216 foreach my $sequence2 ($aligns_with->each_seq() ) { | |
217 | |
218 next if($sequence1 eq $sequence2); | |
219 | |
220 my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger); | |
221 #collect column positions | |
222 foreach my $feat1 ($sequence1->all_SeqFeatures() ) { | |
223 $hash1{$feat1->start} =1; | |
224 $num_1++; | |
225 } | |
226 foreach my $feat2 ($sequence2->all_SeqFeatures() ) { | |
227 $hash2{$feat2->start} =1; | |
228 $num_2++; | |
229 } | |
230 | |
231 | |
232 #see if one is a subpattern of the other | |
233 my $diff=0; | |
234 unless($num_1 eq $num_2) { | |
235 | |
236 my $remove_seq; | |
237 | |
238 if($num_1 > $num_2) { | |
239 %smaller = %hash2; | |
240 %larger = %hash1; | |
241 $remove_seq = $sequence2; | |
242 | |
243 } | |
244 else { | |
245 %smaller = %hash1; | |
246 %larger = %hash2; | |
247 $remove_seq = $sequence1; | |
248 } | |
249 | |
250 | |
251 foreach my $key (keys %smaller) { | |
252 $diff = 1 unless(exists($larger{$key})); #diff is true if it is not a subpattern | |
253 } | |
254 | |
255 | |
256 $seq_to_rem{$remove_seq}= $remove_seq unless($diff) ; | |
257 next unless($diff); | |
258 } | |
259 } | |
260 | |
261 } | |
262 } | |
263 | |
264 #Now remove any patterns which need removing | |
265 foreach my $remove (keys %seq_to_rem) { | |
266 $aligns_with->remove_seq($seq_to_rem{$remove}); | |
267 } | |
268 | |
269 | |
270 unless($num_seq >=1) { | |
271 die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\n"; | |
272 } | |
273 | |
274 | |
275 | |
276 $num_seq = $aligns_with->num_sequences(); | |
277 #and then any patterns that overlap | |
278 if($num_seq>1) { | |
279 | |
280 foreach my $sequence1 ($aligns_with->each_seq() ) { | |
281 | |
282 foreach my $sequence2 ($aligns_with->each_seq() ) { | |
283 next if($sequence1 eq $sequence2); | |
284 | |
285 my ($seq1_st, $seq1_en, $seq2_st, $seq2_en); | |
286 | |
287 my (%hash1, %hash2, $num_1, $num_2, %smaller, %larger); | |
288 | |
289 #see if patterns overlap - find pattern start ends and collect column positions | |
290 foreach my $feat1 ($sequence1->all_SeqFeatures() ) { | |
291 | |
292 $seq1_st = $feat1->start() if(!$seq1_st or $feat1->start() < $seq1_st); | |
293 $seq1_en = $feat1->start() if(!$seq1_en or $feat1->start() > $seq1_en); | |
294 } | |
295 | |
296 foreach my $feat2 ($sequence2->all_SeqFeatures() ) { | |
297 | |
298 $seq2_st = $feat2->start() if(!$seq2_st or $feat2->start() < $seq2_st); | |
299 $seq2_en = $feat2->start() if(!$seq2_en or $feat2->start() > $seq2_en); | |
300 } | |
301 | |
302 #then see if patterns overlap - remove sequence with pattern of least identity | |
303 if(($seq1_st >= $seq2_st and $seq1_st <= $seq2_en) or ($seq2_st >= $seq1_st and $seq2_st <= $seq1_en)) { | |
304 my $remove = _identity($query_seq, $sequence1, $sequence2); | |
305 $seq_to_remove{$remove}= $remove; | |
306 } | |
307 } | |
308 | |
309 } | |
310 } | |
311 | |
312 #Now remove any patterns which need removing | |
313 foreach my $remove (keys %seq_to_remove) { | |
314 $aligns_with->remove_seq($seq_to_remove{$remove}); | |
315 $num_seq = $aligns_with->num_sequences(); | |
316 last if($num_seq eq "1"); #just in case the % identities are identical | |
317 } | |
318 | |
319 | |
320 $num_seq = $aligns_with->num_sequences(); | |
321 unless($num_seq >=1) { | |
322 die "FATAL: All sequences that align with active site sequences have been removed - this should never happen\n"; | |
323 } | |
324 | |
325 | |
326 | |
327 #Add features to seq | |
328 foreach my $sequence ($aligns_with->each_seq() ) { | |
329 foreach my $feat ($sequence->all_SeqFeatures() ) { | |
330 | |
331 my $actual_pos = $query_seq->location_from_column($feat->start); | |
332 $actual_pos = $actual_pos->start(); | |
333 | |
334 | |
335 push(@as_res, $actual_pos); | |
336 | |
337 | |
338 | |
339 } | |
340 } | |
341 return \@as_res | |
342 | |
343 } | |
344 | |
345 | |
346 =head2 _identity | |
347 | |
348 Title : _identity | |
349 Usage : _identity($sequence1 , $sequence2, $sequence3) | |
350 Function : Identifies seq with lowest % identity to sequence1 | |
351 Returns : The sequence which has the lowest % id to sequence 1 | |
352 Args : sequence1, sequence2, sequence3. | |
353 | |
354 =cut | |
355 | |
356 | |
357 sub _identity { | |
358 my $seq1 = shift; | |
359 my @aligns_with = @_; | |
360 my $lower_identity=100; | |
361 my $lower_identity_seq; | |
362 foreach my $s (@aligns_with) { | |
363 my $tmp_aln = new Bio::SimpleAlign; | |
364 $tmp_aln->add_seq($s); | |
365 $tmp_aln->add_seq($seq1); | |
366 | |
367 my $identity = $tmp_aln->percentage_identity(); | |
368 if($identity < $lower_identity) { | |
369 $lower_identity = $identity; | |
370 $lower_identity_seq = $s; | |
371 } | |
372 | |
373 } | |
374 return $lower_identity_seq; | |
375 } | |
376 | |
377 =head1 COPYRIGHT | |
378 | |
379 Copyright (c) 2007: Genome Research Ltd. | |
380 | |
381 Authors: Rob Finn (rdf@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), | |
382 Jaina Mistry (jm14@sanger.ac.uk) | |
383 | |
384 This is free software; you can redistribute it and/or modify it under | |
385 the terms of the GNU General Public License as published by the Free Software | |
386 Foundation; either version 2 of the License, or (at your option) any later | |
387 version. | |
388 | |
389 This program is distributed in the hope that it will be useful, but WITHOUT | |
390 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
391 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | |
392 details. | |
393 | |
394 You should have received a copy of the GNU General Public License along with | |
395 this program. If not, see <http://www.gnu.org/licenses/>. | |
396 | |
397 =cut | |
398 | |
399 1; |