comparison corebio/ssearch_io/fasta.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
# Copyright (c) 2006 John Gilman
#
# This software is distributed under the MIT Open Source License.
# <http://www.opensource.org/licenses/mit-license.html>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


"""Read the output of a fasta sequence similarity search.

FASTA is a DNA and protein sequence alignment software package first described
by David J. Lipman and William R. Pearson in 1985. In addition to rapid
heuristic search methods, the FASTA package provides SSEARCH, an implementation
of the optimal Smith-Waterman algorithm.

This module can parse the output of fasta, ssearch and the other search
programs in the FASTA collection. It will parse both the default ('-m 1') and
compact ('-m 9 -d 0') output formats.

Refs:
    ftp.virginia.edu/pub/fasta
    http://en.wikipedia.org/wiki/FASTA
"""
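
# Note (illustrative, not part of the original module): input suitable for this
# parser comes straight from the FASTA-package programs, e.g. something like
# "ssearch36 query.fa library.fa > results.txt" for the default ('-m 1') style
# output, or adding "-m 9 -d 0" for the compact format. The program name and
# file names here are placeholders.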


from corebio.utils import Reiterate, Token, isblank
from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment
from math import floor
import re

__all__ = ('read', )

# Matches the overlap coordinates at the end of a Smith-Waterman summary line,
# e.g. "(2-22:36-56)", capturing the query start and the target start.
_rangere = re.compile(r"\((\d+)-\d+:(\d+)-\d+\)")


def read(fin) :
    """Read and parse a fasta search output file.

    returns: a Report object whose 'results' attribute holds one Result
    per query.
    """
    scanner = _scan(fin)

    report = None
    result = None
    hit = None
    #query_seq = None
    #target_seq = None
    alignment_num = 0


    for token in scanner :
        #print token
        typeof = token.typeof
        value = token.data

        if typeof == 'begin_report' :
            report = Report()
        elif typeof == 'algorithm' :
            report.algorithm = value
        elif typeof == 'algorithm_version' :
            report.algorithm_version = value
        elif typeof == 'algorithm_reference' :
            report.algorithm_reference = value
        elif typeof == 'database_name' :
            report.database_name = value
        elif typeof == 'database_letters' :
            report.database_letters = value
        elif typeof == 'database_entries' :
            report.database_entries = value
        elif typeof == 'end_report' :
            # Final sanity checking
            break
        elif typeof == 'parameter' :
            key = value[0]
            value = value[1]
            report.parameters[key] = value

        elif typeof == 'begin_result' :
            result = Result()
            report.results.append(result)
            # Alignments are indexed per result, so restart the counter.
            alignment_num = 0

        elif typeof == 'query_name' :
            result.query.name = value
        elif typeof == 'query_description' :
            result.query.description = value
        elif typeof == 'end_result' :
            pass

        elif typeof == 'begin_hit' :
            hit = Hit()
        elif typeof == 'target_name' :
            hit.target.name = value
        elif typeof == 'target_description' :
            hit.target.description = value
        elif typeof == 'target_length' :
            hit.target.length = value
        elif typeof == 'raw_score' :
            hit.raw_score = value
        elif typeof == 'bit_score' :
            hit.bit_score = value
        elif typeof == 'significance' :
            hit.significance = value
        elif typeof == 'end_hit' :
            result.hits.append(hit)
            hit = None

        elif typeof == 'begin_alignment' :
            alignment = Alignment()
            tseq = []
            qseq = []
        elif typeof == 'end_alignment' :
            tseq = ''.join(tseq)
            qseq = ''.join(qseq)
            L = max(len(tseq), len(qseq))
            tseq = tseq.ljust(L).replace(' ', '.')
            qseq = qseq.ljust(L).replace(' ', '.')
            # qseq holds the query rows, tseq the target rows.
            alignment.query_seq = qseq
            alignment.target_seq = tseq
            result.hits[alignment_num].alignments.append(alignment)
            alignment_num += 1
            tseq = None
            qseq = None
        elif typeof == 'target_seq' :
            tseq += value
        elif typeof == 'query_seq' :
            qseq += value
        elif typeof == 'alignment_raw_score' :
            alignment.raw_score = value

        elif typeof == 'alignment_bit_score' :
            alignment.bit_score = value
        elif typeof == 'alignment_significance' :
            alignment.significance = value
        elif typeof == 'alignment_length' :
            alignment.length = value
        elif typeof == 'alignment_similar' :
            alignment.similar = value
        elif typeof == 'alignment_identical' :
            alignment.identical = value
        elif typeof == 'alignment_query_start' :
            alignment.query_start = value
        elif typeof == 'alignment_target_start' :
            alignment.target_start = value

        else:
            # Should never get here.
            raise RuntimeError("Unrecoverable internal parse error (SPE)")


    return report
# End method read()


def _scan(fin) :

    def next_nonempty(i) :
        L = i.next()
        while L.strip() == '': L = i.next()
        return L


    lines = Reiterate(iter(fin))
    try :

        yield Token("begin_report", lineno=lines.index())

        # find header line : "SSEARCH searches a sequence data bank"
        L = lines.next()

        if L[0] == '#' :
            yield Token("parameter", ("command", L[1:].strip()), lines.index())
            L = lines.next()

        while not L : L = lines.next()
        algorithm = L.split()[0]
        expected = ["SSEARCH", "FASTA", "TFASTA", "FASTX",
                    "FASTY", "TFASTX", "TFASTY"]
        if algorithm not in expected:
            raise ValueError("Parse failed: line %d" % lines.index())
        yield Token("algorithm", algorithm, lines.index())

        # Next line should be the version
        L = lines.next()
        if not L.startswith(" version") :
            raise ValueError("Parse failed: Cannot find version.")
        yield Token("algorithm_version", L[8:].split()[0].strip(), lines.index())

        # Algorithm reference
        L = lines.next()
        if not L.startswith("Please cite:") :
            raise ValueError("Parse failed: Expecting citation, got: " + L)
        cite = lines.next().strip() + ' ' + lines.next().strip()
        yield Token("algorithm_reference", cite)

        # Find line "searching testset.fa library"
        L = lines.next()
        while not L.startswith("searching") : L = lines.next()
        yield Token("database_name", L[10:-8], lines.index())

        # Results
        L = lines.next()
        while isblank(L) : L = lines.next()
        if ">>>" not in L :
            raise ValueError("Parse failed on line %d" % lines.index())

        while ">>>" in L :
            yield Token("begin_result", lineno=lines.index())
            index = L.find('>>>')
            (name, description) = L[index+3:].split(' ', 1)
            yield Token("query_name", name, lines.index())
            yield Token("query_description", description, lines.index())

            while not L.startswith("The best scores are:") :
                L = lines.next()
            L = lines.next()
            # hits
            while not isblank(L) :
                lineno = lines.index()
                desc = L[0:49]
                yield Token("begin_hit", lineno=lineno)
                yield Token("target_description", desc, lineno, 0)
                yield Token("target_name", desc.split(' ', 1)[0], lineno, 0)
                yield Token("target_length", int(L[52:56]), lineno, 52)
                fields = L[57:].split()
                raw, bit, sig = fields[0], fields[1], fields[2]
                #print raw, bit, sig
                yield Token("raw_score", float(raw), lineno, 57)
                yield Token("bit_score", float(bit), lineno)
                yield Token("significance", float(sig), lineno)
                yield Token("end_hit", lineno=lineno)
                L = lines.next()

            # Optimal alignment information
            L = next_nonempty(lines)
            #print ">>>", L, L.startswith('>>')
            while L.startswith('>>'):
                if L.startswith('>>>') : break

                yield Token("begin_alignment", lineno=lines.index())

                #          1         2         3         4
                #01234567890123456789012345678901234567890123456789
                # s-w opt:  46  Z-score: 70.7  bits: 18.5  E():  3.6
                L = lines.next()
                lineno = lines.index()
                fields = L.split()
                raw, bit, sig = fields[2], fields[6], fields[8]
                yield Token("alignment_raw_score", float(raw), lineno)
                yield Token("alignment_bit_score", float(bit), lineno)
                yield Token("alignment_significance", float(sig), lineno)

                # Smith-Waterman score: 46; 38.095% identity (71.429% similar) in 21 aa overlap (2-22:36-56)
                L = lines.next()
                lineno = lines.index()
                fields = L.split()
                assert len(fields) == 12
                alen = int(fields[8])
                identical = int(floor(0.5 + alen * float(fields[3][:-1]) / 100.))
                # Percent similar is the parenthesized field, e.g. "(71.429%".
                similar = int(floor(0.5 + alen * float(fields[5][1:-1]) / 100.))
                yield Token("alignment_length", alen, lineno)
                yield Token("alignment_similar", similar, lineno)
                yield Token("alignment_identical", identical, lineno)

                m = _rangere.match(fields[11])
                assert m is not None
                yield Token("alignment_query_start", int(m.group(1)) - 1, lineno)
                yield Token("alignment_target_start", int(m.group(2)) - 1, lineno)


                count = 1
                while True:
                    L = lines.next()
                    count += 1

                    if L.startswith('>>'): break
                    if '>>>' in L:
                        lines.push(L)
                        break
                    if 'residues' in L and 'sequences' in L :
                        lines.push(L)
                        break
                    if not L or L[0].isspace() : continue

                    # There are 2 lines before the first query sequence (but
                    # we started the count at 1). There is 1 line between query
                    # and target, and 3 lines between target and query, unless
                    # the query ends before the target does and the target
                    # wraps onto another line; then there are two lines between
                    # target and target.
                    #
                    # Smith-Waterman score: 34; 35.294% identity ...
                    #
                    #         30        40        50        60        70
                    # d1pfsa EGFLHLEDKPHPLQCQFFVESVIPAGSYQVPYRINVNNG-RPELAFDFKAMKRA
                    #           : .    .  .::   .:  .::
                    # d8rxna MKKYVCTVCGYEYDPAEGDPDNGVKPGTSFDDLPADWVCPVCGA
                    #           10        20        30        40
                    #
                    # d8rxna PKSEFEAA
                    #               50

                    lineno = lines.index()
                    if count == 4 :
                        yield Token("query_seq", L[7:].rstrip(), lineno)
                    else :
                        yield Token("target_seq", L[7:].rstrip(), lineno)
                    count = 0

                yield Token("end_alignment", lineno=lines.index())
            yield Token("end_result", lineno=lines.index())
            L = next_nonempty(lines)
        # End results

        # "13355 residues in 93 query sequences"
        # "13355 residues in 93 library sequences"
        #print '>>', L
        LL = L.split()
        yield Token("database_letters", int(LL[0]), lines.index())
        yield Token("database_entries", int(LL[3]), lines.index())

        yield Token("end_report", lineno=lines.index())
    except StopIteration :
        raise ValueError("Premature end of file")
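

# The stub below is an illustrative sketch, not part of the original module:
# run this file directly with the path to a fasta/ssearch output file and it
# prints one summary line per hit, using the Report, Result and Hit objects
# that read() builds above.
if __name__ == "__main__" :
    import sys
    report = read(open(sys.argv[1]))
    for result in report.results :
        for hit in result.hits :
            print result.query.name, hit.target.name, hit.raw_score, \
                hit.bit_score, hit.significance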