0
|
1 # Copyright (c) 2006 John Gilman
|
|
2 #
|
|
3 # This software is distributed under the MIT Open Source License.
|
|
4 # <http://www.opensource.org/licenses/mit-license.html>
|
|
5 #
|
|
6 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
7 # copy of this software and associated documentation files (the "Software"),
|
|
8 # to deal in the Software without restriction, including without limitation
|
|
9 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
10 # and/or sell copies of the Software, and to permit persons to whom the
|
|
11 # Software is furnished to do so, subject to the following conditions:
|
|
12 #
|
|
13 # The above copyright notice and this permission notice shall be included
|
|
14 # in all copies or substantial portions of the Software.
|
|
15 #
|
|
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22 # THE SOFTWARE.
|
|
23
|
|
24
|
|
25
|
|
26
|
|
27 """Read the output of a fasta sequence similarity search.
|
|
28
|
|
29 FASTA is a DNA and Protein sequence alignment software package first described
|
|
30 by David J. Lipman and William R. Pearson in 1985. In addition to rapid
|
|
31 heuristic search methods, the FASTA package provides SSEARCH, an implementation
|
|
32 of the optimal Smith Waterman algorithm.
|
|
33
|
|
34 The module can parse the output from fasta, ssearch and other search programs
|
|
35 in the fasta collection. It will parse both default ('-m 1') and compact
|
|
36 ('-m 9 -d 0') output.
|
|
37
|
|
38 Refs:
|
|
39 ftp.virginia.edu/pub/fasta
|
|
40 http://en.wikipedia.org/wiki/FASTA
|
|
41 """
|
|
42
|
|
43
|
|
44
|
|
45 from corebio.utils import Reiterate, Token, isblank
|
|
46 from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment
|
|
47 from math import floor
|
|
48 import re
|
|
49
|
|
50 __all__ = 'read'
|
|
51
|
|
52 _rangere = re.compile(r"\((\d+)-\d+:(\d+)-\d+\)")
|
|
53
|
|
54
|
|
55 def read(fin) :
|
|
56 """Read and parse a fasta search output file.
|
|
57
|
|
58 returns: a list of Results
|
|
59 """
|
|
60 scanner = _scan(fin)
|
|
61
|
|
62 report = None
|
|
63 result = None
|
|
64 hit = None
|
|
65 #query_seq = None
|
|
66 #target_seq = None
|
|
67 alignment_num = 0
|
|
68
|
|
69
|
|
70 for token in scanner :
|
|
71 #print token
|
|
72 typeof = token.typeof
|
|
73 value = token.data
|
|
74
|
|
75 if typeof == 'begin_report' :
|
|
76 report = Report()
|
|
77 elif typeof == 'algorithm' :
|
|
78 report.algorithm = value
|
|
79 elif typeof == 'algorithm_version' :
|
|
80 report.algorithm_version = value
|
|
81 elif typeof == 'algorithm_reference' :
|
|
82 report.algorithm_reference = value
|
|
83 elif typeof == 'database_name' :
|
|
84 report.database_name = value
|
|
85 elif typeof == 'database_letters' :
|
|
86 report.database_letters = value
|
|
87 elif typeof == 'database_entries' :
|
|
88 report.database_entries = value
|
|
89 elif typeof == 'end_report' :
|
|
90 # Final sanity checking
|
|
91 break
|
|
92 elif typeof == 'parameter' :
|
|
93 key = value[0]
|
|
94 value = value[1]
|
|
95 report.parameters[key] = value
|
|
96
|
|
97 elif typeof == 'begin_result' :
|
|
98 result = Result()
|
|
99 report.results.append(result)
|
|
100
|
|
101 elif typeof == 'query_name' :
|
|
102 result.query.name = value
|
|
103 elif typeof == 'query_description' :
|
|
104 result.query.description = value
|
|
105 elif typeof == 'end_result' :
|
|
106 pass
|
|
107
|
|
108 elif typeof == 'begin_hit' :
|
|
109 hit = Hit()
|
|
110 elif typeof == 'target_name' :
|
|
111 hit.target.name = value
|
|
112 elif typeof == 'target_description' :
|
|
113 hit.target.description = value
|
|
114 elif typeof == 'target_length' :
|
|
115 hit.target.length = value
|
|
116 elif typeof == 'raw_score' :
|
|
117 hit.raw_score = value
|
|
118 elif typeof == 'bit_score' :
|
|
119 hit.bit_score = value
|
|
120 elif typeof == 'significance' :
|
|
121 hit.significance = value
|
|
122 elif typeof == 'end_hit' :
|
|
123 result.hits.append(hit)
|
|
124 hit = None
|
|
125
|
|
126 elif typeof == 'begin_alignment' :
|
|
127 alignment = Alignment()
|
|
128 tseq = []
|
|
129 qseq = []
|
|
130 elif typeof == 'end_alignment' :
|
|
131 tseq = ''.join(tseq)
|
|
132 qseq = ''.join(qseq)
|
|
133 L = max (len(tseq), len(qseq) )
|
|
134 tseq = tseq.ljust(L).replace(' ', '.')
|
|
135 qseq = qseq.ljust(L).replace(' ', '.')
|
|
136 alignment.query_seq = tseq
|
|
137 alignment.target_seq = qseq
|
|
138 result.hits[alignment_num].alignments.append(alignment)
|
|
139 alignment_num+=1
|
|
140 tseq = None
|
|
141 qseq = None
|
|
142 elif typeof == 'target_seq' :
|
|
143 tseq += value
|
|
144 elif typeof == 'query_seq' :
|
|
145 qseq += value
|
|
146 elif typeof == 'alignment_raw_score' :
|
|
147 alignment.raw_score = value
|
|
148
|
|
149 elif typeof == 'alignment_bit_score' :
|
|
150 alignment.bit_score = value
|
|
151 elif typeof == 'alignment_significance' :
|
|
152 alignment.significance = value
|
|
153 elif typeof == 'alignment_length' :
|
|
154 alignment.length = value
|
|
155 elif typeof == 'alignment_similar' :
|
|
156 alignment.similar = value
|
|
157 elif typeof == 'alignment_identical' :
|
|
158 alignment.identical = value
|
|
159 elif typeof == 'alignment_query_start' :
|
|
160 alignment.query_start = value
|
|
161 elif typeof == 'alignment_target_start' :
|
|
162 alignment.target_start = value
|
|
163
|
|
164 else:
|
|
165 # Should never get here.
|
|
166 raise RuntimeError("Unrecoverable internal parse error (SPE)")
|
|
167 pass
|
|
168
|
|
169
|
|
170 return report
|
|
171 # End method read()
|
|
172
|
|
173
|
|
174 def _scan(fin) :
|
|
175
|
|
176 def next_nonempty(i) :
|
|
177 L = i.next()
|
|
178 while L.strip() == '': L = i.next()
|
|
179 return L
|
|
180
|
|
181
|
|
182 lines = Reiterate(iter(fin))
|
|
183 try :
|
|
184
|
|
185 yield Token("begin_report", lineno= lines.index())
|
|
186
|
|
187 # find header line : "SSEARCH searches a sequence data bank"
|
|
188 L = lines.next()
|
|
189
|
|
190 if L[0] == '#' :
|
|
191 yield Token("parameter", ("command", L[1:].strip()), lines.index())
|
|
192 L = lines.next()
|
|
193
|
|
194 while not L : L= lines.next()
|
|
195 algorithm = L.split()[0]
|
|
196 expected = [ "SSEARCH", "FASTA","TFASTA","FASTX",
|
|
197 "FASTY","TFASTX","TFASTY"]
|
|
198 if algorithm not in expected:
|
|
199 raise ValueError("Parse failed: line %d" % lines.index() )
|
|
200 yield Token ("algorithm", algorithm, lines.index() )
|
|
201
|
|
202 # Next line should be the version
|
|
203 L = lines.next()
|
|
204 if not L.startswith(" version") :
|
|
205 raise ValueError("Parse failed: Cannot find version.")
|
|
206 yield Token( "algorithm_version", L[8:].split()[0].strip(), lines.index())
|
|
207
|
|
208 # Algorithm reference
|
|
209 L = lines.next()
|
|
210 if not L.startswith("Please cite:") :
|
|
211 raise ValueError("Parse failed: Expecting citation" + L)
|
|
212 cite = lines.next().strip() + ' ' + lines.next().strip()
|
|
213 yield Token( "algorithm_reference", cite)
|
|
214
|
|
215 # Find line "searching testset.fa library"
|
|
216 L = lines.next()
|
|
217 while not L.startswith("searching") : L = lines.next()
|
|
218 yield Token("database_name", L[10:-8], lines.index() )
|
|
219
|
|
220 # Results
|
|
221 L = lines.next()
|
|
222 while isblank(L) : L = lines.next()
|
|
223 if ">>>" not in L :
|
|
224 raise ValueError("Parse failed on line %d: " % lines.index())
|
|
225
|
|
226 while ">>>" in L :
|
|
227 yield Token("begin_result", lineno= lines.index())
|
|
228 index = L.find('>>>')
|
|
229 (name, description) = L[index+3:].split(' ',1)
|
|
230 yield Token("query_name", name, lines.index())
|
|
231 yield Token("query_description", description, lines.index())
|
|
232
|
|
233 while not L.startswith("The best scores are:") :
|
|
234 L = lines.next()
|
|
235 L = lines.next()
|
|
236 # hits
|
|
237 while not isblank(L) :
|
|
238 lineno = lines.index()
|
|
239 desc = L[0:49]
|
|
240 yield Token("begin_hit", lineno= lineno)
|
|
241 yield Token("target_description", desc, lineno, 0)
|
|
242 yield Token("target_name", desc.split(' ',1)[0], lineno, 0)
|
|
243 yield Token("target_length", int(L[52:56]), lineno, 52)
|
|
244 fields = L[57:].split()
|
|
245 raw, bit, sig = fields[0], fields[1], fields[2]
|
|
246 #print raw, bit, sig
|
|
247 yield Token("raw_score", float(raw), lineno, 57)
|
|
248 yield Token("bit_score", float(bit), lineno)
|
|
249 yield Token("significance", float(sig), lineno)
|
|
250 yield Token("end_hit", lineno=lineno)
|
|
251 L = lines.next()
|
|
252
|
|
253 # Optimal alignment information
|
|
254 L = next_nonempty(lines)
|
|
255 #print ">>>", L, L.startswith('>>')
|
|
256 while L.startswith('>>'):
|
|
257 if L.startswith('>>>') : break
|
|
258
|
|
259 yield Token("begin_alignment", lineno=lines.index() )
|
|
260
|
|
261 # 1 2 3 4
|
|
262 #01234567890123456789012345678901234567890123456789
|
|
263 # s-w opt: 46 Z-score: 70.7 bits: 18.5 E(): 3.6
|
|
264 L = lines.next()
|
|
265 fields = L.split()
|
|
266 raw, bit, sig = fields[2], fields[6], fields[8]
|
|
267 yield Token("alignment_raw_score", float(raw), lineno)
|
|
268 yield Token("alignment_bit_score", float(bit), lineno)
|
|
269 yield Token("alignment_significance", float(sig), lineno)
|
|
270
|
|
271 #Smith-Waterman score: 46; 38.095% identity (71.429% similar) in 21 aa overlap (2-22:36-56)
|
|
272 L = lines.next()
|
|
273 lineno = lines.index()
|
|
274 fields = L.split()
|
|
275 assert( len(fields) ==12)
|
|
276 alen = int(fields[8])
|
|
277 identical = int( floor(0.5+alen* float(fields[3][:-1])/100.))
|
|
278 similar = int( floor(0.5+alen* float(fields[3][:-1])/100.))
|
|
279 yield Token("alignment_length", alen, lineno)
|
|
280 yield Token("alignment_similar", similar, lineno)
|
|
281 yield Token("alignment_identical", identical, lineno)
|
|
282
|
|
283 m = _rangere.match( fields[11])
|
|
284 assert (m is not None)
|
|
285 yield Token("alignment_query_start", int(m.group(1))-1, lineno)
|
|
286 yield Token("alignment_target_start", int(m.group(2))-1, lineno)
|
|
287
|
|
288
|
|
289 count = 1
|
|
290 while True:
|
|
291 L = lines.next()
|
|
292 count += 1
|
|
293
|
|
294
|
|
295
|
|
296 if L.startswith('>>'): break
|
|
297 if '>>>' in L:
|
|
298 lines.push(L)
|
|
299 break
|
|
300 if 'residues' in L and 'sequences' in L :
|
|
301 lines.push(L)
|
|
302 break
|
|
303 if not L or L[0].isspace() : continue
|
|
304
|
|
305
|
|
306 # there are 2 lines before the first query sequence (but
|
|
307 # we started the count at 1). There is 1 line between query
|
|
308 # and target, 3 lines between target and query, unless the
|
|
309 # query ends before the ends and the target wraps onto another
|
|
310 # Then there are two lines between target and target.
|
|
311
|
|
312 # Smith-Waterman score: 34; 35.294% identity ...
|
|
313 #
|
|
314 # 30 40 50 60 70
|
|
315 # d1pfsa EGFLHLEDKPHPLQCQFFVESVIPAGSYQVPYRINVNNG-RPELAFDFKAMKRA
|
|
316 # : . . .:: .: .::
|
|
317 # d8rxna MKKYVCTVCGYEYDPAEGDPDNGVKPGTSFDDLPADWVCPVCGA
|
|
318 # 10 20 30 40
|
|
319 #
|
|
320 # d8rxna PKSEFEAA
|
|
321 # 50
|
|
322
|
|
323 lineno=lines.index()
|
|
324 if count==4 :
|
|
325 yield Token("query_seq", L[7:].rstrip(), lineno)
|
|
326 else :
|
|
327 yield Token("target_seq", L[7:].rstrip(),lineno)
|
|
328 count = 0
|
|
329
|
|
330 yield Token("end_alignment", lineno=lines.index() )
|
|
331 yield Token("end_result", lineno= lines.index())
|
|
332 L = next_nonempty(lines)
|
|
333 # End results
|
|
334
|
|
335 # "13355 residues in 93 query sequences"
|
|
336 # "13355 residues in 93 library sequences"
|
|
337 #print '>>', L
|
|
338 LL = L.split()
|
|
339 yield Token("database_letters",int(LL[0]), lines.index() )
|
|
340 yield Token("database_entries", int(LL[3]), lines.index() )
|
|
341
|
|
342 yield Token("end_report", lineno= lines.index())
|
|
343 except StopIteration :
|
|
344 raise ValueError("Premature end of file ")
|
|
345
|
|
346
|
|
347
|
|
348
|
|
349
|