annotate corebio/ssearch_io/fasta.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 # Copyright (c) 2006 John Gilman
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3 # This software is distributed under the MIT Open Source License.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 # <http://www.opensource.org/licenses/mit-license.html>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 # Permission is hereby granted, free of charge, to any person obtaining a
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7 # copy of this software and associated documentation files (the "Software"),
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 # to deal in the Software without restriction, including without limitation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10 # and/or sell copies of the Software, and to permit persons to whom the
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11 # Software is furnished to do so, subject to the following conditions:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
13 # The above copyright notice and this permission notice shall be included
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
14 # in all copies or substantial portions of the Software.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
18 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
19 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
20 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
21 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
22 # THE SOFTWARE.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
23
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
24
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
25
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
26
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
27 """Read the output of a fasta sequence similarity search.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
28
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
29 FASTA is a DNA and Protein sequence alignment software package first described
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
30 by David J. Lipman and William R. Pearson in 1985. In addition to rapid
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
31 heuristic search methods, the FASTA package provides SSEARCH, an implementation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
32 of the optimal Smith Waterman algorithm.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
33
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
34 The module can parse the output from fasta, ssearch and other search programs
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
35 in the fasta collection. It will parse both default ('-m 1') and compact
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
36 ('-m 9 -d 0') output.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
37
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
38 Refs:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
39 ftp.virginia.edu/pub/fasta
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
40 http://en.wikipedia.org/wiki/FASTA
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
41 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
42
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
43
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
44
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
45 from corebio.utils import Reiterate, Token, isblank
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
46 from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
47 from math import floor
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
48 import re
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
49
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Public names exported by ``from <module> import *``.  This must be a
# sequence of strings: a bare string would be iterated one character at a
# time and the import would look for names 'r', 'e', 'a' and 'd'.
__all__ = ['read']

# Matches the overlap range that ends an alignment summary line, e.g.
# "(2-22:36-56)".  Group 1 is the first number of the left-hand range and
# group 2 the first number of the right-hand range.
_rangere = re.compile(r"\((\d+)-\d+:(\d+)-\d+\)")
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
53
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
54
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(fin):
    """Read and parse a fasta search output file.

    The input is tokenized by ``_scan`` and the token stream is folded into
    a ``Report``.  The report holds one ``Result`` per query; each result
    collects the ``Hit`` objects listed in its score table, and each
    alignment that follows is attached, in order, to the corresponding hit
    of the same result.

    fin -- an iterable of text lines (e.g. an open file) passed to ``_scan``.

    returns: a Report

    raises: ValueError if the scanner cannot parse the input or the input
            ends prematurely; RuntimeError if the scanner emits a token
            type this function does not know about.
    """
    scanner = _scan(fin)

    report = None
    result = None
    hit = None
    alignment = None
    tseq = None
    qseq = None
    # Index, within the current result, of the hit that receives the next
    # alignment.
    alignment_num = 0

    for token in scanner:
        typeof = token.typeof
        value = token.data

        if typeof == 'begin_report':
            report = Report()
        elif typeof == 'algorithm':
            report.algorithm = value
        elif typeof == 'algorithm_version':
            report.algorithm_version = value
        elif typeof == 'algorithm_reference':
            report.algorithm_reference = value
        elif typeof == 'database_name':
            report.database_name = value
        elif typeof == 'database_letters':
            report.database_letters = value
        elif typeof == 'database_entries':
            report.database_entries = value
        elif typeof == 'end_report':
            break
        elif typeof == 'parameter':
            # The token data is a (key, value) pair.
            key = value[0]
            value = value[1]
            report.parameters[key] = value

        elif typeof == 'begin_result':
            result = Result()
            report.results.append(result)
            # Alignments are matched to hits by position *within* a result,
            # so the counter has to restart for every new query.
            alignment_num = 0
        elif typeof == 'query_name':
            result.query.name = value
        elif typeof == 'query_description':
            result.query.description = value
        elif typeof == 'end_result':
            pass

        elif typeof == 'begin_hit':
            hit = Hit()
        elif typeof == 'target_name':
            hit.target.name = value
        elif typeof == 'target_description':
            hit.target.description = value
        elif typeof == 'target_length':
            hit.target.length = value
        elif typeof == 'raw_score':
            hit.raw_score = value
        elif typeof == 'bit_score':
            hit.bit_score = value
        elif typeof == 'significance':
            hit.significance = value
        elif typeof == 'end_hit':
            result.hits.append(hit)
            hit = None

        elif typeof == 'begin_alignment':
            alignment = Alignment()
            tseq = []
            qseq = []
        elif typeof == 'end_alignment':
            tseq = ''.join(tseq)
            qseq = ''.join(qseq)
            # Pad the shorter sequence so both have the same length, then
            # mark every blank column with '.'.
            width = max(len(tseq), len(qseq))
            tseq = tseq.ljust(width).replace(' ', '.')
            qseq = qseq.ljust(width).replace(' ', '.')
            # Query pieces go to query_seq and target pieces to target_seq
            # (these two assignments used to be crossed).
            alignment.query_seq = qseq
            alignment.target_seq = tseq
            result.hits[alignment_num].alignments.append(alignment)
            alignment_num += 1
            tseq = None
            qseq = None
        elif typeof == 'target_seq':
            tseq.append(value)
        elif typeof == 'query_seq':
            qseq.append(value)
        elif typeof == 'alignment_raw_score':
            alignment.raw_score = value
        elif typeof == 'alignment_bit_score':
            alignment.bit_score = value
        elif typeof == 'alignment_significance':
            alignment.significance = value
        elif typeof == 'alignment_length':
            alignment.length = value
        elif typeof == 'alignment_similar':
            alignment.similar = value
        elif typeof == 'alignment_identical':
            alignment.identical = value
        elif typeof == 'alignment_query_start':
            alignment.query_start = value
        elif typeof == 'alignment_target_start':
            alignment.target_start = value

        else:
            # Should never get here: the scanner emitted an unknown token.
            raise RuntimeError("Unrecoverable internal parse error (SPE)")

    return report
# End method read()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
172
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
173
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def _scan(fin):
    """Tokenize the output of a fasta/ssearch run.

    Generator that walks the lines of ``fin`` and yields ``Token`` objects
    in document order: report header tokens (command parameter, algorithm,
    version, reference, database name), then for every query a
    begin_result ... end_result group containing the hit table
    (begin_hit ... end_hit) and the alignments
    (begin_alignment ... end_alignment), and finally the database size
    tokens followed by end_report.

    fin -- an iterable of text lines (e.g. an open file).

    raises: ValueError if an expected line is missing or malformed, or if
            the input ends before the report is complete.
    """

    def next_nonempty(i):
        # Return the next line that is not empty or pure whitespace.
        L = i.next()
        while L.strip() == '':
            L = i.next()
        return L

    lines = Reiterate(iter(fin))
    try:
        yield Token("begin_report", lineno=lines.index())

        # Optional first line: "# <command line used for the search>"
        L = lines.next()
        if L.startswith('#'):
            yield Token("parameter", ("command", L[1:].strip()), lines.index())
            L = lines.next()

        # Find header line : "SSEARCH searches a sequence data bank".
        # Lines read from a file keep their newline, so blank lines must be
        # detected by stripping rather than by truthiness.
        while L.strip() == '':
            L = lines.next()
        algorithm = L.split()[0]
        expected = ["SSEARCH", "FASTA", "TFASTA", "FASTX",
                    "FASTY", "TFASTX", "TFASTY"]
        if algorithm not in expected:
            raise ValueError("Parse failed: line %d" % lines.index())
        yield Token("algorithm", algorithm, lines.index())

        # Next line should be the version
        L = lines.next()
        if not L.startswith(" version"):
            raise ValueError("Parse failed: Cannot find version.")
        yield Token("algorithm_version", L[8:].split()[0].strip(),
                    lines.index())

        # Algorithm reference: the two lines after "Please cite:"
        L = lines.next()
        if not L.startswith("Please cite:"):
            raise ValueError("Parse failed: Expecting citation" + L)
        cite = lines.next().strip() + ' ' + lines.next().strip()
        yield Token("algorithm_reference", cite, lines.index())

        # Find line "searching testset.fa library"
        L = lines.next()
        while not L.startswith("searching"):
            L = lines.next()
        yield Token("database_name", L[10:-8], lines.index())

        # Results
        L = lines.next()
        while isblank(L):
            L = lines.next()
        if ">>>" not in L:
            raise ValueError("Parse failed on line %d: " % lines.index())

        while ">>>" in L:
            yield Token("begin_result", lineno=lines.index())
            index = L.find('>>>')
            (name, description) = L[index + 3:].split(' ', 1)
            yield Token("query_name", name, lines.index())
            yield Token("query_description", description, lines.index())

            while not L.startswith("The best scores are:"):
                L = lines.next()
            L = lines.next()

            # Hits: one fixed-column line per hit, ended by a blank line.
            while not isblank(L):
                lineno = lines.index()
                desc = L[0:49]
                yield Token("begin_hit", lineno=lineno)
                yield Token("target_description", desc, lineno, 0)
                yield Token("target_name", desc.split(' ', 1)[0], lineno, 0)
                yield Token("target_length", int(L[52:56]), lineno, 52)
                fields = L[57:].split()
                raw, bit, sig = fields[0], fields[1], fields[2]
                yield Token("raw_score", float(raw), lineno, 57)
                yield Token("bit_score", float(bit), lineno)
                yield Token("significance", float(sig), lineno)
                yield Token("end_hit", lineno=lineno)
                L = lines.next()

            # Optimal alignment information.  Each alignment starts with a
            # ">>name ..." line; ">>>" starts the next query instead.
            L = next_nonempty(lines)
            while L.startswith('>>'):
                if L.startswith('>>>'):
                    break

                yield Token("begin_alignment", lineno=lines.index())

                # s-w opt:  46  Z-score: 70.7  bits: 18.5 E(): 3.6
                L = lines.next()
                # Take the line number of *this* line; previously a stale
                # value from the hit table was reused here.
                lineno = lines.index()
                fields = L.split()
                raw, bit, sig = fields[2], fields[6], fields[8]
                yield Token("alignment_raw_score", float(raw), lineno)
                yield Token("alignment_bit_score", float(bit), lineno)
                yield Token("alignment_significance", float(sig), lineno)

                # Smith-Waterman score: 46; 38.095% identity
                #     (71.429% similar) in 21 aa overlap (2-22:36-56)
                L = lines.next()
                lineno = lines.index()
                fields = L.split()
                if len(fields) != 12:
                    raise ValueError(
                        "Parse failed on line %d: "
                        "malformed alignment summary" % lineno)
                alen = int(fields[8])
                # fields[3] is "38.095%" and fields[5] is "(71.429%".
                # Percentages are converted back to residue counts by
                # rounding to the nearest integer.
                identical = int(
                    floor(0.5 + alen * float(fields[3][:-1]) / 100.))
                similar = int(
                    floor(0.5 + alen * float(fields[5].strip('(%')) / 100.))
                yield Token("alignment_length", alen, lineno)
                yield Token("alignment_similar", similar, lineno)
                yield Token("alignment_identical", identical, lineno)

                m = _rangere.match(fields[11])
                if m is None:
                    raise ValueError(
                        "Parse failed on line %d: "
                        "malformed overlap range" % lineno)
                # Reported positions are 1-based; tokens carry 0-based ones.
                yield Token("alignment_query_start", int(m.group(1)) - 1,
                            lineno)
                yield Token("alignment_target_start", int(m.group(2)) - 1,
                            lineno)

                # 'count' is the number of lines read since the last
                # sequence line; it tells query lines from target lines.
                count = 1
                while True:
                    L = lines.next()
                    count += 1

                    if L.startswith('>>'):
                        # Header of the next alignment: keep it in L.
                        break
                    if '>>>' in L:
                        # Start of the next query: hand it back.
                        lines.push(L)
                        break
                    if 'residues' in L and 'sequences' in L:
                        # Database summary that ends the results.
                        lines.push(L)
                        break
                    if not L or L[0].isspace():
                        # Blank, ruler and match-symbol lines.
                        continue

                    # There are 2 lines before the first query sequence
                    # (but we started the count at 1).  There is 1 line
                    # between query and target, 3 lines between target and
                    # query, unless the query ends before the target and
                    # the target wraps onto another line.  Then there are
                    # two lines between target and target.
                    #
                    # Smith-Waterman score: 34; 35.294% identity ...
                    #
                    #            30        40        50        60
                    # d1pfsa EGFLHLEDKPHPLQCQFFVESVIPAGSYQVPYRINVNNG-RPEL
                    #                                 :  . . .::  .:  .::
                    # d8rxna MKKYVCTVCGYEYDPAEGDPDNGVKPGTSFDDLPADWVCPVCGA
                    #               10        20        30        40
                    #
                    # d8rxna PKSEFEAA
                    #            50

                    lineno = lines.index()
                    if count == 4:
                        yield Token("query_seq", L[7:].rstrip(), lineno)
                    else:
                        yield Token("target_seq", L[7:].rstrip(), lineno)
                    count = 0

                yield Token("end_alignment", lineno=lines.index())
            yield Token("end_result", lineno=lines.index())
            L = next_nonempty(lines)
        # End results

        # "13355 residues in 93 query sequences"
        # "13355 residues in 93 library sequences"
        LL = L.split()
        yield Token("database_letters", int(LL[0]), lines.index())
        yield Token("database_entries", int(LL[3]), lines.index())

        yield Token("end_report", lineno=lines.index())
    except StopIteration:
        raise ValueError("Premature end of file ")
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
345
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
346
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
347
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
348
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
349