0
|
1
|
|
2 # Copyright (c) 2006, The Regents of the University of California, through
|
|
3 # Lawrence Berkeley National Laboratory (subject to receipt of any required
|
|
4 # approvals from the U.S. Dept. of Energy). All rights reserved.
|
|
5
|
|
6 # This software is distributed under the new BSD Open Source License.
|
|
7 # <http://www.opensource.org/licenses/bsd-license.html>
|
|
8 #
|
|
9 # Redistribution and use in source and binary forms, with or without
|
|
10 # modification, are permitted provided that the following conditions are met:
|
|
11 #
|
|
12 # (1) Redistributions of source code must retain the above copyright notice,
|
|
13 # this list of conditions and the following disclaimer.
|
|
14 #
|
|
15 # (2) Redistributions in binary form must reproduce the above copyright
|
|
16 # notice, this list of conditions and the following disclaimer in the
|
|
17 # documentation and or other materials provided with the distribution.
|
|
18 #
|
|
19 # (3) Neither the name of the University of California, Lawrence Berkeley
|
|
20 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
|
|
21 # may be used to endorse or promote products derived from this software
|
|
22 # without specific prior written permission.
|
|
23 #
|
|
24 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
25 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
26 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
27 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
28 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
29 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
30 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
31 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
32 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
33 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
34 # POSSIBILITY OF SUCH DAMAGE.
|
|
35
|
|
36 """Sequence IO for NBRF/PIR format.
|
|
37
|
|
38 The format is similar to fasta. The header line consists of '>', a two-
|
|
39 letter sequence type (P1, F1, DL, DC, RL, RC, or XX), a semicolon, and a
|
|
40 sequence ID. The next line is a textual description of the sequence,
|
|
41 followed by one or more lines containing the sequence data. The end of
|
|
42 the sequence is marked by a "*" (asterisk) character.
|
|
43
|
|
44 type_code -- A map between NBRF two letter type codes and Alphabets.
|
|
45
|
|
46
|
|
47 see: http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html
|
|
48
|
|
49 --- Example NBRF File ---
|
|
50
|
|
51 >P1;CRAB_ANAPL
|
|
52 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
|
|
53 MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
|
|
54 SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
|
|
55 GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
|
|
56 SDVPERSIPI TREEKPAIAG AQRK*
|
|
57
|
|
58 >P1;CRAB_BOVIN
|
|
59 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
|
|
60 MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
|
|
61 PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
|
|
62 HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
|
|
63 QASGPERTIP ITREEKPAVT AAPKK*
|
|
64
|
|
65 """
|
|
66
|
|
67 from corebio.utils import *
|
|
68 from corebio.seq import *
|
|
69 from corebio.seq_io import *
|
|
70
|
|
# Names for this sequence format (presumably how the seq_io framework
# identifies it -- verify against corebio.seq_io).
names = ("nbrf", "pir",)

# File extensions associated with this format.
extensions = ('nbrf', 'pir', 'ali')


# Map from NBRF two-letter sequence type codes to alphabets.
# Following the DL/DC convention, RL is linear and RC is circular RNA.
type_code = {
    'P1': protein_alphabet,   # Protein (complete)
    'F1': protein_alphabet,   # Protein (fragment)
    'DL': dna_alphabet,       # DNA (linear)
    'DC': dna_alphabet,       # DNA (circular)
    'RC': rna_alphabet,       # RNA (circular)
    'RL': rna_alphabet,       # RNA (linear)
    'N3': rna_alphabet,       # tRNA
    'N1': rna_alphabet,       # other functional RNA
    'XX': generic_alphabet,   # Unknown / unspecified sequence type
}
|
|
88
|
|
def read(fin, alphabet=None):
    """Parse an NBRF sequence file and collect every sequence it contains.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data. If not supplied, then
            an appropriate alphabet will be inferred from the data.
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    # iterseq does the actual parsing; drain it into a list before wrapping.
    return SeqList(list(iterseq(fin, alphabet)))
|
|
103
|
|
104
|
|
105
|
|
def iterseq(fin, alphabet=None):
    """Generate sequences from an NBRF file, one record at a time.

    A record is a header line of the form '>TYPE;ID', followed by a single
    description line, followed by one or more lines of sequence data. The
    sequence data is terminated by a '*' at the end of a line.

    Args:
        fin -- A stream or file to read (any iterable of text lines)
        alphabet -- The alphabet to assign to every sequence. If not
            supplied, the alphabet is looked up in type_code using the
            two-letter type from each header line.
    Yields:
        Seq -- One sequence per complete record
    Raises:
        ValueError -- On a parse error: a non-blank line outside a record
            that does not start with '>', a header that is not of the
            form '>TYPE;ID', or an unknown type code when no alphabet
            was supplied.

    Note: a record that is not terminated by '*' before the end of the
    input is discarded without raising an error.
    """
    body, header, sequence = range(3)  # Internal parser states

    state = body
    seq_id = None
    seq_desc = None
    seq_alpha = None
    seqs = []

    # NB: lineno is zero-based (enumerate's default start).
    for lineno, line in enumerate(fin):
        if state == body:
            # Between records: skip blank lines, expect a '>' header.
            if line == "" or line.isspace():
                continue
            if line[0] != '>':
                raise ValueError("Parse error on line: %d" % lineno)

            fields = line[1:].split(';')
            if len(fields) != 2:
                # Header must contain exactly one ';' separating type and id.
                raise ValueError("Parse error on line: %d" % lineno)
            seq_type, seq_id = fields

            if alphabet:
                seq_alpha = alphabet
            else:
                try:
                    seq_alpha = type_code[seq_type]
                except KeyError:
                    # Report unknown codes as the documented ValueError.
                    raise ValueError(
                        "Unknown sequence type %r on line: %d"
                        % (seq_type, lineno))
            state = header

        elif state == header:
            # The line immediately after the header is the description.
            seq_desc = line.strip()
            state = sequence

        elif state == sequence:
            data = "".join(line.split())  # Strip out white space

            # endswith() rather than data[-1]: a blank line inside the
            # sequence data yields an empty string, which must not crash.
            if not data.endswith('*'):
                seqs.append(data)
                continue

            # End of sequence data
            seqs.append(data[:-1])
            yield Seq("".join(seqs), name=seq_id.strip(),
                      description=seq_desc, alphabet=seq_alpha)

            # Reset per-record state for the next record.
            state = body
            seq_id = None
            seq_desc = None
            seqs = []

        else:
            # If we ever get here something has gone terribly wrong.
            # Raised explicitly so the check survives python -O.
            raise AssertionError("Unknown parser state: %r" % (state,))
|
|
168
|
|
169
|