comparison corebio/seq_io/nbrf_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1
2 # Copyright (c) 2006, The Regents of the University of California, through
3 # Lawrence Berkeley National Laboratory (subject to receipt of any required
4 # approvals from the U.S. Dept. of Energy). All rights reserved.
5
6 # This software is distributed under the new BSD Open Source License.
7 # <http://www.opensource.org/licenses/bsd-license.html>
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions are met:
11 #
12 # (1) Redistributions of source code must retain the above copyright notice,
13 # this list of conditions and the following disclaimer.
14 #
15 # (2) Redistributions in binary form must reproduce the above copyright
16 # notice, this list of conditions and the following disclaimer in the
17 # documentation and or other materials provided with the distribution.
18 #
19 # (3) Neither the name of the University of California, Lawrence Berkeley
20 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
21 # may be used to endorse or promote products derived from this software
22 # without specific prior written permission.
23 #
24 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 # POSSIBILITY OF SUCH DAMAGE.
35
36 """Sequence IO for NBRF/PIR format.
37
38 The format is similar to fasta. The header line consists of '>', a two-
39 letter sequence type (P1, F1, DL, DC, RL, RC, or XX), a semicolon, and a
40 sequence ID. The next line is a textual description of the sequence,
41 followed by one or more lines containing the sequence data. The end of
42 the sequence is marked by a "*" (asterisk) character.
43
44 type_code -- A map between NBRF two letter type codes and Alphabets.
45
46
47 see: http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html
48
49 --- Example NBRF File ---
50
51 >P1;CRAB_ANAPL
52 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
53 MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
54 SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
55 GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
56 SDVPERSIPI TREEKPAIAG AQRK*
57
58 >P1;CRAB_BOVIN
59 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
60 MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
61 PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
62 HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
63 QASGPERTIP ITREEKPAVT AAPKK*
64
65 """
66
67 from corebio.utils import *
68 from corebio.seq import *
69 from corebio.seq_io import *
70
# Names by which this sequence format is identified.
# NOTE(review): presumably consulted by corebio.seq_io when picking a parser
# -- confirm against the package's dispatch code.
names = (
    'nbrf',
    'pir',
)

# File name extensions (without the leading dot) associated with the format.
extensions = (
    'nbrf',
    'pir',
    'ali',
)
73
74
75
76
# Map from the two-letter NBRF/PIR type code found in a record's header line
# to the alphabet used for that record's sequence data.
type_code = {
    # Proteins
    'P1': protein_alphabet,    # complete protein
    'F1': protein_alphabet,    # protein fragment
    # DNA
    'DL': dna_alphabet,        # linear DNA
    'DC': dna_alphabet,        # circular DNA
    # RNA
    'RC': rna_alphabet,        # circular RNA
    'RL': rna_alphabet,        # linear RNA
    'N3': rna_alphabet,        # tRNA
    'N1': rna_alphabet,        # other functional RNA
    # Anything else
    'XX': generic_alphabet,
}
88
def read(fin, alphabet=None):
    """Parse a whole NBRF sequence file into a SeqList.

    All records produced by iterseq() are collected before the list is
    built, so the input is consumed completely.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data. If not supplied, then
            an appropriate alphabet will be inferred from the data.
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    records = list(iterseq(fin, alphabet))
    return SeqList(records)
103
104
105
def iterseq(fin, alphabet=None):
    """Generate sequences from an NBRF file, one record at a time.

    A record is a header line ('>', a two-letter type code, ';' and the
    sequence ID), followed by one description line, followed by one or more
    lines of sequence data. The record ends at the first data line whose
    last non-whitespace character is '*'. All whitespace inside the sequence
    data is discarded, and blank lines between or inside records are skipped.

    arguments:
        fin      -- A stream or file to read (any iterable of text lines)
        alphabet -- The alphabet to assign to every sequence. If not
                    supplied, the alphabet is looked up in type_code using
                    the type code of each record's header line.
    yields :
        Seq
    raises :
        ValueError -- On a parse error: text outside a record that is not a
                      header line, a header line without exactly one ';',
                      or (when no alphabet is supplied) an unknown type code.
    """

    body, header, sequence = range(3)  # Internal states

    state = body
    seq_id = None
    seq_desc = None
    seq_alpha = None
    seqs = []

    # enumerate() starts at zero, so line numbers in messages are 0-based.
    for lineno, line in enumerate(fin):
        if state == body:
            # Between records: skip blank lines, expect a header line.
            if not line or line.isspace():
                continue
            if line[0] != '>':
                raise ValueError("Parse error on line: %d" % lineno)

            fields = line[1:].split(';')
            if len(fields) != 2:
                # Report a malformed header explicitly, with its line number,
                # instead of failing on tuple unpacking.
                raise ValueError("Parse error on line: %d" % lineno)
            seq_type, seq_id = fields

            if alphabet:
                seq_alpha = alphabet
            else:
                try:
                    seq_alpha = type_code[seq_type]
                except KeyError:
                    # An unrecognised type code is a parse error; do not
                    # leak the KeyError from the table lookup.
                    raise ValueError(
                        "Unknown NBRF type code %r on line: %d"
                        % (seq_type, lineno))
            state = header

        elif state == header:
            # The single line after the header is the free-text description.
            seq_desc = line.strip()
            state = sequence

        else:
            # state == sequence: accumulate data until the '*' terminator.
            data = "".join(line.split())  # Strip out white space
            if not data:
                # Blank line inside the sequence data; indexing data[-1]
                # here would raise IndexError.
                continue

            if not data.endswith('*'):
                seqs.append(data)
                continue

            # End of sequence data
            seqs.append(data[:-1])
            yield Seq("".join(seqs), name=seq_id.strip(),
                      description=seq_desc, alphabet=seq_alpha)

            # Reset for the next record.
            state = body
            seq_id = None
            seq_desc = None
            seqs = []

    # NOTE(review): a final record whose '*' terminator never arrives is not
    # yielded and no error is raised. Kept as is; confirm whether truncated
    # input should be reported instead.
168
169