comparison corebio/seq_io/intelligenetics_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 #!/usr/bin/env python
2
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
4 #
5 # This software is distributed under the MIT Open Source License.
6 # <http://www.opensource.org/licenses/mit-license.html>
7 #
8 # Permission is hereby granted, free of charge, to any person obtaining a
9 # copy of this software and associated documentation files (the "Software"),
10 # to deal in the Software without restriction, including without limitation
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 # and/or sell copies of the Software, and to permit persons to whom the
13 # Software is furnished to do so, subject to the following conditions:
14 #
15 # The above copyright notice and this permission notice shall be included
16 # in all copies or substantial portions of the Software.
17 #
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 # THE SOFTWARE.
25 #
26
27 """Read and write sequence information in IntelliGenetics format.
28
29 A sequence file in IG format can contain several sequences, each consisting of a
30 number of comment lines that must begin with a semicolon (";"), a line with the
31 sequence name and the sequence itself terminated with the termination character
32 '1' for linear or '2' for circular sequences. The termination character is
33 de facto optional.
34
35 --- Example IG File ---
36
37 ;H.sapiens fau mRNA, 518 bases
38 HSFAU
39 ttcctctttctcgactccatcttcgcggtagctgggaccgccgttcagtc
40 actcttaagtcttttgtaattctggctttctctaataaaaaagccactta
41 gttcagtcaaaaaaaaaa1
42 ;H.sapiens fau 1 gene, 2016 bases
43 HSFAU1
44 ctaccattttccctctcgattctatatgtacactcgggacaagttctcct
45 gatcgaaaacggcaaaactaaggccccaagtaggaatgccttagttttcg
46 gggttaacaatgattaacactgagcctcacacccacgcgatgccctcagc
47 tcctcgctcagcgctctcaccaacagccgtagcccgcagccccgctggac
48 accggttctccatccccgcagcgtagcccggaacatggtagctgccatct
49 ttacctgctacgccagccttctgtgcgcgcaactgtctggtcccgcccc2
50
51 """
52
53 from corebio.utils import *
54 from corebio.seq import *
55 from corebio.seq_io import *
56
57
# Names by which this sequence format is known.
names = ('intelligenetics', 'ig', 'stanford',)

# File extensions associated with this format. The trailing comma is
# required: ('ig') is just the string 'ig', which would iterate as the
# single characters 'i' and 'g' instead of as one extension.
extensions = ('ig',)
60
61
# Sample IG-format data containing two records. Each record has one ';'
# comment line, a name line, and sequence lines; the first record ends with
# the terminator '1' and the second with '2'.
example = """
;H.sapiens fau mRNA, 518 bases
HSFAU
ttcctctttctcgactccatcttcgcggtagctgggaccgccgttcagtc
actcttaagtcttttgtaattctggctttctctaataaaaaagccactta
gttcagtcaaaaaaaaaa1
;H.sapiens fau 1 gene, 2016 bases
HSFAU1
ctaccattttccctctcgattctatatgtacactcgggacaagttctcct
gatcgaaaacggcaaaactaaggccccaagtaggaatgccttagttttcg
gggttaacaatgattaacactgagcctcacacccacgcgatgccctcagc
tcctcgctcagcgctctcaccaacagccgtagcccgcagccccgctggac
accggttctccatccccgcagcgtagcccggaacatggtagctgccatct
ttacctgctacgccagccttctgtgcgcgcaactgtctggtcccgcccc2
"""
77
78
79
80
def read(fin, alphabet=None):
    """Parse a complete IG file and collect every sequence it contains.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Returns:
        SeqList -- The sequences produced by iterseq(), in file order
    Raises:
        ValueError -- If the file is unparsable
    """
    # Drain the generator fully before wrapping the result in a SeqList.
    return SeqList(list(iterseq(fin, alphabet)))
94
95
def iterseq(fin, alphabet=None):
    """Parse an IG file and generate sequences.

    A record consists of optional comment lines starting with ';', a name
    line, and sequence lines. A record ends at a line whose last character
    is '1' or '2', at the next comment line (once sequence data has been
    read), or at the end of the input. Blank lines are ignored.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Yields:
        Seq -- One alphabetic sequence at a time.
    Raises:
        ValueError -- If the file is unparsable. The message reports the
            zero-based line number (as counted by enumerate) of the first
            line of the offending record.
    """
    alphabet = Alphabet(alphabet)

    seqs = []           # sequence fragments of the current record
    header = []         # comment lines of the current record, ';' removed
    start_lineno = -1   # line on which the current record began
    name = None

    def build_seq(seqs, alphabet, name, comments, lineno):
        # Assemble a Seq from the collected pieces, converting a failure
        # into a ValueError that says where the record started.
        try:
            desc = '\n'.join(comments)
            s = Seq("".join(seqs), alphabet, name=name, description=desc)
        except ValueError:
            raise ValueError(
                "Parsed failed with sequence starting at line %d: "
                "Character not in alphabet: %s" % (lineno, alphabet))
        return s

    for lineno, line in enumerate(fin):
        line = line.strip()
        if line == '':
            continue
        if line.startswith(';'):
            if seqs:
                # A comment after sequence data ends an unterminated record.
                yield build_seq(seqs, alphabet, name, header, start_lineno)
                header = []
                seqs = []
                name = None
            if not header:
                # First comment line of a new record.
                start_lineno = lineno
            header.append(line[1:])
        elif not name:
            if not header:
                # Record without comment lines: it starts at its name line.
                # Previously start_lineno kept a stale value in this case.
                start_lineno = lineno
            name = line
        elif line[-1] == '1' or line[-1] == '2':
            # End of sequence: drop the termination character.
            seqs.append(remove_whitespace(line[0:-1]))
            yield build_seq(seqs, alphabet, name, header, start_lineno)
            header = []
            seqs = []
            name = None
        else:
            seqs.append(remove_whitespace(line))

    # The termination character is optional, so flush a trailing record.
    if seqs:
        yield build_seq(seqs, alphabet, name, header, start_lineno)
    return
151
152
153
154
155
def write(fout, seqs):
    """Write every sequence in seqs to fout in IG format.

    Args:
        fout -- A writable stream.
        seqs -- A list of Seq's
    Raises:
        ValueError -- If a sequence is missing a name
    """
    # Records are emitted one after another, in the order given.
    for record in seqs:
        writeseq(fout, record)
167
168
def writeseq(fout, seq):
    """Write a single sequence in IG format.

    The record is written as one ';'-prefixed line per line of the
    description, then the sequence name, then the sequence data wrapped at
    80 characters per line, followed by a blank line. No termination
    character ('1' or '2') is written.

    Args:
        fout -- A writable stream.
        seq -- A Seq instance
    Raises:
        ValueError -- If the sequence is missing a name. Nothing is written
            to fout in that case.
    """
    # Validate before emitting anything, so a nameless sequence does not
    # leave orphaned comment lines in the output stream.
    if not seq.name:
        raise ValueError(
            "Write failed with missing sequence name: %s" % str(seq))

    desc = seq.description or ''

    # We prepend ';' to each line. fout.write is used instead of the
    # 'print >>fout' statement so the code runs on Python 2 and Python 3.
    for h in desc.splitlines():
        fout.write(';' + h + '\n')

    fout.write(str(seq.name) + '\n')

    L = len(seq)
    line_length = 80
    # When L is an exact multiple of line_length (including 0) the final
    # slice is empty and an empty line is written; this matches the
    # historical output.
    for n in range(1 + L // line_length):
        fout.write(str(seq[n * line_length:(n + 1) * line_length]) + '\n')
    fout.write('\n')
194
195
196
197
198
199
200
201
202
203