Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/intelligenetics_io.py @ 4:4d47ab2b7bcc
Uploaded
author | davidmurphy |
---|---|
date | Fri, 13 Jan 2012 07:18:19 -0500 |
parents | c55bdc2fb9fa |
children |
comparison
equal
deleted
inserted
replaced
3:09d2dac9ef73 | 4:4d47ab2b7bcc |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com> | |
4 # | |
5 # This software is distributed under the MIT Open Source License. | |
6 # <http://www.opensource.org/licenses/mit-license.html> | |
7 # | |
8 # Permission is hereby granted, free of charge, to any person obtaining a | |
9 # copy of this software and associated documentation files (the "Software"), | |
10 # to deal in the Software without restriction, including without limitation | |
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
12 # and/or sell copies of the Software, and to permit persons to whom the | |
13 # Software is furnished to do so, subject to the following conditions: | |
14 # | |
15 # The above copyright notice and this permission notice shall be included | |
16 # in all copies or substantial portions of the Software. | |
17 # | |
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
24 # THE SOFTWARE. | |
25 # | |
26 | |
27 """Read and write sequence information in IntelliGenetics format. | |
28 | |
29 A sequence file in IG format can contain several sequences, each consisting of a | |
30 number of comment lines that must begin with a semicolon (";"), a line with the | |
31 sequence name and the sequence itself terminated with the termination character | |
32 '1' for linear or '2' for circular sequences. The termination character is | |
33 de facto optional. | |
34 | |
35 --- Example IG File --- | |
36 | |
37 ;H.sapiens fau mRNA, 518 bases | |
38 HSFAU | |
39 ttcctctttctcgactccatcttcgcggtagctgggaccgccgttcagtc | |
40 actcttaagtcttttgtaattctggctttctctaataaaaaagccactta | |
41 gttcagtcaaaaaaaaaa1 | |
42 ;H.sapiens fau 1 gene, 2016 bases | |
43 HSFAU1 | |
44 ctaccattttccctctcgattctatatgtacactcgggacaagttctcct | |
45 gatcgaaaacggcaaaactaaggccccaagtaggaatgccttagttttcg | |
46 gggttaacaatgattaacactgagcctcacacccacgcgatgccctcagc | |
47 tcctcgctcagcgctctcaccaacagccgtagcccgcagccccgctggac | |
48 accggttctccatccccgcagcgtagcccggaacatggtagctgccatct | |
49 ttacctgctacgccagccttctgtgcgcgcaactgtctggtcccgcccc2 | |
50 | |
51 """ | |
52 | |
53 from corebio.utils import * | |
54 from corebio.seq import * | |
55 from corebio.seq_io import * | |
56 | |
57 | |
# Format names under which this IO module is known.
names = ('intelligenetics', 'ig', 'stanford',)
# File extensions associated with the format. The trailing comma matters:
# ('ig') is just the string 'ig', so iterating it would yield 'i' and 'g'.
extensions = ('ig',)
60 | |
61 | |
# Sample IG-format text containing two records. Each record has one ';'
# comment line, a name line, and sequence lines whose last line ends with
# the termination character ('1' for the first record, '2' for the second).
example = """
;H.sapiens fau mRNA, 518 bases
HSFAU
ttcctctttctcgactccatcttcgcggtagctgggaccgccgttcagtc
actcttaagtcttttgtaattctggctttctctaataaaaaagccactta
gttcagtcaaaaaaaaaa1
;H.sapiens fau 1 gene, 2016 bases
HSFAU1
ctaccattttccctctcgattctatatgtacactcgggacaagttctcct
gatcgaaaacggcaaaactaaggccccaagtaggaatgccttagttttcg
gggttaacaatgattaacactgagcctcacacccacgcgatgccctcagc
tcctcgctcagcgctctcaccaacagccgtagcccgcagccccgctggac
accggttctccatccccgcagcgtagcccggaacatggtagctgccatct
ttacctgctacgccagccttctgtgcgcgcaactgtctggtcccgcccc2
"""
77 | |
78 | |
79 | |
80 | |
def read(fin, alphabet=None):
    """Parse a whole IG file and collect every sequence it contains.

    All sequences produced by iterseq() are gathered eagerly and wrapped
    in a SeqList.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    parsed = list(iterseq(fin, alphabet))
    return SeqList(parsed)
94 | |
95 | |
def iterseq(fin, alphabet=None):
    """Parse an IG file and generate sequences.

    A record is made of zero or more comment lines starting with ';',
    a name line, and one or more sequence lines. A record ends at a
    sequence line whose last character is '1' or '2', at the next ';'
    comment line, or at the end of the input. Blank lines are ignored.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Yields:
        Seq -- One alphabetic sequence at a time.
    Raises:
        ValueError -- If the file is unparsable
    """
    alphabet = Alphabet(alphabet)

    seqs = []            # sequence fragments of the current record
    header = []          # comment lines (without ';') of the current record
    start_lineno = None  # 1-based line on which the current record starts
    name = None

    def build_seq(seqs, alphabet, name, comments, lineno):
        # Join the collected fragments into one Seq; a ValueError raised
        # by the Seq constructor is re-raised with the record's position.
        try:
            desc = '\n'.join(comments)
            s = Seq("".join(seqs), alphabet, name=name, description=desc)
        except ValueError:
            raise ValueError(
                "Parsed failed with sequence starting at line %d: "
                "Character not in alphabet: %s" % (lineno, alphabet))
        return s

    # Count lines from 1 so reported positions match what an editor shows.
    for lineno, line in enumerate(fin, 1):
        line = line.strip()
        if line == '':
            continue
        if line.startswith(';'):
            if seqs:
                # A comment after sequence data means the previous record
                # ended without a termination character.
                yield build_seq(seqs, alphabet, name, header, start_lineno)
                header = []
                seqs = []
                name = None
                start_lineno = None
            if start_lineno is None:
                # Only the first line of a record marks its start.
                start_lineno = lineno
            header.append(line[1:])
        elif not name:
            if start_lineno is None:
                # Record without any comment header starts at its name.
                start_lineno = lineno
            name = line
        elif line[-1] == '1' or line[-1] == '2':
            # End of sequence: drop the termination character.
            seqs.append(remove_whitespace(line[0:-1]))
            yield build_seq(seqs, alphabet, name, header, start_lineno)
            header = []
            seqs = []
            name = None
            start_lineno = None
        else:
            seqs.append(remove_whitespace(line))

    # The final record may lack a termination character.
    if seqs:
        yield build_seq(seqs, alphabet, name, header, start_lineno)
151 | |
152 | |
153 | |
154 | |
155 | |
def write(fout, seqs):
    """Write every sequence in seqs to fout in IG format.

    Each sequence is emitted, in order, by writeseq().

    Args:
        fout -- A writable stream.
        seqs -- A list of Seq's
    Raises:
        ValueError -- If a sequence is missing a name
    """
    for record in seqs:
        writeseq(fout, record)
167 | |
168 | |
def writeseq(fout, seq):
    """Write a single sequence in IG format.

    The record consists of one ';'-prefixed line per line of the
    sequence description, the sequence name on its own line, the
    sequence wrapped at 80 characters per line, and a trailing blank
    line. No '1'/'2' termination character is written.

    Args:
        fout -- A writable stream.
        seq -- A Seq instance
    Raises:
        ValueError -- If a sequence is missing a name
    """
    # Validate before emitting anything, so a failure does not leave a
    # partially written record on the stream.
    if not seq.name:
        raise ValueError(
            "Write failed with missing sequence name: %s" % str(seq))

    desc = seq.description or ''

    # We prepend ';' to each line
    for h in desc.splitlines():
        fout.write(';' + h + '\n')

    fout.write('%s\n' % (seq.name,))

    L = len(seq)
    line_length = 80
    # Floor division keeps the chunk count an integer on every Python
    # version. When L is an exact multiple of line_length the last chunk
    # is empty and shows up as an extra empty line.
    for n in range(1 + L // line_length):
        chunk = seq[n * line_length: (n + 1) * line_length]
        fout.write('%s\n' % (chunk,))
    # Blank line separating this record from the next.
    fout.write('\n')
194 | |
195 | |
196 | |
197 | |
198 | |
199 | |
200 | |
201 | |
202 | |
203 |