comparison corebio/seq_io/plain_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 #!/usr/bin/env python
2
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
4 #
5 # This software is distributed under the MIT Open Source License.
6 # <http://www.opensource.org/licenses/mit-license.html>
7 #
8 # Permission is hereby granted, free of charge, to any person obtaining a
9 # copy of this software and associated documentation files (the "Software"),
10 # to deal in the Software without restriction, including without limitation
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 # and/or sell copies of the Software, and to permit persons to whom the
13 # Software is furnished to do so, subject to the following conditions:
14 #
15 # The above copyright notice and this permission notice shall be included
16 # in all copies or substantial portions of the Software.
17 #
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 # THE SOFTWARE.
25 #
26
27 """Read and write raw, unformatted sequence data. The whole file is read
28 in as a sequence. Whitespace is removed.
29
30
31 --- Example Plain/Raw/Text File ---
32
33 --------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF
34 --------------------------LENLEDLF-WELDRLD------NYNDTSLVENH-
35 --------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY
36 -----------------------------------YTSDN---------YSGSGDYDSNK
37 -SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD
38 --LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD
39 -SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD
40 -EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD
41 """
42
43 from corebio.seq import *
44 from corebio.utils import remove_whitespace
45
# Sample of the plain/raw format: rows of sequence characters with no
# header lines. The last row is intentionally shorter than the others.
example = """
--------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF
--------------------------LENLEDLF-WELDRLD------NYNDTSLVENH-
--------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY
-----------------------------------YTSDN---------YSGSGDYDSNK
-SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD
--LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD
-SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD
-EPC-RDENVHFNRIFLPTIYFIIFLTGIV
"""

# Names this format is known by.
names = ("plain", "raw")
# File extensions associated with this format.
extensions = ("txt",)
59
def read(fin, alphabet=None):
    """Read a file of raw sequence data.

    Everything produced by iterseq() is collected and wrapped in a SeqList.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    return SeqList(list(iterseq(fin, alphabet)))
73
74
def iterseq(fin, alphabet=None):
    """Read the sequence data and yield one (and only one) sequence.

    Every non-blank line has its whitespace removed; the lines are then
    concatenated into a single sequence.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Yields:
        Seq -- A single sequence holding all of the input.
    Raises:
        ValueError -- If the file is unparsable: a line starts with '>'
                      or contains characters outside the alphabet.
    """
    alphabet = Alphabet(alphabet)
    lines = []
    # linenum is zero-based, as produced by enumerate().
    for linenum, line in enumerate(fin):
        line = line.strip()
        if not line:
            # Blank line. Testing the stripped text also skips an empty
            # string, which would otherwise fail on the '>' check below.
            continue

        if line.startswith('>'):  # Probably a fasta file. Fail.
            raise ValueError(
                "Parse Error on input line: %d " % (linenum))
        line = remove_whitespace(line)

        if not alphabet.alphabetic(line):
            raise ValueError(
                "Character on line: %d not in alphabet: %s : %s" %
                (linenum, alphabet, line))
        lines.append(line)

    yield Seq(''.join(lines), alphabet)
96 "Parse Error on input line: %d " % (linenum) )
97 line = remove_whitespace(line)
98
99 if not alphabet.alphabetic(line) :
100 raise ValueError(
101 "Character on line: %d not in alphabet: %s : %s" % \
102 (linenum, alphabet, line) )
103 lines.append(line)
104
105 yield Seq(''.join(lines), alphabet)
106
107
108
def write(afile, seqs):
    """Write raw sequence data, one line per sequence.

    Each sequence is passed to writeseq() in iteration order.

    arguments:
        afile -- A writable stream.
        seqs  -- A list of Seq's
    """
    for sequence in seqs:
        writeseq(afile, sequence)
118
119
def writeseq(afile, seq):
    """Write a single sequence in raw format.

    The string form of the sequence is written, followed by a newline.

    arguments:
        afile -- A writable stream.
        seq   -- A Seq instance
    """
    # Use write() rather than the Python 2-only "print >>afile" statement:
    # under Python 3 that form parses as an expression and raises TypeError.
    afile.write(str(seq) + "\n")
128
129