annotate corebio/seq_io/array_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 #!/usr/bin/env python
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5 # This software is distributed under the MIT Open Source License.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 # <http://www.opensource.org/licenses/mit-license.html>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 # Permission is hereby granted, free of charge, to any person obtaining a
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 # copy of this software and associated documentation files (the "Software"),
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10 # to deal in the Software without restriction, including without limitation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12 # and/or sell copies of the Software, and to permit persons to whom the
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
13 # Software is furnished to do so, subject to the following conditions:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
14 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15 # The above copyright notice and this permission notice shall be included
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16 # in all copies or substantial portions of the Software.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
24 # THE SOFTWARE.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
25 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
26
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
27 """Read and write a rectangular array of sequence data.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
28
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
29 One sequence per line and nothing else. Each line must contain the same number
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
30 of characters. Blank lines and white space are ignored.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
31
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
32 --- Example Array ---
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
33
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
34 --------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
35 --------------------------LENLEDLF-WELDRLD------NYNDTSLVENH-
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
36 --------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
37 -----------------------------------YTSDN---------YSGSGDYDSNK
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
38 -SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
39 --LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
40 -SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
41 -EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
42 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
43
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
44 from corebio.seq import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
45 from corebio.utils import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
46
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# A small sample alignment in this module's format: one sequence per line,
# every line the same length, with '-' marking gaps.
example = """
--------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF
--------------------------LENLEDLF-WELDRLD------NYNDTSLVENH-
--------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY
-----------------------------------YTSDN---------YSGSGDYDSNK
-SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD
--LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD
-SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD
-EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD
"""

# Names by which this format is known.
names = ("array", "flatfile")

# No file extensions are associated with this format.
extensions = ()
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
60
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(fin, alphabet=None):
    """Read a file of raw sequence alignment data.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    # Exhaust the iterseq generator up front so the whole input has been
    # parsed before the SeqList is constructed.
    parsed = list(iterseq(fin, alphabet))
    return SeqList(parsed)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
74
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
75
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def iterseq(fin, alphabet=None):
    """Read sequence data one line at a time and yield each sequence.

    Args:
        fin -- A stream or file to read (any iterable of text lines)
        alphabet -- The expected alphabet of the data, if given
    Yields:
        Seq -- One alphabetic sequence at a time.
    Raises:
        ValueError -- If a line starts with '>', contains a character that
            is not in the alphabet, or differs in length from the
            preceding sequence.
    """
    alphabet = Alphabet(alphabet)
    line_length = 0

    # NOTE: linenum is zero-based and is reported as such in error messages.
    for linenum, line in enumerate(fin):
        line = line.strip()
        if not line:
            # Blank line. Testing after strip() also covers a completely
            # empty string (no newline), for which str.isspace() is False
            # and indexing line[0] would raise IndexError.
            continue

        if line.startswith('>'):  # probably a fasta file. Fail.
            raise ValueError(
                "Parse Error on input line: %d " % (linenum))

        # Drop any white space embedded within the sequence itself.
        line = remove_whitespace(line)

        if not alphabet.alphabetic(line):
            raise ValueError(
                "Character on line: %d not in alphabet: %s : %s" %
                (linenum, alphabet, line))

        # The array must be rectangular: every sequence has the same length.
        if line_length and line_length != len(line):
            raise ValueError("Line %d has a incommensurate length." % linenum)
        line_length = len(line)

        yield Seq(line, alphabet)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
111
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
112
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def write(afile, seqs):
    """Write raw sequence data, one line per sequence.

    arguments:
        afile -- A writable stream.
        seqs -- A list of Seq's
    """
    # Delegate to writeseq so both functions share one output format.
    for record in seqs:
        writeseq(afile, record)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
122
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
123
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def writeseq(afile, seq):
    """Write a single sequence in raw format.

    The sequence's string form is written followed by a newline.

    arguments:
        afile -- A writable stream.
        seq -- A Seq instance
    """
    # Explicit write() calls instead of the Python 2 only statement
    # `print >>afile, seq`, which raises TypeError under Python 3.
    # The output is the same: str(seq) followed by a single newline.
    afile.write(str(seq))
    afile.write("\n")
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
132
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
133