comparison corebio/seq_io/phylip_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 #!/usr/bin/env python
2
3 # Copyright (c) 2005 David D. Ding <dding@berkeley.edu>
4 #
5 # This software is distributed under the MIT Open Source License.
6 # <http://www.opensource.org/licenses/mit-license.html>
7 #
8 # Permission is hereby granted, free of charge, to any person obtaining a
9 # copy of this software and associated documentation files (the "Software"),
10 # to deal in the Software without restriction, including without limitation
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 # and/or sell copies of the Software, and to permit persons to whom the
13 # Software is furnished to do so, subject to the following conditions:
14 #
15 # The above copyright notice and this permission notice shall be included
16 # in all copies or substantial portions of the Software.
17 #
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 # THE SOFTWARE.
25 #
26
27 """Read Sequences in interleaved Phylip format (not sequential) and returns a
28 list of sequences. Phylips is a very common phylogeny generating sequence type
29 that has the following traits
30 1) First line contains number of species and number of characters in a species'
31 sequence. Options can may follow, and they can be spaced or unspaced. Options are
32 simply letters such as A and W after the number of characters.
33 2) Options doesn't have to contain U in order for a usertree to appear.
34 3) If there are options then options appear first, then the sequences. For the
35 first iteration of sequences the first ten spaces are reserved for names of
36 options and species, the rest is for sequences.
37 4) For the second and following iterations the names are removed, only
38 sequence appears
39 4) At end of file an usertree may appear. First there is a number that indicts
40 the number of lines the usertree will take, and then the usertrees follow.
41
42 Examples:
43 6 50 W
44 W 0101001111 0101110101 01011
45 dmras1 GTCGTCGTTG GACCTGGAGG CGTGG
46 hschras GTGGTGGTGG GCGCCGGCCG TGTGG
47 ddrasa GTTATTGTTG GTGGTGGTGG TGTCG
48 spras GTAGTTGTAG GAGATGGTGG TGTTG
49 scras1 GTAGTTGTCG GTGGAGGTGG CGTTG
50 scras2 GTCGTCGTTG GTGGTGGTGG TGTTG
51
52 0101001111 0101110101 01011
53 GTCGTCGTTG GACCTGGAGG CGTGG
54 GTGGTGGTGG GCGCCGGCCG TGTGG
55 GTTATTGTTG GTGGTGGTGG TGTCG
56 GTAGTTGTAG GAGATGGTGG TGTTG
57 GTAGTTGTCG GTGGAGGTGG CGTTG
58 GTCGTCGTTG GTGGTGGTGG TGTTG
59
60 1
61 ((dmras1,ddrasa),((hschras,spras),(scras1,scras2)));
62
63
64 """
65
66 from corebio.seq import *
67
68 names = ( 'phylip',)
69 extensions = ('phy',)
70
71 def iterseq(fin, alphabet=None):
72 """Iterate over the sequences in the file."""
73 # Default implementation
74 return iter(read(fin, alphabet) )
75
76
77 #Read takes in a phylip file name, read it, processes it, and returns a SeqList
78 def read(fin, alphabet=None):
79
80
81 sequence=[] #where sequences are stored
82 idents=[]
83 num_seq=0
84 num_total_seq=0 #length of sequence of 1 species
85 tracker=0 #track what sequence the line is on
86 usertree_tracker=0 #track usertree lines
87 options='' #options
88 num_options=0 #number/lens of options - U
89
90 line=fin.readline()
91 while line:
92 s_line=line.split() #for ease of use, not used in all scenarios, but easier on the eye
93
94 if s_line == []: #see nothing do nothing
95 pass
96
97 elif (s_line[0].isdigit() and len(s_line) == 1 and len(sequence)==num_seq and len(sequence[0])==num_total_seq): #identifies usertree
98 usertree_tracker = int(s_line[0])
99 pass
100
101 elif num_options > 0:
102 if len(sequence) < num_seq:
103 if s_line[0][0] in options:
104 num_options -= 1
105 pass
106 else:
107 raise ValueError('Not an option, but it should be one')
108 else:
109 num_options -= 1
110 pass
111
112 elif usertree_tracker > 0: #baskically skip usertree
113 if len(sequence[num_seq-1]) == num_total_seq:
114 usertree_tracker -=1
115 pass
116 else:
117 raise ValueError('User Tree in Wrong Place')
118
119 #####problems parse error unexpected
120 elif s_line[0].isdigit():
121 if len(s_line) >= 2 and len(sequence) == 0: #identifies first line of file
122 num_seq = int(s_line[0]) #get number of sequences
123 num_total_seq = int(s_line[1]) #get length of sequences
124 if len(s_line) > 2: #takes care of the options
125 options= (''.join(s_line[2:]))
126 num_options=len(options) - options.count('U')
127 else:
128 raise ValueError('parse error')
129
130
131 #when options end, this take care of the sequence
132 elif num_options == 0:
133 if (num_seq==0):
134 raise ValueError("Empty File, or possibly wrong file")
135 elif tracker < num_seq:
136 if num_seq > len(sequence):
137 sequence.append(''.join(line[10:].split())) #removes species name
138 idents.append(line[0:10].strip())
139 tracker +=1
140
141 else:
142 sequence[tracker] += (''.join(s_line))
143 tracker +=1
144
145 if tracker == num_seq:
146 tracker = 0
147 num_options = len(options)-options.count('U')
148
149 line=fin.readline()
150
151 if len(sequence) != len(idents) or len(sequence)!=num_seq:
152 raise ValueError("Number of different sequences wrong")
153
154 seqs = []
155 for i in range (0, len(idents)):
156 if len(sequence[i])==num_total_seq:
157 seqs.append(Seq(sequence[i], alphabet, idents[i]))
158 else:
159 raise ValueError("extra sequence in list")
160
161 return SeqList(seqs)
162
163
164
165
166
167