0
|
1 #!/usr/bin/env python2.5
|
|
2
|
|
3 import sys
|
|
4 import cdblib
|
|
5
|
|
6 def _openfile( filename=None, mode='r' ):
|
|
7 try:
|
|
8 fh = open( filename, mode )
|
|
9 except IOError, err:
|
|
10 raise RuntimeError( "can't open file: %s\n" % str( err ) )
|
|
11 return fh
|
|
12
|
|
13 def get_filename_from_loc( species=None, filename=None ):
|
|
14 fh = _openfile( filename )
|
|
15 for line in fh:
|
|
16 if line and not line.startswith( '#' ):
|
|
17 line = line.rstrip( '\r\n' )
|
|
18 if line:
|
|
19 elems = line.split( '\t' )
|
|
20 if len( elems ) >= 2 and elems[0] == species:
|
|
21 return elems[1]
|
|
22 return None
|
|
23
|
|
24
|
|
25 class SnpFile( object ):
|
|
26 def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ):
|
|
27 self.filename = filename
|
|
28 self.fh = _openfile( filename )
|
|
29 self.seq_col = seq_col
|
|
30 self.pos_col = pos_col
|
|
31 self.ref_seq_col = ref_seq_col
|
|
32 self.ref_pos_col = ref_pos_col
|
|
33 self.elems = None
|
|
34 self.line = None
|
|
35 self.comments = []
|
|
36
|
|
37 def next( self ):
|
|
38 while self.fh:
|
|
39 try:
|
|
40 self.line = self.fh.next()
|
|
41 except StopIteration:
|
|
42 self.line = None
|
|
43 self.elems = None
|
|
44 return None
|
|
45 if self.line:
|
|
46 self.line = self.line.rstrip( '\r\n' )
|
|
47 if self.line:
|
|
48 if self.line.startswith( '#' ):
|
|
49 self.comments.append( self.line )
|
|
50 else:
|
|
51 self.elems = self.line.split( '\t' )
|
|
52 return 1
|
|
53
|
|
54 def get_seq_pos( self ):
|
|
55 if self.elems:
|
|
56 return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ]
|
|
57 else:
|
|
58 return None, None
|
|
59
|
|
60 def get_ref_seq_pos( self ):
|
|
61 if self.elems:
|
|
62 return self.elems[ self.ref_seq_seq - 1 ], self.elems[ self.ref_pos_col - 1 ]
|
|
63 else:
|
|
64 return None, None
|
|
65
|
|
66
|
|
67 class IndexedFile( object ):
|
|
68
|
|
69 def __init__( self, data_file=None, index_file=None ):
|
|
70 self.data_file = data_file
|
|
71 self.index_file = index_file
|
|
72 self.data_fh = _openfile( data_file )
|
|
73 self.index_fh = _openfile( index_file )
|
|
74 self._reader = cdblib.Reader( self.index_fh.read(), hash )
|
|
75
|
|
76 def get_indexed_line( self, key=None ):
|
|
77 line = None
|
|
78 if key in self._reader:
|
|
79 offset = self._reader.getint( key )
|
|
80 self.data_fh.seek( offset )
|
|
81 try:
|
|
82 line = self.data_fh.next()
|
|
83 except StopIteration:
|
|
84 raise RuntimeError( 'index file out of sync for %s' % key )
|
|
85 return line
|
|
86
|
|
87 class PrimersFile( IndexedFile ):
|
|
88 def get_primer_header( self, sequence=None, position=None ):
|
|
89 key = "%s %s" % ( str( sequence ), str( position ) )
|
|
90 header = self.get_indexed_line( key )
|
|
91 if header:
|
|
92 if header.startswith( '>' ):
|
|
93 elems = header.split()
|
|
94 if len( elems ) < 3:
|
|
95 raise RuntimeError( 'short primers header for %s' % key )
|
|
96 if sequence != elems[1] or str( position ) != elems[2]:
|
|
97 raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) )
|
|
98 else:
|
|
99 raise RuntimeError( 'primers index out of sync for %s' % key )
|
|
100 return header
|
|
101
|
|
102 def get_entry( self, sequence=None, position=None ):
|
|
103 entry = self.get_primer_header( sequence, position )
|
|
104 if entry:
|
|
105 while self.data_fh:
|
|
106 try:
|
|
107 line = self.data_fh.next()
|
|
108 except StopIteration:
|
|
109 break
|
|
110 if line.startswith( '>' ):
|
|
111 break
|
|
112 entry += line
|
|
113 return entry
|
|
114
|
|
115 def get_enzymes( self, sequence=None, position=None ):
|
|
116 entry = self.get_primer_header( sequence, position )
|
|
117 enzyme_list = []
|
|
118 if entry:
|
|
119 try:
|
|
120 line = self.data_fh.next()
|
|
121 except StopIteration:
|
|
122 raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) )
|
|
123 if line.startswith( '>' ):
|
|
124 raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) )
|
|
125 line.rstrip( '\r\n' )
|
|
126 if line:
|
|
127 enzymes = line.split( ',' )
|
|
128 for enzyme in enzymes:
|
|
129 enzyme = enzyme.strip()
|
|
130 if enzyme:
|
|
131 enzyme_list.append( enzyme )
|
|
132 return enzyme_list
|
|
133
|
|
134 class SnpcallsFile( IndexedFile ):
|
|
135 def get_snp_seq( self, sequence=None, position=None ):
|
|
136 key = "%s %s" % ( str( sequence ), str( position ) )
|
|
137 line = self.get_indexed_line( key )
|
|
138 if line:
|
|
139 elems = line.split( '\t' )
|
|
140 if len (elems) < 3:
|
|
141 raise RuntimeError( 'short snpcalls line for %s' % key )
|
|
142 if sequence != elems[0] or str( position ) != elems[1]:
|
|
143 raise RuntimeError( 'snpcalls index for %s finds %s %s' % ( key, elems[0], elems[1] ) )
|
|
144 return elems[2]
|
|
145 else:
|
|
146 return None
|
|
147
|
|
148 def get_flanking_dna( self, sequence=None, position=None, format='fasta' ):
|
|
149 if format != 'fasta' and format != 'primer3':
|
|
150 raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) )
|
|
151 seq = self.get_snp_seq( sequence, position )
|
|
152 if seq:
|
|
153 p = seq.find('[')
|
|
154 if p == -1:
|
|
155 raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) )
|
|
156 q = seq.find(']', p + 1)
|
|
157 if q == -1:
|
|
158 raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) )
|
|
159 q += 1
|
|
160
|
|
161 if format == 'fasta':
|
|
162 flanking_seq = '> '
|
|
163 else:
|
|
164 flanking_seq = 'SEQUENCE_ID='
|
|
165
|
|
166 flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] )
|
|
167
|
|
168 if format == 'primer3':
|
|
169 flanking_seq += 'SEQUENCE_TEMPLATE='
|
|
170
|
|
171 flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] )
|
|
172
|
|
173 if format == 'primer3':
|
|
174 flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 )
|
|
175
|
|
176 return flanking_seq
|
|
177 else:
|
|
178 return None
|
|
179
|
|
180
|
|
181
|
|
182 class LocationFile( object ):
|
|
183 def __init__(self, filename):
|
|
184 self.build_map(filename)
|
|
185
|
|
186 def build_map(self, filename):
|
|
187 self.map = {}
|
|
188 self.open_file(filename)
|
|
189 for line in self.read_lines():
|
|
190 elems = line.split('\t', 1)
|
|
191 if len(elems) == 2:
|
|
192 self.map[ elems[0].strip() ] = elems[1].strip()
|
|
193 self.close_file()
|
|
194
|
|
195 def read_lines(self):
|
|
196 for line in self.fh:
|
|
197 if not line.startswith('#'):
|
|
198 line = line.rstrip('\r\n')
|
|
199 yield line
|
|
200
|
|
201 def open_file(self, filename):
|
|
202 self.filename = filename
|
|
203 try:
|
|
204 self.fh = open(filename, 'r')
|
|
205 except IOError, err:
|
|
206 print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err))
|
|
207 sys.exit(1)
|
|
208
|
|
209 def close_file(self):
|
|
210 self.fh.close()
|
|
211
|
|
212 def loc_file( self, key ):
|
|
213 if key in self.map:
|
|
214 return self.map[key]
|
|
215 else:
|
|
216 print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename)
|
|
217 sys.exit(1)
|
|
218
|
|
219 class ChrLens( object ):
|
|
220 def __init__( self, location_file, species ):
|
|
221 self.chrlen_loc = LocationFile( location_file )
|
|
222 self.chrlen_filename = self.chrlen_loc.loc_file( species )
|
|
223 self.build_map()
|
|
224
|
|
225 def build_map(self):
|
|
226 self.map = {}
|
|
227 self.open_file(self.chrlen_filename)
|
|
228 for line in self.read_lines():
|
|
229 elems = line.split('\t', 1)
|
|
230 if len(elems) == 2:
|
|
231 chrom = elems[0].strip()
|
|
232 chrom_len_text = elems[1].strip()
|
|
233 try:
|
|
234 chrom_len = int( chrom_len_text )
|
|
235 except ValueError:
|
|
236 print >> sys.stderr, "Bad length '%s' for chromosome '%s' in '%s'" % (chrom_len_text, chrom, self.chrlen_filename)
|
|
237 self.map[ chrom ] = chrom_len
|
|
238 self.close_file()
|
|
239
|
|
240 def read_lines(self):
|
|
241 for line in self.fh:
|
|
242 if not line.startswith('#'):
|
|
243 line = line.rstrip('\r\n')
|
|
244 yield line
|
|
245
|
|
246 def open_file(self, filename):
|
|
247 self.filename = filename
|
|
248 try:
|
|
249 self.fh = open(filename, 'r')
|
|
250 except IOError, err:
|
|
251 print >> sys.stderr, "Error opening chromosome length file '%s': %s" % (filename, str(err))
|
|
252 sys.exit(1)
|
|
253
|
|
254 def close_file(self):
|
|
255 self.fh.close()
|
|
256
|
|
257 def length( self, key ):
|
|
258 if key in self.map:
|
|
259 return self.map[key]
|
|
260 else:
|
|
261 return None
|
|
262
|
|
263 def __iter__( self ):
|
|
264 for chrom in self.map:
|
|
265 yield chrom
|
|
266
|