Mercurial > repos > xuebing > sharplabtool
comparison tools/genome_diversity/genome_diversity.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python2.5 | |
2 | |
3 import sys | |
4 import cdblib | |
5 | |
6 def _openfile( filename=None, mode='r' ): | |
7 try: | |
8 fh = open( filename, mode ) | |
9 except IOError, err: | |
10 raise RuntimeError( "can't open file: %s\n" % str( err ) ) | |
11 return fh | |
12 | |
13 def get_filename_from_loc( species=None, filename=None ): | |
14 fh = _openfile( filename ) | |
15 for line in fh: | |
16 if line and not line.startswith( '#' ): | |
17 line = line.rstrip( '\r\n' ) | |
18 if line: | |
19 elems = line.split( '\t' ) | |
20 if len( elems ) >= 2 and elems[0] == species: | |
21 return elems[1] | |
22 return None | |
23 | |
24 | |
25 class SnpFile( object ): | |
26 def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ): | |
27 self.filename = filename | |
28 self.fh = _openfile( filename ) | |
29 self.seq_col = seq_col | |
30 self.pos_col = pos_col | |
31 self.ref_seq_col = ref_seq_col | |
32 self.ref_pos_col = ref_pos_col | |
33 self.elems = None | |
34 self.line = None | |
35 self.comments = [] | |
36 | |
37 def next( self ): | |
38 while self.fh: | |
39 try: | |
40 self.line = self.fh.next() | |
41 except StopIteration: | |
42 self.line = None | |
43 self.elems = None | |
44 return None | |
45 if self.line: | |
46 self.line = self.line.rstrip( '\r\n' ) | |
47 if self.line: | |
48 if self.line.startswith( '#' ): | |
49 self.comments.append( self.line ) | |
50 else: | |
51 self.elems = self.line.split( '\t' ) | |
52 return 1 | |
53 | |
54 def get_seq_pos( self ): | |
55 if self.elems: | |
56 return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ] | |
57 else: | |
58 return None, None | |
59 | |
60 def get_ref_seq_pos( self ): | |
61 if self.elems: | |
62 return self.elems[ self.ref_seq_seq - 1 ], self.elems[ self.ref_pos_col - 1 ] | |
63 else: | |
64 return None, None | |
65 | |
66 | |
67 class IndexedFile( object ): | |
68 | |
69 def __init__( self, data_file=None, index_file=None ): | |
70 self.data_file = data_file | |
71 self.index_file = index_file | |
72 self.data_fh = _openfile( data_file ) | |
73 self.index_fh = _openfile( index_file ) | |
74 self._reader = cdblib.Reader( self.index_fh.read(), hash ) | |
75 | |
76 def get_indexed_line( self, key=None ): | |
77 line = None | |
78 if key in self._reader: | |
79 offset = self._reader.getint( key ) | |
80 self.data_fh.seek( offset ) | |
81 try: | |
82 line = self.data_fh.next() | |
83 except StopIteration: | |
84 raise RuntimeError( 'index file out of sync for %s' % key ) | |
85 return line | |
86 | |
87 class PrimersFile( IndexedFile ): | |
88 def get_primer_header( self, sequence=None, position=None ): | |
89 key = "%s %s" % ( str( sequence ), str( position ) ) | |
90 header = self.get_indexed_line( key ) | |
91 if header: | |
92 if header.startswith( '>' ): | |
93 elems = header.split() | |
94 if len( elems ) < 3: | |
95 raise RuntimeError( 'short primers header for %s' % key ) | |
96 if sequence != elems[1] or str( position ) != elems[2]: | |
97 raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) ) | |
98 else: | |
99 raise RuntimeError( 'primers index out of sync for %s' % key ) | |
100 return header | |
101 | |
102 def get_entry( self, sequence=None, position=None ): | |
103 entry = self.get_primer_header( sequence, position ) | |
104 if entry: | |
105 while self.data_fh: | |
106 try: | |
107 line = self.data_fh.next() | |
108 except StopIteration: | |
109 break | |
110 if line.startswith( '>' ): | |
111 break | |
112 entry += line | |
113 return entry | |
114 | |
115 def get_enzymes( self, sequence=None, position=None ): | |
116 entry = self.get_primer_header( sequence, position ) | |
117 enzyme_list = [] | |
118 if entry: | |
119 try: | |
120 line = self.data_fh.next() | |
121 except StopIteration: | |
122 raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) | |
123 if line.startswith( '>' ): | |
124 raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) | |
125 line.rstrip( '\r\n' ) | |
126 if line: | |
127 enzymes = line.split( ',' ) | |
128 for enzyme in enzymes: | |
129 enzyme = enzyme.strip() | |
130 if enzyme: | |
131 enzyme_list.append( enzyme ) | |
132 return enzyme_list | |
133 | |
134 class SnpcallsFile( IndexedFile ): | |
135 def get_snp_seq( self, sequence=None, position=None ): | |
136 key = "%s %s" % ( str( sequence ), str( position ) ) | |
137 line = self.get_indexed_line( key ) | |
138 if line: | |
139 elems = line.split( '\t' ) | |
140 if len (elems) < 3: | |
141 raise RuntimeError( 'short snpcalls line for %s' % key ) | |
142 if sequence != elems[0] or str( position ) != elems[1]: | |
143 raise RuntimeError( 'snpcalls index for %s finds %s %s' % ( key, elems[0], elems[1] ) ) | |
144 return elems[2] | |
145 else: | |
146 return None | |
147 | |
148 def get_flanking_dna( self, sequence=None, position=None, format='fasta' ): | |
149 if format != 'fasta' and format != 'primer3': | |
150 raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) ) | |
151 seq = self.get_snp_seq( sequence, position ) | |
152 if seq: | |
153 p = seq.find('[') | |
154 if p == -1: | |
155 raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) ) | |
156 q = seq.find(']', p + 1) | |
157 if q == -1: | |
158 raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) ) | |
159 q += 1 | |
160 | |
161 if format == 'fasta': | |
162 flanking_seq = '> ' | |
163 else: | |
164 flanking_seq = 'SEQUENCE_ID=' | |
165 | |
166 flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] ) | |
167 | |
168 if format == 'primer3': | |
169 flanking_seq += 'SEQUENCE_TEMPLATE=' | |
170 | |
171 flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] ) | |
172 | |
173 if format == 'primer3': | |
174 flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 ) | |
175 | |
176 return flanking_seq | |
177 else: | |
178 return None | |
179 | |
180 | |
181 | |
182 class LocationFile( object ): | |
183 def __init__(self, filename): | |
184 self.build_map(filename) | |
185 | |
186 def build_map(self, filename): | |
187 self.map = {} | |
188 self.open_file(filename) | |
189 for line in self.read_lines(): | |
190 elems = line.split('\t', 1) | |
191 if len(elems) == 2: | |
192 self.map[ elems[0].strip() ] = elems[1].strip() | |
193 self.close_file() | |
194 | |
195 def read_lines(self): | |
196 for line in self.fh: | |
197 if not line.startswith('#'): | |
198 line = line.rstrip('\r\n') | |
199 yield line | |
200 | |
201 def open_file(self, filename): | |
202 self.filename = filename | |
203 try: | |
204 self.fh = open(filename, 'r') | |
205 except IOError, err: | |
206 print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err)) | |
207 sys.exit(1) | |
208 | |
209 def close_file(self): | |
210 self.fh.close() | |
211 | |
212 def loc_file( self, key ): | |
213 if key in self.map: | |
214 return self.map[key] | |
215 else: | |
216 print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename) | |
217 sys.exit(1) | |
218 | |
219 class ChrLens( object ): | |
220 def __init__( self, location_file, species ): | |
221 self.chrlen_loc = LocationFile( location_file ) | |
222 self.chrlen_filename = self.chrlen_loc.loc_file( species ) | |
223 self.build_map() | |
224 | |
225 def build_map(self): | |
226 self.map = {} | |
227 self.open_file(self.chrlen_filename) | |
228 for line in self.read_lines(): | |
229 elems = line.split('\t', 1) | |
230 if len(elems) == 2: | |
231 chrom = elems[0].strip() | |
232 chrom_len_text = elems[1].strip() | |
233 try: | |
234 chrom_len = int( chrom_len_text ) | |
235 except ValueError: | |
236 print >> sys.stderr, "Bad length '%s' for chromosome '%s' in '%s'" % (chrom_len_text, chrom, self.chrlen_filename) | |
237 self.map[ chrom ] = chrom_len | |
238 self.close_file() | |
239 | |
240 def read_lines(self): | |
241 for line in self.fh: | |
242 if not line.startswith('#'): | |
243 line = line.rstrip('\r\n') | |
244 yield line | |
245 | |
246 def open_file(self, filename): | |
247 self.filename = filename | |
248 try: | |
249 self.fh = open(filename, 'r') | |
250 except IOError, err: | |
251 print >> sys.stderr, "Error opening chromosome length file '%s': %s" % (filename, str(err)) | |
252 sys.exit(1) | |
253 | |
254 def close_file(self): | |
255 self.fh.close() | |
256 | |
257 def length( self, key ): | |
258 if key in self.map: | |
259 return self.map[key] | |
260 else: | |
261 return None | |
262 | |
263 def __iter__( self ): | |
264 for chrom in self.map: | |
265 yield chrom | |
266 |