comparison ngsutils/support/__init__.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
comparison
equal deleted inserted replaced
-1:000000000000 0:4e4e4093d65d
1 import collections
2 import gzip
3 import os
4 import sys
5 import re
6 try:
7 from eta import ETA
8 except:
9 pass
10
class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
    '''
    A single FASTA record: sequence name, optional comment and sequence text.

    The record is an immutable named tuple; ``repr()`` of a record is its
    FASTA-formatted text (header line plus sequence line).
    '''

    def __repr__(self):
        # The header only carries a comment when there is one to show.
        if self.comment:
            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
        return '>%s\n%s\n' % (self.name, self.seq)

    def subseq(self, start, end, comment=None):
        '''
        Return a new record holding ``seq[start:end]``.

        If both the record and the caller supply a comment they are joined
        with a single space; if only one is present it is used unchanged.
        '''
        if self.comment and comment:
            comment = '%s %s' % (self.comment, comment)
        elif self.comment:
            # No extra comment given: keep the existing one instead of
            # formatting the missing value into the header as "None".
            comment = self.comment

        return FASTARead(self.name, comment, self.seq[start:end])

    def clone(self, name=None, comment=None, seq=None):
        '''
        Return a copy of this record with the given fields replaced.

        A field is only replaced when the supplied value is truthy, so
        passing ``None`` or an empty string keeps the current value.
        '''
        return FASTARead(name or self.name,
                         comment or self.comment,
                         seq or self.seq)

    def write(self, out):
        '''Write the FASTA-formatted record to the file-like object *out*.'''
        out.write(repr(self))
32
33
class FASTA(object):
    '''
    Reader for FASTA-formatted files.

    The input is either an already-open file-like object (*fileobj*) or a
    file name (*fname*).  A file name of ``'-'`` reads from stdin and names
    ending in ``.gz`` / ``.bgz`` are opened with ``gzip``.  When *qual* is
    true, the lines of a record are joined with a leading space before each
    line instead of being concatenated directly.
    '''

    def __init__(self, fname=None, fileobj=None, qual=False):
        self.fname = fname
        self.qual = qual

        if fileobj:
            self.fileobj = fileobj
        elif not fname:
            # Must be checked before the file name is inspected; slicing a
            # missing name would otherwise fail with an unrelated TypeError.
            raise ValueError("Missing valid filename or fileobj")
        elif fname == '-':
            self.fileobj = sys.stdin
        elif fname.endswith(('.gz', '.bgz')):
            self.fileobj = gzip.open(os.path.expanduser(fname))
        else:
            self.fileobj = open(os.path.expanduser(fname))

    def close(self):
        '''Close the underlying file, leaving the standard streams open.'''
        if self.fileobj is not sys.stdin and self.fileobj is not sys.stdout:
            self.fileobj.close()

    def tell(self):
        '''Return the current position of the underlying file object.'''
        # always relative to uncompressed...
        return self.fileobj.tell()

    def seek(self, pos, whence=0):
        '''Move the underlying file object to *pos* (see ``file.seek``).'''
        self.fileobj.seek(pos, whence)

    def _make_eta(self, quiet):
        '''Return a progress reporter for the input, or None if not wanted/available.'''
        if quiet or not self.fname or self.fname == '-':
            return None

        # ETA is imported optionally at module level, so the name may not
        # exist at all; treat that the same as "no progress output".
        try:
            eta_cls = ETA
        except NameError:
            eta_cls = None
        if eta_cls is None:
            return None

        # Expand '~' the same way the file was opened.
        size = os.stat(os.path.expanduser(self.fname)).st_size
        return eta_cls(size, fileobj=self.fileobj)

    def fetch(self, quiet=False):
        '''
        Iterate over the records of the file, yielding FASTARead tuples.

        Blank lines and lines starting with ``#`` are skipped.  The header
        line is split at the first space or tab into name and comment.
        Records that have no sequence lines are not yielded.  Unless *quiet*
        is set (or no progress reporter is available) progress is reported
        after each record.
        '''
        eta = self._make_eta(quiet)

        name = ''
        comment = ''
        parts = []  # sequence lines of the current record

        for line in self.fileobj:
            line = line.strip()
            if not line or line[0] == '#':
                continue

            if line[0] == '>':
                # A new header finishes the previous record (if any).
                if name and parts:
                    if eta:
                        eta.print_status(extra=name)
                    yield FASTARead(name, comment, ''.join(parts))

                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
                name = spl[0]
                comment = spl[1] if len(spl) > 1 else ''
                parts = []
            elif self.qual:
                parts.append(' ' + line)
            else:
                parts.append(line)

        # The last record has no following header to flush it.
        if name and parts:
            if eta:
                eta.print_status(extra=name)
            yield FASTARead(name, comment, ''.join(parts))

        if eta:
            eta.done()
106
107
def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
    '''
    Yield the lines of a (possibly gzip-compressed) file.

    The input is *fileobj* if given, stdin if *fname* is ``'-'``, a gzip
    stream if *fname* ends in ``.gz`` / ``.bgz``, and a plain file otherwise.

    Unless *quiet* is set, progress is reported for each line when a file
    name is known and a progress reporter is available; *callback*, if
    given, supplies the extra status text.  After each yielded line
    *done_callback* (if given) is called and a true result stops the
    iteration.

    When iteration ends normally the input is closed unless it is stdin.  If
    the generator is abandoned or fails part-way, only a file opened here is
    closed; a caller-supplied *fileobj* is left open for the caller.
    '''
    owns_file = False
    if fileobj:
        f = fileobj
    elif fname == '-':
        f = sys.stdin
    else:
        path = os.path.expanduser(fname)
        if fname.endswith(('.gz', '.bgz')):
            f = gzip.open(path)
        else:
            f = open(path)
        owns_file = True

    eta = None
    finished = False
    try:
        # Progress needs a real file name to stat; a bare fileobj has none.
        if not quiet and fname and fname != '-':
            # ETA is imported optionally at module level and may be missing.
            try:
                eta_cls = ETA
            except NameError:
                eta_cls = None
            if eta_cls is not None:
                eta = eta_cls(os.stat(os.path.expanduser(fname)).st_size, fileobj=f)

        for line in f:
            if eta:
                extra = callback() if callback else ''
                eta.print_status(extra=extra)
            yield line

            if done_callback and done_callback():
                break
        finished = True
    finally:
        if f is not sys.stdin and (finished or owns_file):
            f.close()

    if eta:
        eta.done()
141
142
class Symbolize(object):
    '''
    String cache: looking a string up returns the first equal string that
    was ever looked up, so equal strings can share a single object.
    '''

    def __init__(self):
        self.__cache = {}

    def __getitem__(self, k):
        # Stores k under itself on first sight; afterwards the stored
        # object is handed back for every equal key.
        return self.__cache.setdefault(k, k)


# Module-wide shared instance.
symbols = Symbolize()
155
156 _compliments = {
157 'a': 't',
158 'A': 'T',
159 'c': 'g',
160 'C': 'G',
161 'g': 'c',
162 'G': 'C',
163 't': 'a',
164 'T': 'A',
165 'n': 'n',
166 'N': 'N'
167 }
168
169
170 def revcomp(seq):
171 '''
172 >>> revcomp('ATCGatcg')
173 'cgatCGAT'
174 '''
175 ret = []
176
177 for s in seq:
178 ret.append(_compliments[s])
179
180 ret.reverse()
181 return ''.join(ret)
182
183
class Counts(object):
    '''
    Setup simple binning. Bins are continuous 0->max. Values are added to
    bins and then means / distributions can be calculated.

    ``bins[i]`` holds how many times the value ``i`` has been added.
    '''

    def __init__(self):
        self.bins = []

    def add(self, val):
        '''
        Count one occurrence of the non-negative integer *val*.

        Raises ValueError for negative values, which would otherwise be
        counted in the wrong bin through negative list indexing.
        '''
        if val < 0:
            raise ValueError("Counts.add() requires a non-negative value, got %r" % (val,))

        # Grow the bin list in one step so that index `val` exists.
        missing = val + 1 - len(self.bins)
        if missing > 0:
            self.bins.extend([0] * missing)
        self.bins[val] += 1

    def mean(self):
        '''Return the mean of all added values, or None if nothing was added.'''
        count = sum(self.bins)
        if count > 0:
            acc = sum(i * n for i, n in enumerate(self.bins))
            return float(acc) / count
        return None

    def max(self):
        '''Return the index of the highest bin (-1 when there are no bins).'''
        return len(self.bins) - 1
210
211
def memoize(func):
    '''
    Decorator that caches the results of *func* by its call arguments.

    Caching is disabled (the function is returned unchanged) when either the
    TESTING or DEBUG environment variable is set at decoration time.
    Calls whose arguments cannot be hashed are passed straight through to
    *func* without caching.
    '''
    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
        return func

    __cache = {}

    def inner(*args, **kwargs):
        # Sorting makes the key independent of keyword order; items()
        # (rather than iteritems()) works on both Python 2 and 3.
        k = (args, tuple(sorted(kwargs.items())))
        try:
            return __cache[k]
        except KeyError:
            pass
        except TypeError:
            # Unhashable argument: the call cannot be cached.
            return func(*args, **kwargs)

        result = __cache[k] = func(*args, **kwargs)
        return result

    inner.__name__ = func.__name__
    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
    return inner
225
226
def quoted_split(s, delim, quote_char='"'):
    '''
    Split *s* on *delim*, ignoring delimiters inside quoted sections.

    A quoted section starts and ends with *quote_char*; the quote characters
    themselves are kept in the resulting tokens.  Empty tokens between
    delimiters are kept, but an empty token at the very end is dropped.
    '''
    pieces = []
    current = []
    quoted = False

    for ch in s:
        if not quoted and ch == delim:
            # Unquoted delimiter: close the token collected so far.
            pieces.append(''.join(current))
            current = []
            continue

        current.append(ch)
        if ch == quote_char:
            quoted = not quoted

    tail = ''.join(current)
    if tail:
        pieces.append(tail)

    return pieces