Mercurial > repos > iuc > ngsutils_bam_filter
comparison ngsutils/support/__init__.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 import collections | |
2 import gzip | |
3 import os | |
4 import sys | |
5 import re | |
# The ETA progress reporter is an optional dependency.  Only a failed import
# is tolerated (other errors inside the eta module should surface), and ETA
# is bound to None in that case so the name always exists.
try:
    from eta import ETA
except ImportError:
    ETA = None
10 | |
class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
    """A single FASTA record with ``name``, ``comment`` and ``seq`` fields.

    ``repr()`` of a record is its FASTA text: a ``>`` header line followed by
    the sequence on one line.  This is also what :meth:`write` emits.
    """

    def __repr__(self):
        """Return the record as newline-terminated FASTA text."""
        if self.comment:
            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
        return '>%s\n%s\n' % (self.name, self.seq)

    def subseq(self, start, end, comment=None):
        """Return a new record holding ``seq[start:end]``.

        ``comment``, when given, is appended to the existing comment
        (separated by a space).  When it is omitted, the existing comment is
        carried over unchanged.
        """
        if self.comment and comment:
            comment = '%s %s' % (self.comment, comment)
        elif self.comment:
            # Nothing to append: keep the original comment instead of
            # formatting the missing value into the text.
            comment = self.comment

        return FASTARead(self.name, comment, self.seq[start:end])

    def clone(self, name=None, comment=None, seq=None):
        """Return a copy of the record with the given fields replaced.

        A field is only replaced when the new value is truthy; passing
        ``None`` or an empty string keeps the current value.
        """
        return FASTARead(
            name if name else self.name,
            comment if comment else self.comment,
            seq if seq else self.seq,
        )

    def write(self, out):
        """Write the FASTA text of this record to the file-like ``out``."""
        out.write(repr(self))
32 | |
33 | |
class FASTA(object):
    """Reader that yields records from FASTA-formatted input.

    The input is either an already-open file object (``fileobj``) or a file
    name: ``'-'`` reads from standard input, names ending in ``.gz`` or
    ``.bgz`` are opened with :mod:`gzip`, anything else with ``open()``.
    With ``qual=True`` the lines of a record are joined with spaces instead
    of being concatenated.
    """

    def __init__(self, fname=None, fileobj=None, qual=False):
        """Open the input.

        Raises:
            ValueError: if neither a usable ``fileobj`` nor ``fname`` is given.
        """
        self.fname = fname
        self.qual = qual

        if fileobj:
            self.fileobj = fileobj
        elif not fname:
            # Checked before any slicing of fname so a missing name is
            # reported as the intended ValueError.
            raise ValueError("Missing valid filename or fileobj")
        elif fname == '-':
            self.fileobj = sys.stdin
        elif fname.endswith(('.gz', '.bgz')):
            self.fileobj = gzip.open(os.path.expanduser(fname))
        else:
            self.fileobj = open(os.path.expanduser(fname))

    def close(self):
        """Close the underlying file, leaving the standard streams open."""
        # Input given as '-' is sys.stdin, which must survive this reader.
        # The sys.stdout guard is kept from the previous implementation.
        if self.fileobj is sys.stdin or self.fileobj is sys.stdout:
            return
        self.fileobj.close()

    def tell(self):
        """Return the current position reported by the underlying file."""
        # always relative to uncompressed...
        return self.fileobj.tell()

    def seek(self, pos, whence=0):
        """Reposition the underlying file (same arguments as ``file.seek``)."""
        self.fileobj.seek(pos, whence)

    def fetch(self, quiet=False):
        """Generate a ``FASTARead`` for every record in the input.

        Blank lines and lines starting with ``#`` are skipped.  The header
        line is split on the first space or tab into name and comment.
        Records with an empty name or no sequence lines are not yielded.
        In ``qual`` mode every line is prefixed with a space when appended,
        so the joined value starts with a space.

        Progress is reported through ``ETA`` unless ``quiet`` is set, there
        is no real file name to size, or ``ETA`` could not be imported.
        """
        eta = None
        if not quiet and self.fname and self.fname != '-':
            # ETA is an optional import; look it up defensively so a missing
            # module disables progress output instead of raising NameError.
            eta_cls = globals().get('ETA')
            if eta_cls is not None:
                eta = eta_cls(os.stat(self.fname).st_size, fileobj=self.fileobj)

        name = ''
        comment = ''
        seq = ''

        for line in self.fileobj:
            line = line.strip()
            if not line or line[0] == '#':
                continue

            if line[0] == '>':
                # A new header ends the record collected so far.
                if name and seq:
                    if eta:
                        eta.print_status(extra=name)
                    yield FASTARead(name, comment, seq)

                header = re.split(r'[ \t]', line[1:], maxsplit=1)
                name = header[0]
                comment = header[1] if len(header) > 1 else ''
                seq = ''
            elif self.qual:
                seq = seq + ' ' + line
            else:
                seq += line

        # The last record has no following header to trigger the yield.
        if name and seq:
            if eta:
                eta.print_status(extra=name)
            yield FASTARead(name, comment, seq)

        if eta:
            eta.done()
106 | |
107 | |
def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
    """Generate the lines of a plain or gzip-compressed text file.

    Args:
        fname: file name; ``'-'`` means standard input, names ending in
            ``.gz`` / ``.bgz`` are opened with :mod:`gzip`.  May be ``None``
            when ``fileobj`` is supplied.
        quiet: suppress progress reporting.
        callback: optional callable whose return value is passed as the
            ``extra`` text of each progress update.
        done_callback: optional callable checked after each yielded line;
            a truthy result stops the iteration.
        fileobj: already-open file object to read instead of ``fname``.

    The file is closed when iteration finishes or the generator is closed,
    unless it is ``sys.stdin``.

    Raises:
        ValueError: if neither ``fileobj`` nor ``fname`` is usable.
    """
    if fileobj:
        f = fileobj
    elif not fname:
        raise ValueError("Missing valid filename or fileobj")
    elif fname == '-':
        f = sys.stdin
    elif fname.endswith(('.gz', '.bgz')):
        f = gzip.open(os.path.expanduser(fname))
    else:
        f = open(os.path.expanduser(fname))

    eta = None
    if not quiet and fname and fname != '-':
        # Progress needs a real file to size, and ETA is an optional import;
        # without either, reading simply proceeds without progress output.
        eta_cls = globals().get('ETA')
        if eta_cls is not None:
            eta = eta_cls(os.stat(fname).st_size, fileobj=f)

    try:
        for line in f:
            if eta:
                extra = callback() if callback else ''
                eta.print_status(extra=extra)
            yield line

            if done_callback and done_callback():
                break
    finally:
        # Runs on normal completion and when the consumer stops early, so the
        # file handle is not leaked.
        if f is not sys.stdin:
            f.close()

        if eta:
            eta.done()
141 | |
142 | |
class Symbolize(object):
    """Converts strings to symbols - basically a cache of strings.

    Looking up a key returns the first object stored for an equal key, so
    equal values obtained through the cache share a single object.
    """

    def __init__(self):
        self.__cache = {}

    def __getitem__(self, k):
        """Return the cached object equal to ``k``, storing ``k`` if new."""
        # setdefault does the membership test and the insert in one lookup.
        return self.__cache.setdefault(k, k)


# Module-level shared cache instance.
symbols = Symbolize()
155 | |
# Base -> complementary base, case-preserving.  Besides A/C/G/T/N this covers
# the IUPAC ambiguity codes so sequences containing them can be complemented.
# (The historical spelling of the name is kept for compatibility.)
_compliments = {
    'a': 't', 'A': 'T',
    'c': 'g', 'C': 'G',
    'g': 'c', 'G': 'C',
    't': 'a', 'T': 'A',
    'n': 'n', 'N': 'N',
    # IUPAC ambiguity codes
    'r': 'y', 'R': 'Y',
    'y': 'r', 'Y': 'R',
    's': 's', 'S': 'S',
    'w': 'w', 'W': 'W',
    'k': 'm', 'K': 'M',
    'm': 'k', 'M': 'K',
    'b': 'v', 'B': 'V',
    'v': 'b', 'V': 'B',
    'd': 'h', 'D': 'H',
    'h': 'd', 'H': 'D',
}


def revcomp(seq):
    '''
    Return the reverse complement of a nucleotide sequence, preserving case.

    Raises KeyError for characters that are not in the complement table.

    >>> revcomp('ATCGatcg')
    'cgatCGAT'
    >>> revcomp('RYK')
    'MRY'
    >>> revcomp('')
    ''
    '''
    # Complement first (works for any iterable of bases), then reverse.
    return ''.join(reversed([_compliments[base] for base in seq]))
182 | |
183 | |
class Counts(object):
    '''
    Setup simple binning.  Bins are continuous 0->max.  Values are added to
    bins and then means / distributions can be calculated.

    ``bins[i]`` is the number of times the value ``i`` has been added.
    '''
    def __init__(self):
        self.bins = []

    def add(self, val):
        '''
        Count one occurrence of the non-negative integer ``val``.

        Raises ValueError for negative values: they would otherwise index
        the bin list from its end and be counted in the wrong bin.
        '''
        if val < 0:
            raise ValueError("Counts only accepts values >= 0, got %r" % (val,))

        # Grow the bin list in one step so that index ``val`` exists.
        missing = val + 1 - len(self.bins)
        if missing > 0:
            self.bins.extend([0] * missing)
        self.bins[val] += 1

    def mean(self):
        '''
        Return the mean of all added values as a float, or None when no
        value has been added yet.
        '''
        count = sum(self.bins)
        if count > 0:
            total = sum(i * n for i, n in enumerate(self.bins))
            return float(total) / count
        return None

    def max(self):
        '''
        Return the index of the last bin, i.e. the largest value added so
        far; -1 when nothing has been added.
        '''
        return len(self.bins) - 1
210 | |
211 | |
def memoize(func):
    """Decorator caching the results of ``func`` per argument combination.

    Caching is disabled (``func`` is returned unchanged) when either the
    ``TESTING`` or ``DEBUG`` environment variable is set at decoration time.
    Calls whose arguments are not hashable bypass the cache and invoke
    ``func`` directly.  The cache is unbounded and lives as long as the
    wrapper does.
    """
    import functools

    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
        return func

    cache = {}

    @functools.wraps(func)
    def inner(*args, **kwargs):
        # items() works on Python 2 and 3; sorting makes the key independent
        # of keyword order.
        key = (args, tuple(sorted(kwargs.items())))
        try:
            return cache[key]
        except KeyError:
            pass
        except TypeError:
            # Unhashable argument: cannot be cached, so just compute.
            return func(*args, **kwargs)

        result = cache[key] = func(*args, **kwargs)
        return result

    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
    return inner
225 | |
226 | |
def quoted_split(s, delim, quote_char='"'):
    """Split ``s`` on ``delim``, ignoring delimiters inside quoted spans.

    Quote characters are kept in the returned tokens.  Empty tokens between
    two delimiters are preserved, but an empty token at the very end (for
    example after a trailing delimiter, or for an empty input) is dropped.
    An unterminated quote extends to the end of the string.

    >>> quoted_split('a,"b,c",d', ',')
    ['a', '"b,c"', 'd']
    """
    tokens = []
    current = []  # characters of the token being built
    in_quote = False

    for ch in s:
        if in_quote:
            current.append(ch)
            if ch == quote_char:
                in_quote = False
        elif ch == delim:
            # The delimiter test comes before the quote test, so a delimiter
            # equal to quote_char still splits.
            tokens.append(''.join(current))
            current = []
        else:
            current.append(ch)
            if ch == quote_char:
                in_quote = True

    tail = ''.join(current)
    if tail:
        tokens.append(tail)

    return tokens