Mercurial > repos > iuc > ngsutils_bam_filter
comparison ngsutils/support/ngs_utils.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 | |
4 Common util classes / functions for the NGS project | |
5 | |
6 """ | |
7 import sys | |
8 import os | |
9 import gzip | |
10 import re | |
11 import collections | |
12 | |
13 | |
def format_number(n):
    '''
    Return str(n) with commas inserted between groups of three digits.

    Only the integer digits are grouped: a leading sign and any fractional
    part are carried through untouched. Values whose integer part is not
    purely digits (for example 'nan' or '1e+20') are returned as str(n)
    without any grouping.

    >>> format_number(1000)
    '1,000'
    >>> format_number(1234567)
    '1,234,567'
    >>> format_number(-1234)
    '-1,234'
    >>> format_number(1234.5)
    '1,234.5'
    '''
    text = str(n)

    # Peel off the sign so it is never counted as one of the digits.
    sign = ''
    if text[:1] in ('-', '+'):
        sign, text = text[0], text[1:]

    whole, dot, frac = text.partition('.')
    if not whole.isdigit():
        # Not a plain run of digits; grouping would produce garbage.
        return sign + text

    # Chop three digits at a time off the right-hand end.
    groups = []
    while len(whole) > 3:
        groups.append(whole[-3:])
        whole = whole[:-3]
    groups.append(whole)
    groups.reverse()

    return sign + ','.join(groups) + dot + frac
25 | |
26 | |
def natural_sort(ar):
    '''
    Return the strings in `ar` as a new list sorted in "natural" order,
    i.e. runs of digits are compared by numeric value, not character by
    character. Items whose split forms compare equal (e.g. '01' and '1')
    are ordered by plain string comparison.

    >>> natural_sort('1 3 4 2 5'.split())
    ['1', '2', '3', '4', '5']
    >>> natural_sort('1 10 20 2 3 4'.split())
    ['1', '2', '3', '4', '10', '20']
    '''
    def sort_key(item):
        parts = []
        # The capturing group makes re.split keep the digit runs, so the
        # result alternates text, digits, text, ... for every item.
        for chunk in re.split(r'(\d+)', item):
            try:
                parts.append(int(chunk))
            except ValueError:
                # A text chunk (possibly empty): keep it as a string.
                parts.append(chunk)
        # The original string is the tie-breaker when the parts are equal.
        return (parts, item)

    return sorted(ar, key=sort_key)
48 | |
49 | |
def dictify(values, colnames):
    """
    Convert a list of values into a dictionary based upon given column names.

    If the column name starts with an '@', the value is assumed to be a comma
    separated list (a single trailing run of commas is ignored).

    If the name starts with a '#', the value is assumed to be an int.

    If the name starts with '@#', the value is assumed to be a comma separated
    list of ints.

    The prefix characters are removed from the name before it is used as a
    key. Columns that have no corresponding entry in `values` map to None.

    Raises ValueError if a value marked as numeric cannot be parsed by int().
    """
    d = {}
    for i, key in enumerate(colnames):
        # startswith() is used instead of key[0] so that an empty name, or a
        # name consisting only of a prefix character, does not raise.
        split = key.startswith('@')
        if split:
            key = key[1:]
        num = key.startswith('#')
        if num:
            key = key[1:]

        if i >= len(values):
            # More column names than values: the column is present but empty.
            d[key] = None
            continue

        raw = values[i]
        if split:
            items = raw.rstrip(',').split(',')
            d[key] = [int(x) for x in items] if num else items
        elif num:
            d[key] = int(raw)
        else:
            d[key] = raw

    return d
92 | |
93 | |
def gzip_aware_open(fname):
    """
    Open `fname` for reading, decompressing it on the fly if it is gzipped.

    '-' returns sys.stdin. Names ending in '.gz' or '.bgz' are opened with
    the gzip module; anything else is opened as a regular file. A leading
    '~' in the name is expanded to the user's home directory.

    The caller is responsible for closing the returned file object (and
    should not close it when it is sys.stdin).
    """
    if fname == '-':
        return sys.stdin

    path = os.path.expanduser(fname)
    if fname.endswith(('.gz', '.bgz')):
        if sys.version_info[0] >= 3:
            # On Python 3 gzip.open() defaults to binary mode and would hand
            # back bytes while the plain open() below yields str; ask for
            # text mode so both branches return the same kind of data.
            return gzip.open(path, 'rt')
        return gzip.open(path)
    return open(path)
102 | |
103 | |
class gzip_opener:
    '''
    Context manager for reading a text file that may or may not be gzip
    compressed (written for Python 2.6 'with' statements).

    Entering the block opens the file via gzip_aware_open() and yields the
    file object; leaving the block closes it, unless it is sys.stdin.
    Exceptions raised inside the block are never suppressed.
    '''
    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        handle = gzip_aware_open(self.fname)
        self.f = handle
        return handle

    def __exit__(self, type, value, traceback):
        # stdin is left open; anything this object opened gets closed.
        if self.f == sys.stdin:
            return False
        self.f.close()
        return False
120 | |
121 | |
def filenames_to_uniq(names, new_delim='.'):
    '''
    Given a set of file names, produce a list of names consisting of the
    uniq parts of the names. This works from the end of the name. Chunks of
    the name are split on '.', '-' and whitespace, and the chunks that
    remain are re-joined with `new_delim`.

    A chunk position (counted from the end of the name) is dropped only
    when every name has a chunk at that position and all of those chunks
    are identical.

    For example:
        A.foo.bar.txt
        B.foo.bar.txt
        returns: ['A','B']

        AA.BB.foo.txt
        CC.foo.txt
        returns: ['AA.BB','CC']

    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
    ['a', 'b']
    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
    ['a.b', 'c']

    '''
    # Chunks of each name, reversed so that index 0 is the last chunk.
    name_words = []
    for name in names:
        words = name.replace('.', ' ').replace('-', ' ').split()
        words.reverse()
        name_words.append(words)

    maxlen = 0
    if name_words:
        maxlen = max(len(words) for words in name_words)

    # common[i] is True when position i carries the same chunk in all names.
    common = []
    for i in range(maxlen):
        if all(i < len(words) for words in name_words):
            same = len(set(words[i] for words in name_words)) == 1
        else:
            same = False
        common.append(same)

    newnames = []
    for words in name_words:
        kept = [w for (w, is_common) in zip(words, common) if not is_common]
        kept.reverse()
        newnames.append(new_delim.join(kept))
    return newnames
175 | |
176 | |
def parse_args(argv, defaults=None, expected_argc=0):
    """
    Minimal command-line parser.

    argv          -- list of argument strings (without the program name)
    defaults      -- optional dict of option name -> default value; the
                     type of each default decides how a supplied value is
                     converted (float, int, otherwise left as a string)
    expected_argc -- the positional argument list is padded with None up
                     to this length

    Arguments starting with '-' are options; any number of leading dashes
    is stripped. 'name=value' options are stored only when `name` is a key
    of `defaults`; other 'name=value' options are ignored. Options without
    '=' are stored as True. Everything else is a positional argument.

    Returns a tuple (opts, args). Raises ValueError when a value cannot be
    converted to the type of its default.
    """
    if defaults is None:
        # Lets the membership test below work when no defaults are given.
        defaults = {}

    opts = {}
    opts.update(defaults)
    args = []

    for token in argv:
        # startswith() also copes with an empty-string argument, which is
        # treated as a positional argument.
        if not token.startswith('-'):
            args.append(token)
            continue

        arg = token.lstrip('-')
        if '=' not in arg:
            opts[arg] = True
            continue

        # Split on the first '=' only, so values may contain '=' themselves.
        k, v = arg.split('=', 1)
        if k not in defaults:
            continue

        # Exact type match on purpose: a bool default is not treated as int.
        kind = type(defaults[k])
        if kind is float:
            opts[k] = float(v)
        elif kind is int:
            opts[k] = int(v)
        else:
            opts[k] = v

    while len(args) < expected_argc:
        args.append(None)
    return opts, args
206 | |
207 | |
class memoize(object):
    '''
    Simple memoizing decorator to cache results.

    Results are stored in `self.cache`, keyed by the tuple of positional
    arguments. Calls whose arguments cannot be hashed (a list, for
    instance) are passed straight through to the wrapped function without
    caching. Keyword arguments are not supported.
    '''
    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        try:
            return self.cache[args]
        except KeyError:
            # Hashable arguments, just not seen before: compute below.
            pass
        except TypeError:
            # uncacheable. a list, for instance.
            # better to not cache than blow up.
            # (An isinstance check on `args` cannot detect this: the tuple
            # itself always looks hashable, whatever it contains.)
            return self.func(*args)

        value = self.func(*args)
        self.cache[args] = value
        return value