Mercurial > repos > iuc > ngsutils_bam_filter
diff ngsutils/support/ngs_utils.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
line wrap: on
line diff
#!/usr/bin/env python
"""
Common util classes / functions for the NGS project.

The helpers here are written to run unchanged on both Python 2 and
Python 3 (no ``xrange``, no reliance on ``collections.Hashable``).
"""
import sys
import os
import gzip
import re
import collections


# Splits a string into alternating non-digit / digit runs; the capturing
# group makes re.split() keep the digit runs in the result.
_DIGIT_RUN = re.compile(r'(\d+)')


def format_number(n):
    '''
    Return str(n) with commas between each group of three integer digits.

    A leading sign and anything after a decimal point are left untouched,
    so negative numbers and floats are grouped correctly.  Values whose
    integer part is not plain digits (e.g. 'nan', '1e+20') are returned
    as str(n) unchanged.

    >>> format_number(1000)
    '1,000'
    >>> format_number(1234567)
    '1,234,567'
    >>> format_number(-1234)
    '-1,234'
    >>> format_number(1234.5)
    '1,234.5'
    '''
    text = str(n)

    sign = ''
    if text[:1] in ('-', '+'):
        sign, text = text[0], text[1:]

    whole, dot, frac = text.partition('.')
    if not whole.isdigit():
        # Not a plain digit string; grouping would only corrupt it.
        return sign + text

    # Peel off three digits at a time from the right-hand side.
    groups = []
    while whole:
        groups.append(whole[-3:])
        whole = whole[:-3]
    groups.reverse()

    return sign + ','.join(groups) + dot + frac


def natural_sort(ar):
    '''
    Return the strings in `ar` sorted so that embedded digit runs compare
    numerically rather than character by character.

    >>> natural_sort('1 3 4 2 5'.split())
    ['1', '2', '3', '4', '5']
    >>> natural_sort('1 10 20 2 3 4'.split())
    ['1', '2', '3', '4', '10', '20']
    '''
    decorated = []
    for item in ar:
        key = []
        # The split result alternates text, digits, text, ... so the same
        # position always holds the same type in every key, which keeps
        # the keys comparable with each other.
        for piece in _DIGIT_RUN.split(item):
            try:
                key.append(int(piece))
            except ValueError:
                key.append(piece)
        decorated.append((key, item))

    decorated.sort()
    return [item for _key, item in decorated]


def dictify(values, colnames):
    """
    Convert a list of values into a dictionary based upon given column names.

    If the column name starts with an '@', the value is assumed to be a comma
    separated list.

    If the name starts with a '#', the value is assumed to be an int.

    If the name starts with '@#', the value is assumed to a comma separated
    list of ints.

    The '@' / '#' markers are stripped from the resulting keys.  Trailing
    commas in list values are ignored.  Columns that have no corresponding
    entry in `values` are mapped to None.
    """
    d = {}
    for i, key in enumerate(colnames):
        split = False
        num = False

        # '@' must be tested before '#' so that '@#name' is recognised.
        if key.startswith('@'):
            key = key[1:]
            split = True
        if key.startswith('#'):
            key = key[1:]
            num = True

        if i >= len(values):
            d[key] = None
            continue

        raw = values[i]
        if split:
            val = raw.rstrip(',').split(',')
            if num:
                val = [int(x) for x in val]
        elif num:
            val = int(raw)
        else:
            val = raw

        d[key] = val

    return d


def gzip_aware_open(fname):
    '''
    Open `fname` for reading and return the file object.

    '-' returns sys.stdin; names ending in '.gz' or '.bgz' are opened with
    gzip.open(); anything else with the builtin open().  A leading '~' is
    expanded for real paths.

    NOTE: gzip.open() is called with its default mode, which on Python 3
    is binary and yields bytes, whereas open() yields str there.
    '''
    if fname == '-':
        return sys.stdin

    path = os.path.expanduser(fname)
    if fname.endswith(('.gz', '.bgz')):
        return gzip.open(path)
    return open(path)


class gzip_opener(object):
    '''
    A Python 2.6 class to handle 'with' opening of text files that may
    or may not be gzip compressed.

    The file is opened via gzip_aware_open() on entry and closed on exit,
    except for sys.stdin which is left open.
    '''
    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        self.f = gzip_aware_open(self.fname)
        return self.f

    def __exit__(self, type, value, traceback):
        # Never close stdin: the rest of the process may still need it.
        if self.f is not sys.stdin:
            self.f.close()
        # Returning False lets any exception from the 'with' body propagate.
        return False


def filenames_to_uniq(names, new_delim='.'):
    '''
    Given a set of file names, produce a list of names consisting of the
    uniq parts of the names. This works from the end of the name. Chunks of
    the name are split on '.' and '-'.

    The kept chunks of each name are re-joined with `new_delim`.

    For example:
        A.foo.bar.txt
        B.foo.bar.txt
        returns: ['A','B']

        AA.BB.foo.txt
        CC.foo.txt
        returns: ['AA.BB','CC']

    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
    ['a', 'b']
    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
    ['a.b', 'c']

    '''
    # Each name becomes its list of chunks in reverse order, so index 0 is
    # the last chunk (typically the extension).
    name_words = []
    maxlen = 0
    for name in names:
        words = name.replace('.', ' ').replace('-', ' ').strip().split()
        words.reverse()
        name_words.append(words)
        maxlen = max(maxlen, len(words))

    # common[i] is True when every name has a chunk at position i and all
    # of those chunks are identical.
    common = []
    for i in range(maxlen):
        first = name_words[0][i] if i < len(name_words[0]) else None
        common.append(
            first is not None and
            all(i < len(words) and words[i] == first for words in name_words)
        )

    newnames = []
    for words in name_words:
        kept = [w for w, is_common in zip(words, common) if not is_common]
        kept.reverse()
        newnames.append(new_delim.join(kept))
    return newnames


def parse_args(argv, defaults=None, expected_argc=0):
    '''
    Minimal command line parser.

    Arguments starting with '-' are options (any number of leading dashes
    is stripped); everything else is a positional argument.

      * '-name' / '--name'      -> opts['name'] = True
      * '--name=value'          -> opts['name'] = value, converted to float
                                   or int when defaults['name'] is a float
                                   or an int; otherwise kept as a string.
                                   Only the first '=' separates name from
                                   value.

    `defaults` (optional dict) seeds the returned options.  The positional
    list is padded with None up to `expected_argc` entries.

    Returns a tuple (opts, args).
    Raises ValueError if a value cannot be converted to the default's type.
    '''
    if defaults is None:
        defaults = {}

    opts = {}
    opts.update(defaults)

    args = []
    for token in argv:
        if not token.startswith('-'):
            args.append(token)
            continue

        arg = token.lstrip('-')
        if '=' not in arg:
            opts[arg] = True
            continue

        k, v = arg.split('=', 1)
        # Exact type match on purpose: a bool default must not be treated
        # as an int.
        kind = type(defaults[k]) if k in defaults else None
        if kind is float:
            opts[k] = float(v)
        elif kind is int:
            opts[k] = int(v)
        else:
            opts[k] = v

    while len(args) < expected_argc:
        args.append(None)
    return opts, args


class memoize(object):
    '''
    Simple memoizing decorator to cache results.

    Results are cached per tuple of positional arguments.  Calls whose
    arguments are not hashable (a list, for instance) are passed straight
    through to the wrapped function without caching.
    '''
    def __init__(self, func):
        self.func = func
        self.cache = {}

    def __call__(self, *args):
        try:
            # The args tuple itself always claims to be hashable; only
            # hashing it reveals an unhashable element inside.
            hash(args)
        except TypeError:
            # uncacheable. a list, for instance.
            # better to not cache than blow up.
            return self.func(*args)

        if args in self.cache:
            return self.cache[args]

        value = self.func(*args)
        self.cache[args] = value
        return value