Mercurial > repos > iuc > bigwig_outlier_bed
diff bigwig_outlier_bed.py @ 2:61946b8bd43b draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bigwig_outlier_bed commit 3cce4c76a60b9353298fdcf759e893b8fcdfaa77
author | iuc |
---|---|
date | Thu, 25 Jul 2024 14:38:34 +0000 |
parents | 8377a6abb4da |
children | 00b3da7776a0 |
line wrap: on
line diff
--- a/bigwig_outlier_bed.py Sun Jul 21 11:03:36 2024 +0000 +++ b/bigwig_outlier_bed.py Thu Jul 25 14:38:34 2024 +0000 @@ -13,7 +13,6 @@ """ import argparse -import copy import os import sys from pathlib import Path @@ -22,6 +21,90 @@ import pybigtools +class asciihist: + + def __init__( + self, + data, + bins=10, + minmax=None, + str_tag="", + scale_output=80, + generate_only=True, + ): + """ + https://gist.github.com/bgbg/608d9ef4fd75032731651257fe67fc81 + Create an ASCII histogram from an interable of numbers. + Author: Boris Gorelik boris@gorelik.net. based on http://econpy.googlecode.com/svn/trunk/pytrix/pytrix.py + License: MIT + """ + self.data = data + self.minmax = minmax + self.str_tag = str_tag + self.bins = bins + self.generate_only = generate_only + self.scale_output = scale_output + self.itarray = np.asanyarray(self.data) + if self.minmax == "auto": + self.minmax = np.percentile(data, [5, 95]) + if self.minmax[0] == self.minmax[1]: + # for very ugly distributions + self.minmax = None + if self.minmax is not None: + # discard values that are outside minmax range + mn = self.minmax[0] + mx = self.minmax[1] + self.itarray = self.itarray[self.itarray >= mn] + self.itarray = self.itarray[self.itarray <= mx] + + def draw(self): + values, counts = np.unique(self.data, return_counts=True) + if len(values) <= 20: + self.bins = len(values) + ret = [] + if self.itarray.size: + total = len(self.itarray) + counts, cutoffs = np.histogram(self.itarray, bins=self.bins) + cutoffs = cutoffs[1:] + if self.str_tag: + self.str_tag = "%s " % self.str_tag + else: + self.str_tag = "" + if self.scale_output is not None: + scaled_counts = counts.astype(float) / counts.sum() * self.scale_output + else: + scaled_counts = counts + footerbar = "{:s}{:s} |{:s} |".format( + self.str_tag, + "-" * 12, + "-" * 12, + ) + if self.minmax is not None: + ret.append( + "Trimmed to range (%s - %s)" + % (str(self.minmax[0]), str(self.minmax[1])) + ) + for cutoff, original_count, scaled_count in zip( + cutoffs, counts, scaled_counts + ): + ret.append( + "{:s}{:>12.2f} |{:>12,d} | {:s}".format( + self.str_tag, cutoff, original_count, "*" * int(scaled_count) + ) + ) + ret.append(footerbar) + ret.append("{:s}{:>12s} |{:>12,d} |".format(self.str_tag, "N=", total)) + ret.append(footerbar) + ret.append("") + else: + ret = [] + if not self.generate_only: + for line in ret: + print(line) + ret = "\n".join(ret) + return ret + + class findOut: def __init__(self, args): @@ -34,14 +117,16 @@ self.bedouthilo = args.bedouthilo self.tableoutfile = args.tableoutfile self.bedwin = args.minwin - self.qhi = args.qhi self.qlo = None - try: - f = float(args.qlo) - self.qlo = f - except Exception as e: - s = str(e) - print(s, ' qlo=', args.qlo) + self.qhi = None + if args.outbeds != "outtab": + self.qhi = args.qhi + if args.qlo: + try: + f = float(args.qlo) + self.qlo = f + except Exception: + print("qlo not provided") nbw = len(args.bigwig) nlab = len(args.bigwiglabels) if nlab < nbw: @@ -106,10 +191,11 @@ def makeBed(self): bedhi = [] bedlo = [] + restab = [] bwlabels = self.bwlabels bwnames = self.bwnames - if self.tableoutfile: - restab = ["bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"] + bwnames.sort() + reshead = "bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot" for i, bwname in enumerate(bwnames): bwlabel = bwlabels[i].replace(" ", "") fakepath = "in%d.bw" % i @@ -120,16 +206,38 @@ bwf = pybigtools.open(fakepath) chrlist = bwf.chroms() chrs = list(chrlist.keys()) - chrs.sort() for chr in chrs: + first_few = None bw = bwf.values(chr) + values, counts = np.unique(bw, return_counts=True) + nvalues = len(values) + if nvalues <= 20: + histo = "\n".join( + [ + "%s: %f occurs %d times" % (chr, values[x], counts[x]) + for x in range(len(values)) + ] + ) + else: + last10 = range(nvalues - 10, nvalues) + first_few = ["%.2f\t%d" % (values[x], counts[x]) for x in range(10)] + first_few += ["%.2f\t%d" % (values[x], counts[x]) for x in last10] + first_few.insert(0, "First/Last 10 value counts\nValue\tCount") + ha = asciihist(data=bw, bins=20, str_tag="%s_%s" % (bwlabel, chr)) + histo = ha.draw() + histo = ( + "\n".join(first_few) + + "\nHistogram of %s bigwig values\n" % bwlabel + + histo + ) bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered if self.qhi is not None: self.bwtop = np.quantile(bw, self.qhi) bwhi = self.processVals(bw, isTop=True) for j, seg in enumerate(bwhi): - if seg[1] - seg[0] >= self.bedwin: - score = np.sum(bw[seg[0]:seg[1]]) + seglen = seg[1] - seg[0] + if seglen >= self.bedwin: + score = np.sum(bw[seg[0]:seg[1]]) / float(seglen) bedhi.append( ( chr, @@ -144,7 +252,7 @@ bwlo = self.processVals(bw, isTop=False) for j, seg in enumerate(bwlo): if seg[1] - seg[0] >= self.bedwin: - score = -1 * np.sum(bw[seg[0]:seg[1]]) + score = -1 * np.sum(bw[seg[0]:seg[1]]) / float(seglen) bedlo.append( ( chr, @@ -156,7 +264,15 @@ ) if self.tableoutfile: row = self.makeTableRow(bw, bwlabel, chr) - restab.append(copy.copy(row)) + resheadl = reshead.split("\t") + rowl = row.split() + desc = ["%s\t%s" % (resheadl[x], rowl[x]) for x in range(len(rowl))] + desc.insert(0, "Descriptive measures") + descn = "\n".join(desc) + restab.append(descn) + restab.append(histo) + if os.path.isfile(fakepath): + os.remove(fakepath) if self.tableoutfile: stable = "\n".join(restab) with open(self.tableoutfile, "w") as t: @@ -175,7 +291,7 @@ allbed = bedlo + bedhi self.writeBed(allbed, self.bedouthilo) some = True - if not some: + if not ((self.outbeds == "outtab") or some): sys.stderr.write( "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?" )