diff bigwig_outlier_bed.py @ 2:61946b8bd43b draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bigwig_outlier_bed commit 3cce4c76a60b9353298fdcf759e893b8fcdfaa77
author iuc
date Thu, 25 Jul 2024 14:38:34 +0000
parents 8377a6abb4da
children 00b3da7776a0
line wrap: on
line diff
--- a/bigwig_outlier_bed.py	Sun Jul 21 11:03:36 2024 +0000
+++ b/bigwig_outlier_bed.py	Thu Jul 25 14:38:34 2024 +0000
@@ -13,7 +13,6 @@
 """
 
 import argparse
-import copy
 import os
 import sys
 from pathlib import Path
@@ -22,6 +21,90 @@
 import pybigtools
 
 
+class asciihist:
+
+    def __init__(
+        self,
+        data,
+        bins=10,
+        minmax=None,
+        str_tag="",
+        scale_output=80,
+        generate_only=True,
+    ):
+        """
+        https://gist.github.com/bgbg/608d9ef4fd75032731651257fe67fc81
+        Create an ASCII histogram from an interable of numbers.
+        Author: Boris Gorelik boris@gorelik.net. based on  http://econpy.googlecode.com/svn/trunk/pytrix/pytrix.py
+        License: MIT
+        """
+        self.data = data
+        self.minmax = minmax
+        self.str_tag = str_tag
+        self.bins = bins
+        self.generate_only = generate_only
+        self.scale_output = scale_output
+        self.itarray = np.asanyarray(self.data)
+        if self.minmax == "auto":
+            self.minmax = np.percentile(data, [5, 95])
+            if self.minmax[0] == self.minmax[1]:
+                # for very ugly distributions
+                self.minmax = None
+        if self.minmax is not None:
+            # discard values that are outside minmax range
+            mn = self.minmax[0]
+            mx = self.minmax[1]
+            self.itarray = self.itarray[self.itarray >= mn]
+            self.itarray = self.itarray[self.itarray <= mx]
+
+    def draw(self):
+        values, counts = np.unique(self.data, return_counts=True)
+        if len(values) <= 20:
+            self.bins = len(values)
+        ret = []
+        if self.itarray.size:
+            total = len(self.itarray)
+            counts, cutoffs = np.histogram(self.itarray, bins=self.bins)
+            cutoffs = cutoffs[1:]
+            if self.str_tag:
+                self.str_tag = "%s " % self.str_tag
+            else:
+                self.str_tag = ""
+            if self.scale_output is not None:
+                scaled_counts = counts.astype(float) / counts.sum() * self.scale_output
+            else:
+                scaled_counts = counts
+            footerbar = "{:s}{:s} |{:s} |".format(
+                self.str_tag,
+                "-" * 12,
+                "-" * 12,
+            )
+            if self.minmax is not None:
+                ret.append(
+                    "Trimmed to range (%s - %s)"
+                    % (str(self.minmax[0]), str(self.minmax[1]))
+                )
+            for cutoff, original_count, scaled_count in zip(
+                cutoffs, counts, scaled_counts
+            ):
+                ret.append(
+                    "{:s}{:>12.2f} |{:>12,d} | {:s}".format(
+                        self.str_tag, cutoff, original_count, "*" * int(scaled_count)
+                    )
+                )
+            ret.append(footerbar)
+            ret.append("{:s}{:>12s} |{:>12,d} |".format(self.str_tag, "N=", total))
+            ret.append(footerbar)
+            ret.append("")
+        else:
+            ret = []
+        if not self.generate_only:
+            for line in ret:
+                print(line)
+        ret = "\n".join(ret)
+        return ret
+
+
 class findOut:
 
     def __init__(self, args):
@@ -34,14 +117,16 @@
         self.bedouthilo = args.bedouthilo
         self.tableoutfile = args.tableoutfile
         self.bedwin = args.minwin
-        self.qhi = args.qhi
         self.qlo = None
-        try:
-            f = float(args.qlo)
-            self.qlo = f
-        except Exception as e:
-            s = str(e)
-            print(s, ' qlo=', args.qlo)
+        self.qhi = None
+        if args.outbeds != "outtab":
+            self.qhi = args.qhi
+            if args.qlo:
+                try:
+                    f = float(args.qlo)
+                    self.qlo = f
+                except Exception:
+                    print("qlo not provided")
         nbw = len(args.bigwig)
         nlab = len(args.bigwiglabels)
         if nlab < nbw:
@@ -106,10 +191,11 @@
     def makeBed(self):
         bedhi = []
         bedlo = []
+        restab = []
         bwlabels = self.bwlabels
         bwnames = self.bwnames
-        if self.tableoutfile:
-            restab = ["bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"]
+        bwnames.sort()
+        reshead = "bigwig\tcontig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"
         for i, bwname in enumerate(bwnames):
             bwlabel = bwlabels[i].replace(" ", "")
             fakepath = "in%d.bw" % i
@@ -120,16 +206,38 @@
             bwf = pybigtools.open(fakepath)
             chrlist = bwf.chroms()
             chrs = list(chrlist.keys())
-            chrs.sort()
             for chr in chrs:
+                first_few = None
                 bw = bwf.values(chr)
+                values, counts = np.unique(bw, return_counts=True)
+                nvalues = len(values)
+                if nvalues <= 20:
+                    histo = "\n".join(
+                        [
+                            "%s: %f occurs %d times" % (chr, values[x], counts[x])
+                            for x in range(len(values))
+                        ]
+                    )
+                else:
+                    last10 = range(nvalues - 10, nvalues)
+                    first_few = ["%.2f\t%d" % (values[x], counts[x]) for x in range(10)]
+                    first_few += ["%.2f\t%d" % (values[x], counts[x]) for x in last10]
+                    first_few.insert(0, "First/Last 10 value counts\nValue\tCount")
+                    ha = asciihist(data=bw, bins=20, str_tag="%s_%s" % (bwlabel, chr))
+                    histo = ha.draw()
+                    histo = (
+                        "\n".join(first_few)
+                        + "\nHistogram of %s bigwig values\n" % bwlabel
+                        + histo
+                    )
                 bw = bw[~np.isnan(bw)]  # some have NaN if parts of a contig not covered
                 if self.qhi is not None:
                     self.bwtop = np.quantile(bw, self.qhi)
                     bwhi = self.processVals(bw, isTop=True)
                     for j, seg in enumerate(bwhi):
-                        if seg[1] - seg[0] >= self.bedwin:
-                            score = np.sum(bw[seg[0]:seg[1]])
+                        seglen = seg[1] - seg[0]
+                        if seglen >= self.bedwin:
+                            score = np.sum(bw[seg[0]:seg[1]]) / float(seglen)
                             bedhi.append(
                                 (
                                     chr,
@@ -144,7 +252,7 @@
                     bwlo = self.processVals(bw, isTop=False)
                     for j, seg in enumerate(bwlo):
                         if seg[1] - seg[0] >= self.bedwin:
-                            score = -1 * np.sum(bw[seg[0]:seg[1]])
+                            score = -1 * np.sum(bw[seg[0]:seg[1]]) / float(seglen)
                             bedlo.append(
                                 (
                                     chr,
@@ -156,7 +264,15 @@
                             )
                 if self.tableoutfile:
                     row = self.makeTableRow(bw, bwlabel, chr)
-                    restab.append(copy.copy(row))
+                    resheadl = reshead.split("\t")
+                    rowl = row.split()
+                    desc = ["%s\t%s" % (resheadl[x], rowl[x]) for x in range(len(rowl))]
+                    desc.insert(0, "Descriptive measures")
+                    descn = "\n".join(desc)
+                    restab.append(descn)
+                    restab.append(histo)
+        if os.path.isfile(fakepath):
+            os.remove(fakepath)
         if self.tableoutfile:
             stable = "\n".join(restab)
             with open(self.tableoutfile, "w") as t:
@@ -175,7 +291,7 @@
             allbed = bedlo + bedhi
             self.writeBed(allbed, self.bedouthilo)
             some = True
-        if not some:
+        if not ((self.outbeds == "outtab") or some):
             sys.stderr.write(
                 "Invalid configuration - no output could be created. Was qlo missing and only low output requested for example?"
             )