Mercurial > repos > fubar > bigwig_outlier_bed

diff bigwig_outlier_bed.xml @ 0:c71db540eb38 draft
planemo upload for repository https://github.com/jackh726/bigtools commit ce6b9f638ebcebcad5a5b10219f252962f30e5cc-dirty
author: fubar
date: Mon, 01 Jul 2024 02:48:46 +0000
children: a7d26bca0a3b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bigwig_outlier_bed.xml	Mon Jul 01 02:48:46 2024 +0000
@@ -0,0 +1,212 @@
+<tool name="bigwig_outlier_bed" id="bigwigoutlierbed" version="0.01" profile="22.05">
+  <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay-->
+  <!--Created by toolfactory@galaxy.org at 30/06/2024 19:44:14 using the Galaxy Tool Factory.-->
+  <description>Writes high and low bigwig regions as features in a bed file</description>
+  <edam_topics>
+      <edam_topic>topic_0157</edam_topic>
+      <edam_topic>topic_0092</edam_topic>
+  </edam_topics>
+  <edam_operations>
+      <edam_operation>operation_0337</edam_operation>
+  </edam_operations>  
+  <requirements>
+    <requirement type="package" version="3.12.3">python</requirement>
+    <requirement type="package" version="2.0.0">numpy</requirement>
+    <requirement type="package" version="0.1.4">pybigtools</requirement>
+  </requirements>
+  <version_command><![CDATA[python -c "import pybigtools; from importlib.metadata import version; print(version('pybigtools'))"]]></version_command>
+  <command><![CDATA[python
+'$runme'
+--bigwig
+## A multiple="true" data parameter renders as a single comma-joined string when
+## used directly, so each selected dataset is passed as its own argument instead.
+#for $bw in $bigwig
+'$bw'
+#end for
+--bedouthilo
+'$bedouthilo'
+--minwin
+'$minwin'
+--qhi
+'$qhi'
+--qlo
+'$qlo'
+## NOTE(review): the script's --tableout option takes a file path, but only the bare
+## flag is passed here and no table output dataset is declared - confirm how the
+## "set" choice should be wired before relying on it.
+#if $tableout == "set"
+ --tableout
+#end if
+--bigwiglabels
+'$bigwiglabels']]></command>
+  <configfiles>
+    <configfile name="runme"><![CDATA[#raw
+"""
+Bigwigs are great, but hard to reliably "see" small low coverage or small very high coverage regions.
+Colouring in JB2 tracks will need a new plugin, so this code will find bigwig regions above and below a chosen percentile point.
+0.99 and 0.01 work well in testing with a minimum span of 10 bp.
+Multiple bigwigs **with the same reference** can be combined - bed segments will be named appropriately
+Combining multiple references works but is silly because display will rely on one reference so features mapped to other references will not appear.
+
+Tricksy numpy method from http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
+takes about 95 seconds for a 17MB test wiggle
+JBrowse2 bed normally displays ignore the score, so could provide separate low/high bed file outputs as an option.
+Update june 30 2024: wrote a 'no-build' plugin for beds to display red/blue if >0/<0 so those are used for scores
+Bed interval naming must be short for JB2 but needs input bigwig name and (lo or hi).
+"""
+
+import argparse
+import numpy as np
+import pybigtools
+import sys
+from pathlib import Path
+
+
+class findOut():
+    def __init__(self, args):
+        self.bwnames=args.bigwig
+        self.bwlabels=args.bigwiglabels
+        self.bedwin=args.minwin
+        self.qlo=args.qlo
+        self.qhi=args.qhi
+        self.bedouthilo=args.bedouthilo
+        self.bedouthi=args.bedouthi
+        self.bedoutlo=args.bedoutlo
+        self.tableout = args.tableout
+        self.bedwin = args.minwin
+        self.qhi = args.qhi
+        self.qlo = args.qlo
+        self.makeBed()
+
+    def processVals(self, bw, isTop):
+        # http://gregoryzynda.com/python/numpy/contiguous/interval/2019/11/29/contiguous-regions.html
+        if isTop:
+            bwex = np.r_[False, bw >= self.bwtop, False] # extend with 0s
+        else:
+            bwex = np.r_[False, bw <= self.bwbot, False]
+        bwexd = np.diff(bwex)
+        bwexdnz = bwexd.nonzero()[0]
+        bwregions = np.reshape(bwexdnz, (-1,2))
+        return bwregions
+
+    def writeBed(self, bed, bedfname):
+        """
+        potentially multiple
+        """
+        bed.sort()
+        beds = ['%s\t%d\t%d\t%s\t%d' % x for x in bed]
+        with open(bedfname, "w") as bedf:
+            bedf.write('\n'.join(beds))
+            bedf.write('\n')
+        print('Wrote %d bed regions to %s' % (len(bed), bedfname))
+
+    def makeBed(self):
+        bedhi = []
+        bedlo = []
+        bwlabels = self.bwlabels
+        bwnames = self.bwnames
+        print('bwnames=', bwnames, "bwlabs=", bwlabels)
+        for i, bwname in enumerate(bwnames):
+            bwlabel = bwlabels[i].replace(" ",'')
+            p = Path('in.bw')
+            p.symlink_to( bwname ) # required by pybigtools (!)
+            bwf = pybigtools.open('in.bw')
+            chrlist = bwf.chroms()
+            chrs = list(chrlist.keys())
+            chrs.sort()
+            restab = ["contig\tn\tmean\tstd\tmin\tmax\tqtop\tqbot"]
+            for chr in chrs:
+                bw = bwf.values(chr)
+                bw = bw[~np.isnan(bw)] # some have NaN if parts of a contig not covered
+                if self.qhi is not None:
+                    self.bwtop = np.quantile(bw, self.qhi)
+                    bwhi = self.processVals(bw, isTop=True)
+                    for i, seg in enumerate(bwhi):
+                        if seg[1] - seg[0] >= self.bedwin:
+                            bedhi.append((chr, seg[0], seg[1], '%s_hi' % (bwlabel), 1))
+                if self.qlo is not None:
+                    self.bwbot = np.quantile(bw, self.qlo)
+                    bwlo = self.processVals(bw, isTop=False)            
+                    for i, seg in enumerate(bwlo):
+                        if seg[1] - seg[0] >= self.bedwin:
+                            bedlo.append((chr, seg[0], seg[1], '%s_lo' % (bwlabel), -1))
+                bwmean = np.mean(bw)
+                bwstd = np.std(bw)
+                bwmax = np.max(bw)
+                nrow = np.size(bw)
+                bwmin = np.min(bw)
+                restab.append('%s\t%d\t%f\t%f\t%f\t%f\t%f\t%f' % (chr,nrow,bwmean,bwstd,bwmin,bwmax,self.bwtop,self.bwbot))        
+        print('\n'.join(restab), '\n')
+        if self.tableout:
+            with open(self.tableout) as t:
+                t.write('\n'.join(restab))
+                t.write('\n')
+        if self.bedoutlo:
+            if self.qlo:
+                self.writeBed(bedlo, self.bedoutlo)
+        if self.bedouthi:
+            if self.qhi:
+                self.writeBed(bedhi, self.bedouthi)
+        if self.bedouthilo:
+            allbed = bedlo + bedhi
+            self.writeBed(allbed, self.bedouthilo)
+        return restab
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    a = parser.add_argument
+    a('-m', '--minwin',default=10, type=int)
+    a('-l', '--qlo',default=None, type=float)
+    a('-i', '--qhi',default=None, type=float)
+    a('-w', '--bigwig', nargs='+')
+    a('-n', '--bigwiglabels', nargs='+')
+    a('-o', '--bedouthilo', default=None, help="optional high and low combined bed")
+    a('-u', '--bedouthi', default=None, help="optional high only bed")
+    a('-b', '--bedoutlo', default=None, help="optional low only bed")
+    a('-t', '--tableout', default=None)
+    args = parser.parse_args()
+    print('args=', args)
+    if not (args.bedouthilo or args.bedouthi or args.bedoutlo):
+        sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need a bed output choice - must be one of low only, high only or both combined")
+        sys.exit(2)
+    if not (args.qlo or args.qhi):
+        sys.stderr.write("bigwig_outlier_bed.py cannot usefully run - need one or both of quantile cutpoints qhi and qlo")
+        sys.exit(2)
+    restab = findOut(args)
+    if args.tableout:
+        with open(args.tableout, 'w') as tout:
+            tout.write('\n'.join(restab))
+            tout.write('\n')
+#end raw]]></configfile>
+  </configfiles>
+  <inputs>
+    <!-- The quantile cutoffs are limited to the range 0 to 1 and the window to at least 1, so invalid values are rejected in the form instead of failing inside the script. -->
+    <param name="bigwig" type="data" optional="false" label="Bigwig file(s) to process. " help="If more than one, MUST all use the same reference sequence to be displayable. Feature names will include the bigwig label." format="bigwig" multiple="true"/>
+    <param name="minwin" type="integer" value="10" min="1" label="Minimum continuous bases to count as a high or low bed feature" help="Actual run length will be found and used for continuous features as long or longer."/>
+    <param name="qhi" type="float" value="0.99" min="0.0" max="1.0" label="Quantile cutoff for a high region - 0.99 will cut off at or above the 99th percentile" help=""/>
+    <param name="qlo" type="float" value="0.01" min="0.0" max="1.0" label="Quantile cutoff for a low region - 0.01 will cut off at or below the 1st percentile." help=""/>
+    <param name="tableout" type="select" label="Write a table showing contig statistics for each bigwig" help="" display="radio">
+      <option value="notset">Do not set this flag</option>
+      <option value="set">Set this flag</option>
+    </param>
+    <param name="bigwiglabels" type="text" value="outbed" label="Label to use in bed feature names to indicate source bigwig contents - such as coverage" help=""/>
+  </inputs>
+  <outputs>
+    <!-- Only the combined high and low bed is exposed as a Galaxy output. The wrapped script's separate high only, low only and table outputs (its bedouthi, bedoutlo and tableout options) have no dataset declared here. -->
+    <data name="bedouthilo" format="bed" label="Both high and low contiguous regions as long or longer than window length into one bed " hidden="false"/>
+  </outputs>
+  <tests>
+    <!-- One run on a single bigwig with a window of 10, cutoffs of 0.99 and 0.01 and the table flag left unset; the bed output must match the expected file exactly (lines_diff="0"). -->
+    <test>
+      <output name="bedouthilo" value="bedouthilo_sample" compare="diff" lines_diff="0"/>
+      <param name="bigwig" value="bigwig_sample"/>
+      <param name="minwin" value="10"/>
+      <param name="qhi" value="0.99"/>
+      <param name="qlo" value="0.01"/>
+      <param name="tableout" value="notset"/>
+      <param name="bigwiglabels" value="outbed"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+ **What it Does**
+ 
+ Takes one or more bigwigs mapped to the same reference and finds all the minimum window sized or greater contiguous regions above or below an upper and lower quantile cutoff.
+ A window size of 10 works well, and quantiles set at 0.01 and 0.99 will generally work well.
+ 
+  ]]></help>
+  <citations>
+    <citation type="doi">10.1093/bioinformatics/btae350</citation>
+  </citations>
+</tool>
+
author	fubar
date	Mon, 01 Jul 2024 02:48:46 +0000
parents
children	a7d26bca0a3b