diff find_str.py @ 8:01c16e8fbc91 draft

planemo upload for repository https://github.com/fubar2/microsatbed commit 80a8c0db54b6e2cab9dfe7178b1e5b3b39592f2c-dirty
author fubar
date Tue, 13 Aug 2024 04:50:47 +0000
parents 4ff60fb9ca4d
children
line wrap: on
line diff
--- a/find_str.py	Sun Jul 14 23:34:26 2024 +0000
+++ b/find_str.py	Tue Aug 13 04:50:47 2024 +0000
@@ -1,4 +1,5 @@
 import argparse
+import subprocess
 
 import pytrf  # 1.3.0
 from pyfastx import Fastx  # 0.5.2
@@ -9,6 +10,25 @@
 """
 
 
+def getDensity(name, bed, chrlen, winwidth):
+    """
+    pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them.
+    Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig
+    """
+    nwin = int(chrlen / winwidth)
+    d = [0.0 for x in range(nwin + 1)]
+    for b in bed:
+        nt = b[5]
+        bin = int(b[1] / winwidth)
+        d[bin] += nt
+    bedg = [
+        (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x]))
+        for x in range(nwin + 1)
+        if (x + 1) * winwidth <= chrlen
+    ]
+    return bedg
+
+
 def write_ssrs(args):
     """
     The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
@@ -18,62 +38,68 @@
     Sequence read bias might be influenced by GC density or some other specific motif.
     """
     bed = []
+    wig = []
+    chrlens = {}
     specific = None
     if args.specific:
         specific = args.specific.upper().split(",")
     fa = Fastx(args.fasta, uppercase=True)
     for name, seq in fa:
-        if args.specific:
-            ssrs = pytrf.STRFinder(
-                name,
-                seq,
-                args.minreps,
-                args.minreps,
-                args.minreps,
-                args.minreps,
-                args.minreps,
-                args.minreps,
-            )
-        else:
-            ssrs = pytrf.STRFinder(
-                name,
-                seq,
-                args.monomin,
-                args.dimin,
-                args.trimin,
-                args.tetramin,
-                args.pentamin,
-                args.hexamin,
-            )
-        for ssr in ssrs:
+        chrlen = len(seq)
+        chrlens[name] = chrlen
+        cbed = []
+        for ssr in pytrf.STRFinder(
+            name,
+            seq,
+            args.monomin,
+            args.dimin,
+            args.trimin,
+            args.tetramin,
+            args.pentamin,
+            args.hexamin,
+        ):
             row = (
                 ssr.chrom,
-                ssr.start - 1,
+                ssr.start,
                 ssr.end,
                 ssr.motif,
                 ssr.repeat,
                 ssr.length,
             )
-            # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
             if args.specific and ssr.motif in specific:
-                bed.append(row)
+                cbed.append(row)
             elif args.mono and len(ssr.motif) == 1:
-                bed.append(row)
+                cbed.append(row)
             elif args.di and len(ssr.motif) == 2:
-                bed.append(row)
+                cbed.append(row)
             elif args.tri and len(ssr.motif) == 3:
-                bed.append(row)
+                cbed.append(row)
             elif args.tetra and len(ssr.motif) == 4:
-                bed.append(row)
+                cbed.append(row)
             elif args.penta and len(ssr.motif) == 5:
-                bed.append(row)
+                cbed.append(row)
             elif args.hexa and len(ssr.motif) == 6:
-                bed.append(row)
-    bed.sort()
-    obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
-    with open(args.bed, "w") as outbed:
-        outbed.write("\n".join(obed))
-        outbed.write("\n")
+                cbed.append(row)
+        if args.bigwig:
+            w = getDensity(name, cbed, chrlen, args.winwidth)
+            wig += w
+        bed += cbed
+    if args.bigwig:
+        wig.sort()
+        bedg = ["%s %d %d %.2f" % x for x in wig]
+        with open("temp.bedg", "w") as bw:
+            bw.write("\n".join(bedg))
+        chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()]
+        with open("temp.chromlen", "w") as cl:
+            cl.write("\n".join(chroms))
+        cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed]
+        subprocess.run(cmd)
+    else:
+        bed.sort()
+        obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
+        with open(args.bed, "w") as outbed:
+            outbed.write("\n".join(obed))
+            outbed.write("\n")
 
 
 if __name__ == "__main__":
@@ -93,6 +119,8 @@
     a("--monomin", default=2, type=int)
     a("-f", "--fasta", default="humsamp.fa")
     a("-b", "--bed", default="humsamp.bed")
+    a("--bigwig", action="store_true")
+    a("--winwidth", default=128, type=int)
     a("--specific", default=None)
     a("--minreps", default=2, type=int)
     args = parser.parse_args()