diff find_str.py @ 20:410144c7b2d6 draft

planemo upload for repository https://github.com/fubar2/microsatbed commit d952bc313f408735456747c3d33e09a3170c8f59-dirty
author fubar
date Wed, 17 Jul 2024 12:08:15 +0000
parents db5523378e5c
children 45f690db0eaf
line wrap: on
line diff
--- a/find_str.py	Wed Jul 17 07:40:00 2024 +0000
+++ b/find_str.py	Wed Jul 17 12:08:15 2024 +0000
@@ -1,4 +1,7 @@
 import argparse
+import shutil
+
+import pybigtools
 
 import pytrf  # 1.3.0
 from pyfastx import Fastx  # 0.5.2
@@ -8,6 +11,15 @@
 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
 """
 
+def getDensity(name, bed, len, winwidth):
+    """Sum row[5] of each bed row into winwidth-wide windows keyed by row[1] (the start).
+    Returns [(name, start, end, total)]; the last window is clipped to len, not dropped."""
+    nwin = len // winwidth
+    d = [0.0] * (nwin + 1)  # extra slot holds the partial window at the sequence end
+    for b in bed:
+        d[int(b[1] // winwidth)] += b[5]  # whole row is credited to its start window
+    # x * winwidth < len keeps the short tail window and skips an empty one when len divides evenly
+    return [(name, x * winwidth, min((x + 1) * winwidth, len), float(d[x])) for x in range(nwin + 1) if x * winwidth < len]
 
 def write_ssrs(args):
     """
@@ -18,11 +30,14 @@
     Sequence read bias might be influenced by GC density or some other specific motif.
     """
     bed = []
+    wig = []
+    chrlens = {}
     specific = None
     if args.specific:
         specific = args.specific.upper().split(",")
     fa = Fastx(args.fasta, uppercase=True)
     for name, seq in fa:
+        cbed = []
         for ssr in pytrf.STRFinder(
                 name,
                 seq,
@@ -43,24 +58,35 @@
             )
             # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
             if args.specific and ssr.motif in specific:
-                bed.append(row)
+                cbed.append(row)
             elif args.mono and len(ssr.motif) == 1:
-                bed.append(row)
+                cbed.append(row)
             elif args.di and len(ssr.motif) == 2:
-                bed.append(row)
+                cbed.append(row)
             elif args.tri and len(ssr.motif) == 3:
-                bed.append(row)
+                cbed.append(row)
             elif args.tetra and len(ssr.motif) == 4:
-                bed.append(row)
+                cbed.append(row)
             elif args.penta and len(ssr.motif) == 5:
-                bed.append(row)
+                cbed.append(row)
             elif args.hexa and len(ssr.motif) == 6:
-                bed.append(row)
-    bed.sort()
-    obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
-    with open(args.bed, "w") as outbed:
-        outbed.write("\n".join(obed))
-        outbed.write("\n")
+                cbed.append(row)
+        bed += cbed
+        if args.bigwig:
+            chrlens[name] = len(seq)
+            w = getDensity(name, cbed, len(seq), args.winwidth)
+            wig += w
+    if args.bigwig:
+        wig.sort()
+        bw = pybigtools.open("temp.bw", 'w')
+        bw.write(chrlens,wig)
+        shutil.move("temp.bw", args.bed)
+    else:
+        bed.sort()
+        obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
+        with open(args.bed, "w") as outbed:
+            outbed.write("\n".join(obed))
+            outbed.write("\n")
 
 
 if __name__ == "__main__":
@@ -80,6 +106,8 @@
     a("--monomin", default=2, type=int)
     a("-f", "--fasta", default="humsamp.fa")
     a("-b", "--bed", default="humsamp.bed")
+    a("--bigwig", action="store_true")
+    a("--winwidth", default=128, type=int)
     a("--specific", default=None)
     a("--minreps", default=2, type=int)
     args = parser.parse_args()