microsatbedfubar: find_str.py comparison

comparison find_str.py @ 8:01c16e8fbc91 draft

planemo upload for repository https://github.com/fubar2/microsatbed commit 80a8c0db54b6e2cab9dfe7178b1e5b3b39592f2c-dirty

author	fubar
date	Tue, 13 Aug 2024 04:50:47 +0000
parents	4ff60fb9ca4d
children

comparison

equal deleted inserted replaced

-:f27be15cc58d
+:01c16e8fbc91
 import argparse
+import subprocess
 import pytrf  # 1.3.0
 from pyfastx import Fastx  # 0.5.2
 """
 Allows all STR or those for a subset of motifs to be written to a bed file
 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
 """
+def getDensity(name, bed, chrlen, winwidth):
+"""
+pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them.
+Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig
+"""
+nwin = int(chrlen / winwidth)
+d = [0.0 for x in range(nwin + 1)]
+for b in bed:
+nt = b[5]
+bin = int(b[1] / winwidth)
+d[bin] += nt
+bedg = [
+(name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x]))
+for x in range(nwin + 1)
+if (x + 1) * winwidth <= chrlen
+]
+return bedg
 def write_ssrs(args):
 """
 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
 The reversed pair STRs are about as common in the documentation sample.
 Sequence read bias might be influenced by GC density or some other specific motif.
 """
 bed = []
+wig = []
+chrlens = {}
 specific = None
 if args.specific:
 specific = args.specific.upper().split(",")
 fa = Fastx(args.fasta, uppercase=True)
 for name, seq in fa:
-if args.specific:
+chrlen = len(seq)
-ssrs = pytrf.STRFinder(
+chrlens[name] = chrlen
-name,
+cbed = []
-seq,
+for ssr in pytrf.STRFinder(
-args.minreps,
+name,
-args.minreps,
+seq,
-args.minreps,
+args.monomin,
-args.minreps,
+args.dimin,
-args.minreps,
+args.trimin,
-args.minreps,
+args.tetramin,
-)
+args.pentamin,
-else:
+args.hexamin,
-ssrs = pytrf.STRFinder(
+):
-name,
-seq,
-args.monomin,
-args.dimin,
-args.trimin,
-args.tetramin,
-args.pentamin,
-args.hexamin,
-)
-for ssr in ssrs:
 row = (
 ssr.chrom,
-ssr.start - 1,
+ssr.start,
 ssr.end,
 ssr.motif,
 ssr.repeat,
 ssr.length,
 )
-# pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
 if args.specific and ssr.motif in specific:
-bed.append(row)
+cbed.append(row)
 elif args.mono and len(ssr.motif) == 1:
-bed.append(row)
+cbed.append(row)
 elif args.di and len(ssr.motif) == 2:
-bed.append(row)
+cbed.append(row)
 elif args.tri and len(ssr.motif) == 3:
-bed.append(row)
+cbed.append(row)
 elif args.tetra and len(ssr.motif) == 4:
-bed.append(row)
+cbed.append(row)
 elif args.penta and len(ssr.motif) == 5:
-bed.append(row)
+cbed.append(row)
 elif args.hexa and len(ssr.motif) == 6:
-bed.append(row)
+cbed.append(row)
-bed.sort()
+if args.bigwig:
-obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
+w = getDensity(name, cbed, chrlen, args.winwidth)
-with open(args.bed, "w") as outbed:
+wig += w
-outbed.write("\n".join(obed))
+bed += cbed
-outbed.write("\n")
+if args.bigwig:
+wig.sort()
+bedg = ["%s %d %d %.2f" % x for x in wig]
+with open("temp.bedg", "w") as bw:
+bw.write("\n".join(bedg))
+chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()]
+with open("temp.chromlen", "w") as cl:
+cl.write("\n".join(chroms))
+cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed]
+subprocess.run(cmd)
+else:
+bed.sort()
+obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
+with open(args.bed, "w") as outbed:
+outbed.write("\n".join(obed))
+outbed.write("\n")
 if __name__ == "__main__":
 parser = argparse.ArgumentParser()
 a = parser.add_argument
 a("--pentamin", default=2, type=int)
 a("--hexamin", default=2, type=int)
 a("--monomin", default=2, type=int)
 a("-f", "--fasta", default="humsamp.fa")
 a("-b", "--bed", default="humsamp.bed")
+a("--bigwig", action="store_true")
+a("--winwidth", default=128, type=int)
 a("--specific", default=None)
 a("--minreps", default=2, type=int)
 args = parser.parse_args()
 write_ssrs(args)

Mercurial > repos > fubar > microsatbedfubar

comparison find_str.py @ 8:01c16e8fbc91 draft