Mercurial > repos > fubar > microsatbedfubar
diff find_str.py @ 8:01c16e8fbc91 draft
planemo upload for repository https://github.com/fubar2/microsatbed commit 80a8c0db54b6e2cab9dfe7178b1e5b3b39592f2c-dirty
author | fubar |
---|---|
date | Tue, 13 Aug 2024 04:50:47 +0000 |
parents | 4ff60fb9ca4d |
children |
line wrap: on
line diff
--- a/find_str.py Sun Jul 14 23:34:26 2024 +0000 +++ b/find_str.py Tue Aug 13 04:50:47 2024 +0000 @@ -1,4 +1,5 @@ import argparse +import subprocess import pytrf # 1.3.0 from pyfastx import Fastx # 0.5.2 @@ -9,6 +10,25 @@ """ +def getDensity(name, bed, chrlen, winwidth): + """ + pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them. + Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig + """ + nwin = int(chrlen / winwidth) + d = [0.0 for x in range(nwin + 1)] + for b in bed: + nt = b[5] + bin = int(b[1] / winwidth) + d[bin] += nt + bedg = [ + (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x])) + for x in range(nwin + 1) + if (x + 1) * winwidth <= chrlen + ] + return bedg + + def write_ssrs(args): """ The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats @@ -18,62 +38,68 @@ Sequence read bias might be influenced by GC density or some other specific motif. """ bed = [] + wig = [] + chrlens = {} specific = None if args.specific: specific = args.specific.upper().split(",") fa = Fastx(args.fasta, uppercase=True) for name, seq in fa: - if args.specific: - ssrs = pytrf.STRFinder( - name, - seq, - args.minreps, - args.minreps, - args.minreps, - args.minreps, - args.minreps, - args.minreps, - ) - else: - ssrs = pytrf.STRFinder( - name, - seq, - args.monomin, - args.dimin, - args.trimin, - args.tetramin, - args.pentamin, - args.hexamin, - ) - for ssr in ssrs: + chrlen = len(seq) + chrlens[name] = chrlen + cbed = [] + for ssr in pytrf.STRFinder( + name, + seq, + args.monomin, + args.dimin, + args.trimin, + args.tetramin, + args.pentamin, + args.hexamin, + ): row = ( ssr.chrom, - ssr.start - 1, + ssr.start, ssr.end, ssr.motif, ssr.repeat, ssr.length, ) - # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths if args.specific and ssr.motif in specific: - bed.append(row) + cbed.append(row) elif args.mono and len(ssr.motif) == 1: - bed.append(row) + cbed.append(row) elif args.di and len(ssr.motif) == 2: - bed.append(row) + cbed.append(row) elif args.tri and len(ssr.motif) == 3: - bed.append(row) + cbed.append(row) elif args.tetra and len(ssr.motif) == 4: - bed.append(row) + cbed.append(row) elif args.penta and len(ssr.motif) == 5: - bed.append(row) + cbed.append(row) elif args.hexa and len(ssr.motif) == 6: - bed.append(row) - bed.sort() - obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] - with open(args.bed, "w") as outbed: - outbed.write("\n".join(obed)) - outbed.write("\n") + cbed.append(row) + if args.bigwig: + w = getDensity(name, cbed, chrlen, args.winwidth) + wig += w + bed += cbed + if args.bigwig: + wig.sort() + bedg = ["%s %d %d %.2f" % x for x in wig] + with open("temp.bedg", "w") as bw: + bw.write("\n".join(bedg)) + chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()] + with open("temp.chromlen", "w") as cl: + cl.write("\n".join(chroms)) + cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed] + subprocess.run(cmd) + else: + bed.sort() + obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] + with open(args.bed, "w") as outbed: + outbed.write("\n".join(obed)) + outbed.write("\n") if __name__ == "__main__": @@ -93,6 +119,8 @@ a("--monomin", default=2, type=int) a("-f", "--fasta", default="humsamp.fa") a("-b", "--bed", default="humsamp.bed") + a("--bigwig", action="store_true") + a("--winwidth", default=128, type=int) a("--specific", default=None) a("--minreps", default=2, type=int) args = parser.parse_args()