Mercurial > repos > fubar > microsatbedfubar
comparison find_str.py @ 8:01c16e8fbc91 draft
planemo upload for repository https://github.com/fubar2/microsatbed commit 80a8c0db54b6e2cab9dfe7178b1e5b3b39592f2c-dirty
| author | fubar |
|---|---|
| date | Tue, 13 Aug 2024 04:50:47 +0000 |
| parents | 4ff60fb9ca4d |
| children |
comparison
equal
deleted
inserted
replaced
| 7:f27be15cc58d | 8:01c16e8fbc91 |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import subprocess | |
| 2 | 3 |
| 3 import pytrf # 1.3.0 | 4 import pytrf # 1.3.0 |
| 4 from pyfastx import Fastx # 0.5.2 | 5 from pyfastx import Fastx # 0.5.2 |
| 5 | 6 |
| 6 """ | 7 """ |
| 7 Allows all STR or those for a subset of motifs to be written to a bed file | 8 Allows all STR or those for a subset of motifs to be written to a bed file |
| 8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. | 9 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. |
| 9 """ | 10 """ |
| 11 | |
| 12 | |
| 13 def getDensity(name, bed, chrlen, winwidth): | |
| 14 """ | |
| 15 pybigtools can write bigwigs and they are processed by other ucsc tools - but jb2 will not read them. | |
| 16 Swapped the conversion to use a bedgraph file processed by bedGraphToBigWig | |
| 17 """ | |
| 18 nwin = int(chrlen / winwidth) | |
| 19 d = [0.0 for x in range(nwin + 1)] | |
| 20 for b in bed: | |
| 21 nt = b[5] | |
| 22 bin = int(b[1] / winwidth) | |
| 23 d[bin] += nt | |
| 24 bedg = [ | |
| 25 (name, (x * winwidth), ((x + 1) * winwidth) - 1, float(d[x])) | |
| 26 for x in range(nwin + 1) | |
| 27 if (x + 1) * winwidth <= chrlen | |
| 28 ] | |
| 29 return bedg | |
| 10 | 30 |
| 11 | 31 |
| 12 def write_ssrs(args): | 32 def write_ssrs(args): |
| 13 """ | 33 """ |
| 14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats | 34 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats |
| 16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. | 36 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. |
| 17 The reversed pair STRs are about as common in the documentation sample. | 37 The reversed pair STRs are about as common in the documentation sample. |
| 18 Sequence read bias might be influenced by GC density or some other specific motif. | 38 Sequence read bias might be influenced by GC density or some other specific motif. |
| 19 """ | 39 """ |
| 20 bed = [] | 40 bed = [] |
| 41 wig = [] | |
| 42 chrlens = {} | |
| 21 specific = None | 43 specific = None |
| 22 if args.specific: | 44 if args.specific: |
| 23 specific = args.specific.upper().split(",") | 45 specific = args.specific.upper().split(",") |
| 24 fa = Fastx(args.fasta, uppercase=True) | 46 fa = Fastx(args.fasta, uppercase=True) |
| 25 for name, seq in fa: | 47 for name, seq in fa: |
| 26 if args.specific: | 48 chrlen = len(seq) |
| 27 ssrs = pytrf.STRFinder( | 49 chrlens[name] = chrlen |
| 28 name, | 50 cbed = [] |
| 29 seq, | 51 for ssr in pytrf.STRFinder( |
| 30 args.minreps, | 52 name, |
| 31 args.minreps, | 53 seq, |
| 32 args.minreps, | 54 args.monomin, |
| 33 args.minreps, | 55 args.dimin, |
| 34 args.minreps, | 56 args.trimin, |
| 35 args.minreps, | 57 args.tetramin, |
| 36 ) | 58 args.pentamin, |
| 37 else: | 59 args.hexamin, |
| 38 ssrs = pytrf.STRFinder( | 60 ): |
| 39 name, | |
| 40 seq, | |
| 41 args.monomin, | |
| 42 args.dimin, | |
| 43 args.trimin, | |
| 44 args.tetramin, | |
| 45 args.pentamin, | |
| 46 args.hexamin, | |
| 47 ) | |
| 48 for ssr in ssrs: | |
| 49 row = ( | 61 row = ( |
| 50 ssr.chrom, | 62 ssr.chrom, |
| 51 ssr.start - 1, | 63 ssr.start, |
| 52 ssr.end, | 64 ssr.end, |
| 53 ssr.motif, | 65 ssr.motif, |
| 54 ssr.repeat, | 66 ssr.repeat, |
| 55 ssr.length, | 67 ssr.length, |
| 56 ) | 68 ) |
| 57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths | |
| 58 if args.specific and ssr.motif in specific: | 69 if args.specific and ssr.motif in specific: |
| 59 bed.append(row) | 70 cbed.append(row) |
| 60 elif args.mono and len(ssr.motif) == 1: | 71 elif args.mono and len(ssr.motif) == 1: |
| 61 bed.append(row) | 72 cbed.append(row) |
| 62 elif args.di and len(ssr.motif) == 2: | 73 elif args.di and len(ssr.motif) == 2: |
| 63 bed.append(row) | 74 cbed.append(row) |
| 64 elif args.tri and len(ssr.motif) == 3: | 75 elif args.tri and len(ssr.motif) == 3: |
| 65 bed.append(row) | 76 cbed.append(row) |
| 66 elif args.tetra and len(ssr.motif) == 4: | 77 elif args.tetra and len(ssr.motif) == 4: |
| 67 bed.append(row) | 78 cbed.append(row) |
| 68 elif args.penta and len(ssr.motif) == 5: | 79 elif args.penta and len(ssr.motif) == 5: |
| 69 bed.append(row) | 80 cbed.append(row) |
| 70 elif args.hexa and len(ssr.motif) == 6: | 81 elif args.hexa and len(ssr.motif) == 6: |
| 71 bed.append(row) | 82 cbed.append(row) |
| 72 bed.sort() | 83 if args.bigwig: |
| 73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | 84 w = getDensity(name, cbed, chrlen, args.winwidth) |
| 74 with open(args.bed, "w") as outbed: | 85 wig += w |
| 75 outbed.write("\n".join(obed)) | 86 bed += cbed |
| 76 outbed.write("\n") | 87 if args.bigwig: |
| 88 wig.sort() | |
| 89 bedg = ["%s %d %d %.2f" % x for x in wig] | |
| 90 with open("temp.bedg", "w") as bw: | |
| 91 bw.write("\n".join(bedg)) | |
| 92 chroms = ["%s\t%s" % (x, chrlens[x]) for x in chrlens.keys()] | |
| 93 with open("temp.chromlen", "w") as cl: | |
| 94 cl.write("\n".join(chroms)) | |
| 95 cmd = ["bedGraphToBigWig", "temp.bedg", "temp.chromlen", args.bed] | |
| 96 subprocess.run(cmd) | |
| 97 else: | |
| 98 bed.sort() | |
| 99 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | |
| 100 with open(args.bed, "w") as outbed: | |
| 101 outbed.write("\n".join(obed)) | |
| 102 outbed.write("\n") | |
| 77 | 103 |
| 78 | 104 |
| 79 if __name__ == "__main__": | 105 if __name__ == "__main__": |
| 80 parser = argparse.ArgumentParser() | 106 parser = argparse.ArgumentParser() |
| 81 a = parser.add_argument | 107 a = parser.add_argument |
| 91 a("--pentamin", default=2, type=int) | 117 a("--pentamin", default=2, type=int) |
| 92 a("--hexamin", default=2, type=int) | 118 a("--hexamin", default=2, type=int) |
| 93 a("--monomin", default=2, type=int) | 119 a("--monomin", default=2, type=int) |
| 94 a("-f", "--fasta", default="humsamp.fa") | 120 a("-f", "--fasta", default="humsamp.fa") |
| 95 a("-b", "--bed", default="humsamp.bed") | 121 a("-b", "--bed", default="humsamp.bed") |
| 122 a("--bigwig", action="store_true") | |
| 123 a("--winwidth", default=128, type=int) | |
| 96 a("--specific", default=None) | 124 a("--specific", default=None) |
| 97 a("--minreps", default=2, type=int) | 125 a("--minreps", default=2, type=int) |
| 98 args = parser.parse_args() | 126 args = parser.parse_args() |
| 99 write_ssrs(args) | 127 write_ssrs(args) |
