Mercurial > repos > fubar > microsatbed
comparison find_str.py @ 20:410144c7b2d6 draft
planemo upload for repository https://github.com/fubar2/microsatbed commit d952bc313f408735456747c3d33e09a3170c8f59-dirty
| author | fubar |
|---|---|
| date | Wed, 17 Jul 2024 12:08:15 +0000 |
| parents | db5523378e5c |
| children | 45f690db0eaf |
comparison
equal
deleted
inserted
replaced
| 19:db5523378e5c | 20:410144c7b2d6 |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import shutil | |
| 3 | |
| 4 import pybigtools | |
| 2 | 5 |
| 3 import pytrf # 1.3.0 | 6 import pytrf # 1.3.0 |
| 4 from pyfastx import Fastx # 0.5.2 | 7 from pyfastx import Fastx # 0.5.2 |
| 5 | 8 |
| 6 """ | 9 """ |
| 7 Allows all STR or those for a subset of motifs to be written to a bed file | 10 Allows all STR or those for a subset of motifs to be written to a bed file |
| 8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. | 11 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. |
| 9 """ | 12 """ |
| 10 | 13 |
| 14 def getDensity(name, bed, len, winwidth): | |
| 15 nwin = int(len / winwidth) | |
| 16 d = [0.0 for x in range(nwin+1)] | |
| 17 for b in bed: | |
| 18 nt = b[5] | |
| 19 bin = int(b[1]/winwidth) | |
| 20 d[bin] += nt | |
| 21 dw = [(name,x*winwidth,(x+1)*winwidth,float(d[x])) for x in range(nwin+1) if (x+1)*winwidth <= len] | |
| 22 return dw | |
| 11 | 23 |
| 12 def write_ssrs(args): | 24 def write_ssrs(args): |
| 13 """ | 25 """ |
| 14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats | 26 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats |
| 15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3) | 27 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3) |
| 16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. | 28 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. |
| 17 The reversed pair STRs are about as common in the documentation sample. | 29 The reversed pair STRs are about as common in the documentation sample. |
| 18 Sequence read bias might be influenced by GC density or some other specific motif. | 30 Sequence read bias might be influenced by GC density or some other specific motif. |
| 19 """ | 31 """ |
| 20 bed = [] | 32 bed = [] |
| 33 wig = [] | |
| 34 chrlens = {} | |
| 21 specific = None | 35 specific = None |
| 22 if args.specific: | 36 if args.specific: |
| 23 specific = args.specific.upper().split(",") | 37 specific = args.specific.upper().split(",") |
| 24 fa = Fastx(args.fasta, uppercase=True) | 38 fa = Fastx(args.fasta, uppercase=True) |
| 25 for name, seq in fa: | 39 for name, seq in fa: |
| 40 cbed = [] | |
| 26 for ssr in pytrf.STRFinder( | 41 for ssr in pytrf.STRFinder( |
| 27 name, | 42 name, |
| 28 seq, | 43 seq, |
| 29 args.monomin, | 44 args.monomin, |
| 30 args.dimin, | 45 args.dimin, |
| 41 ssr.repeat, | 56 ssr.repeat, |
| 42 ssr.length, | 57 ssr.length, |
| 43 ) | 58 ) |
| 44 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths | 59 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths |
| 45 if args.specific and ssr.motif in specific: | 60 if args.specific and ssr.motif in specific: |
| 46 bed.append(row) | 61 cbed.append(row) |
| 47 elif args.mono and len(ssr.motif) == 1: | 62 elif args.mono and len(ssr.motif) == 1: |
| 48 bed.append(row) | 63 cbed.append(row) |
| 49 elif args.di and len(ssr.motif) == 2: | 64 elif args.di and len(ssr.motif) == 2: |
| 50 bed.append(row) | 65 cbed.append(row) |
| 51 elif args.tri and len(ssr.motif) == 3: | 66 elif args.tri and len(ssr.motif) == 3: |
| 52 bed.append(row) | 67 cbed.append(row) |
| 53 elif args.tetra and len(ssr.motif) == 4: | 68 elif args.tetra and len(ssr.motif) == 4: |
| 54 bed.append(row) | 69 cbed.append(row) |
| 55 elif args.penta and len(ssr.motif) == 5: | 70 elif args.penta and len(ssr.motif) == 5: |
| 56 bed.append(row) | 71 cbed.append(row) |
| 57 elif args.hexa and len(ssr.motif) == 6: | 72 elif args.hexa and len(ssr.motif) == 6: |
| 58 bed.append(row) | 73 cbed.append(row) |
| 59 bed.sort() | 74 bed += cbed |
| 60 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | 75 if args.bigwig: |
| 61 with open(args.bed, "w") as outbed: | 76 chrlens[name] = len(seq) |
| 62 outbed.write("\n".join(obed)) | 77 w = getDensity(name, cbed, len(seq), args.winwidth) |
| 63 outbed.write("\n") | 78 wig += w |
| 79 if args.bigwig: | |
| 80 wig.sort() | |
| 81 bw = pybigtools.open("temp.bw", 'w') | |
| 82 bw.write(chrlens,wig) | |
| 83 shutil.move("temp.bw", args.bed) | |
| 84 else: | |
| 85 bed.sort() | |
| 86 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | |
| 87 with open(args.bed, "w") as outbed: | |
| 88 outbed.write("\n".join(obed)) | |
| 89 outbed.write("\n") | |
| 64 | 90 |
| 65 | 91 |
| 66 if __name__ == "__main__": | 92 if __name__ == "__main__": |
| 67 parser = argparse.ArgumentParser() | 93 parser = argparse.ArgumentParser() |
| 68 a = parser.add_argument | 94 a = parser.add_argument |
| 78 a("--pentamin", default=2, type=int) | 104 a("--pentamin", default=2, type=int) |
| 79 a("--hexamin", default=2, type=int) | 105 a("--hexamin", default=2, type=int) |
| 80 a("--monomin", default=2, type=int) | 106 a("--monomin", default=2, type=int) |
| 81 a("-f", "--fasta", default="humsamp.fa") | 107 a("-f", "--fasta", default="humsamp.fa") |
| 82 a("-b", "--bed", default="humsamp.bed") | 108 a("-b", "--bed", default="humsamp.bed") |
| 109 a("--bigwig", action="store_true") | |
| 110 a("--winwidth", default=128, type=int) | |
| 83 a("--specific", default=None) | 111 a("--specific", default=None) |
| 84 a("--minreps", default=2, type=int) | 112 a("--minreps", default=2, type=int) |
| 85 args = parser.parse_args() | 113 args = parser.parse_args() |
| 86 write_ssrs(args) | 114 write_ssrs(args) |
