comparison find_str.py @ 20:410144c7b2d6 draft

planemo upload for repository https://github.com/fubar2/microsatbed commit d952bc313f408735456747c3d33e09a3170c8f59-dirty
author fubar
date Wed, 17 Jul 2024 12:08:15 +0000
parents db5523378e5c
children 45f690db0eaf
comparison
equal deleted inserted replaced
19:db5523378e5c 20:410144c7b2d6
1 import argparse 1 import argparse
2 import shutil
3
4 import pybigtools
2 5
3 import pytrf # 1.3.0 6 import pytrf # 1.3.0
4 from pyfastx import Fastx # 0.5.2 7 from pyfastx import Fastx # 0.5.2
5 8
6 """ 9 """
7 Allows all STR or those for a subset of motifs to be written to a bed file 10 Allows all STR or those for a subset of motifs to be written to a bed file
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. 11 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
9 """ 12 """
10 13
14 def getDensity(name, bed, len, winwidth):
15 nwin = int(len / winwidth)
16 d = [0.0 for x in range(nwin+1)]
17 for b in bed:
18 nt = b[5]
19 bin = int(b[1]/winwidth)
20 d[bin] += nt
21 dw = [(name,x*winwidth,(x+1)*winwidth,float(d[x])) for x in range(nwin+1) if (x+1)*winwidth <= len]
22 return dw
11 23
12 def write_ssrs(args): 24 def write_ssrs(args):
13 """ 25 """
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats 26 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3) 27 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. 28 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
17 The reversed pair STRs are about as common in the documentation sample. 29 The reversed pair STRs are about as common in the documentation sample.
18 Sequence read bias might be influenced by GC density or some other specific motif. 30 Sequence read bias might be influenced by GC density or some other specific motif.
19 """ 31 """
20 bed = [] 32 bed = []
33 wig = []
34 chrlens = {}
21 specific = None 35 specific = None
22 if args.specific: 36 if args.specific:
23 specific = args.specific.upper().split(",") 37 specific = args.specific.upper().split(",")
24 fa = Fastx(args.fasta, uppercase=True) 38 fa = Fastx(args.fasta, uppercase=True)
25 for name, seq in fa: 39 for name, seq in fa:
40 cbed = []
26 for ssr in pytrf.STRFinder( 41 for ssr in pytrf.STRFinder(
27 name, 42 name,
28 seq, 43 seq,
29 args.monomin, 44 args.monomin,
30 args.dimin, 45 args.dimin,
41 ssr.repeat, 56 ssr.repeat,
42 ssr.length, 57 ssr.length,
43 ) 58 )
44 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths 59 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
45 if args.specific and ssr.motif in specific: 60 if args.specific and ssr.motif in specific:
46 bed.append(row) 61 cbed.append(row)
47 elif args.mono and len(ssr.motif) == 1: 62 elif args.mono and len(ssr.motif) == 1:
48 bed.append(row) 63 cbed.append(row)
49 elif args.di and len(ssr.motif) == 2: 64 elif args.di and len(ssr.motif) == 2:
50 bed.append(row) 65 cbed.append(row)
51 elif args.tri and len(ssr.motif) == 3: 66 elif args.tri and len(ssr.motif) == 3:
52 bed.append(row) 67 cbed.append(row)
53 elif args.tetra and len(ssr.motif) == 4: 68 elif args.tetra and len(ssr.motif) == 4:
54 bed.append(row) 69 cbed.append(row)
55 elif args.penta and len(ssr.motif) == 5: 70 elif args.penta and len(ssr.motif) == 5:
56 bed.append(row) 71 cbed.append(row)
57 elif args.hexa and len(ssr.motif) == 6: 72 elif args.hexa and len(ssr.motif) == 6:
58 bed.append(row) 73 cbed.append(row)
59 bed.sort() 74 bed += cbed
60 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] 75 if args.bigwig:
61 with open(args.bed, "w") as outbed: 76 chrlens[name] = len(seq)
62 outbed.write("\n".join(obed)) 77 w = getDensity(name, cbed, len(seq), args.winwidth)
63 outbed.write("\n") 78 wig += w
79 if args.bigwig:
80 wig.sort()
81 bw = pybigtools.open("temp.bw", 'w')
82 bw.write(chrlens,wig)
83 shutil.move("temp.bw", args.bed)
84 else:
85 bed.sort()
86 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
87 with open(args.bed, "w") as outbed:
88 outbed.write("\n".join(obed))
89 outbed.write("\n")
64 90
65 91
66 if __name__ == "__main__": 92 if __name__ == "__main__":
67 parser = argparse.ArgumentParser() 93 parser = argparse.ArgumentParser()
68 a = parser.add_argument 94 a = parser.add_argument
78 a("--pentamin", default=2, type=int) 104 a("--pentamin", default=2, type=int)
79 a("--hexamin", default=2, type=int) 105 a("--hexamin", default=2, type=int)
80 a("--monomin", default=2, type=int) 106 a("--monomin", default=2, type=int)
81 a("-f", "--fasta", default="humsamp.fa") 107 a("-f", "--fasta", default="humsamp.fa")
82 a("-b", "--bed", default="humsamp.bed") 108 a("-b", "--bed", default="humsamp.bed")
109 a("--bigwig", action="store_true")
110 a("--winwidth", default=128, type=int)
83 a("--specific", default=None) 111 a("--specific", default=None)
84 a("--minreps", default=2, type=int) 112 a("--minreps", default=2, type=int)
85 args = parser.parse_args() 113 args = parser.parse_args()
86 write_ssrs(args) 114 write_ssrs(args)