comparison microsatbed/find_str.py @ 0:50a1636fde68 draft default tip

Uploaded
author fubar
date Sun, 14 Jul 2024 02:32:13 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:50a1636fde68
1 import argparse
2
3 import pytrf # 1.3.0
4 from pyfastx import Fastx # 0.5.2
5
6 """
7 Allows all STR or those for a subset of motifs to be written to a bed file
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
9 """
10
11
12 def write_ssrs(args):
13 """
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
17 The reversed pair STRs are about as common in the documentation sample.
18 Sequence read bias might be influenced by GC density or some other specific motif.
19 """
20 bed = []
21 specific = None
22 if args.specific:
23 specific = args.specific.upper().split(",")
24 fa = Fastx(args.fasta, uppercase=True)
25 for name, seq in fa:
26 if args.specific:
27 ssrs = pytrf.STRFinder(
28 name,
29 seq,
30 args.minreps,
31 args.minreps,
32 args.minreps,
33 args.minreps,
34 args.minreps,
35 args.minreps,
36 )
37 else:
38 ssrs = pytrf.STRFinder(
39 name,
40 seq,
41 args.monomin,
42 args.dimin,
43 args.trimin,
44 args.tetramin,
45 args.pentamin,
46 args.hexamin,
47 )
48 for ssr in ssrs:
49 row = (
50 ssr.chrom,
51 ssr.start - 1,
52 ssr.end,
53 ssr.motif,
54 ssr.repeat,
55 ssr.length,
56 )
57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
58 if args.specific and ssr.motif in specific:
59 bed.append(row)
60 elif args.mono and len(ssr.motif) == 1:
61 bed.append(row)
62 elif args.di and len(ssr.motif) == 2:
63 bed.append(row)
64 elif args.tri and len(ssr.motif) == 3:
65 bed.append(row)
66 elif args.tetra and len(ssr.motif) == 4:
67 bed.append(row)
68 elif args.penta and len(ssr.motif) == 5:
69 bed.append(row)
70 elif args.hexa and len(ssr.motif) == 6:
71 bed.append(row)
72 bed.sort()
73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
74 with open(args.bed, "w") as outbed:
75 outbed.write("\n".join(obed))
76 outbed.write("\n")
77
78
79 if __name__ == "__main__":
80 parser = argparse.ArgumentParser()
81 a = parser.add_argument
82 a("--di", action="store_true")
83 a("--tri", action="store_true")
84 a("--tetra", action="store_true")
85 a("--penta", action="store_true")
86 a("--hexa", action="store_true")
87 a("--mono", action="store_true")
88 a("--dimin", default=2, type=int)
89 a("--trimin", default=2, type=int)
90 a("--tetramin", default=2, type=int)
91 a("--pentamin", default=2, type=int)
92 a("--hexamin", default=2, type=int)
93 a("--monomin", default=2, type=int)
94 a("-f", "--fasta", default="humsamp.fa")
95 a("-b", "--bed", default="humsamp.bed")
96 a("--specific", default=None)
97 a("--minreps", default=2, type=int)
98 args = parser.parse_args()
99 write_ssrs(args)