0
|
1 import argparse
|
|
2
|
|
3 import pytrf # 1.3.0
|
|
4 from pyfastx import Fastx # 0.5.2
|
|
5
|
|
6 """
|
|
7 Allows all STR or those for a subset of motifs to be written to a bed file
|
|
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
|
|
9 """
|
|
10
|
|
11
|
|
12 def write_ssrs(args):
|
|
13 """
|
|
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
|
|
15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
|
|
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
|
|
17 The reversed pair STRs are about as common in the documentation sample.
|
|
18 Sequence read bias might be influenced by GC density or some other specific motif.
|
|
19 """
|
|
20 bed = []
|
|
21 specific = None
|
|
22 if args.specific:
|
|
23 specific = args.specific.upper().split(",")
|
|
24 fa = Fastx(args.fasta, uppercase=True)
|
|
25 for name, seq in fa:
|
|
26 if args.specific:
|
|
27 ssrs = pytrf.STRFinder(
|
|
28 name,
|
|
29 seq,
|
|
30 args.minreps,
|
|
31 args.minreps,
|
|
32 args.minreps,
|
|
33 args.minreps,
|
|
34 args.minreps,
|
|
35 args.minreps,
|
|
36 )
|
|
37 else:
|
|
38 ssrs = pytrf.STRFinder(
|
|
39 name,
|
|
40 seq,
|
|
41 args.monomin,
|
|
42 args.dimin,
|
|
43 args.trimin,
|
|
44 args.tetramin,
|
|
45 args.pentamin,
|
|
46 args.hexamin,
|
|
47 )
|
|
48 for ssr in ssrs:
|
|
49 row = (
|
|
50 ssr.chrom,
|
|
51 ssr.start - 1,
|
|
52 ssr.end,
|
|
53 ssr.motif,
|
|
54 ssr.repeat,
|
|
55 ssr.length,
|
|
56 )
|
|
57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
|
|
58 if args.specific and ssr.motif in specific:
|
|
59 bed.append(row)
|
|
60 elif args.mono and len(ssr.motif) == 1:
|
|
61 bed.append(row)
|
|
62 elif args.di and len(ssr.motif) == 2:
|
|
63 bed.append(row)
|
|
64 elif args.tri and len(ssr.motif) == 3:
|
|
65 bed.append(row)
|
|
66 elif args.tetra and len(ssr.motif) == 4:
|
|
67 bed.append(row)
|
|
68 elif args.penta and len(ssr.motif) == 5:
|
|
69 bed.append(row)
|
|
70 elif args.hexa and len(ssr.motif) == 6:
|
|
71 bed.append(row)
|
|
72 bed.sort()
|
|
73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
|
|
74 with open(args.bed, "w") as outbed:
|
|
75 outbed.write("\n".join(obed))
|
|
76 outbed.write("\n")
|
|
77
|
|
78
|
|
79 if __name__ == "__main__":
|
|
80 parser = argparse.ArgumentParser()
|
|
81 a = parser.add_argument
|
|
82 a("--di", action="store_true")
|
|
83 a("--tri", action="store_true")
|
|
84 a("--tetra", action="store_true")
|
|
85 a("--penta", action="store_true")
|
|
86 a("--hexa", action="store_true")
|
|
87 a("--mono", action="store_true")
|
|
88 a("--dimin", default=2, type=int)
|
|
89 a("--trimin", default=2, type=int)
|
|
90 a("--tetramin", default=2, type=int)
|
|
91 a("--pentamin", default=2, type=int)
|
|
92 a("--hexamin", default=2, type=int)
|
|
93 a("--monomin", default=2, type=int)
|
|
94 a("-f", "--fasta", default="humsamp.fa")
|
|
95 a("-b", "--bed", default="humsamp.bed")
|
|
96 a("--specific", default=None)
|
|
97 a("--minreps", default=2, type=int)
|
|
98 args = parser.parse_args()
|
|
99 write_ssrs(args)
|