comparison find_str.py @ 0:4ff60fb9ca4d draft

planemo upload for repository https://github.com/fubar2/microsatbed commit 7ceb6658309a7ababe622b5d92e729e5470e22f0-dirty
author fubar
date Sat, 13 Jul 2024 13:38:39 +0000
parents
children 01c16e8fbc91
comparison
equal deleted inserted replaced
-1:000000000000 0:4ff60fb9ca4d
1 import argparse
2
3 import pytrf # 1.3.0
4 from pyfastx import Fastx # 0.5.2
5
6 """
7 Allows all STR or those for a subset of motifs to be written to a bed file
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
9 """
10
11
12 def write_ssrs(args):
13 """
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
17 The reversed pair STRs are about as common in the documentation sample.
18 Sequence read bias might be influenced by GC density or some other specific motif.
19 """
20 bed = []
21 specific = None
22 if args.specific:
23 specific = args.specific.upper().split(",")
24 fa = Fastx(args.fasta, uppercase=True)
25 for name, seq in fa:
26 if args.specific:
27 ssrs = pytrf.STRFinder(
28 name,
29 seq,
30 args.minreps,
31 args.minreps,
32 args.minreps,
33 args.minreps,
34 args.minreps,
35 args.minreps,
36 )
37 else:
38 ssrs = pytrf.STRFinder(
39 name,
40 seq,
41 args.monomin,
42 args.dimin,
43 args.trimin,
44 args.tetramin,
45 args.pentamin,
46 args.hexamin,
47 )
48 for ssr in ssrs:
49 row = (
50 ssr.chrom,
51 ssr.start - 1,
52 ssr.end,
53 ssr.motif,
54 ssr.repeat,
55 ssr.length,
56 )
57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
58 if args.specific and ssr.motif in specific:
59 bed.append(row)
60 elif args.mono and len(ssr.motif) == 1:
61 bed.append(row)
62 elif args.di and len(ssr.motif) == 2:
63 bed.append(row)
64 elif args.tri and len(ssr.motif) == 3:
65 bed.append(row)
66 elif args.tetra and len(ssr.motif) == 4:
67 bed.append(row)
68 elif args.penta and len(ssr.motif) == 5:
69 bed.append(row)
70 elif args.hexa and len(ssr.motif) == 6:
71 bed.append(row)
72 bed.sort()
73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
74 with open(args.bed, "w") as outbed:
75 outbed.write("\n".join(obed))
76 outbed.write("\n")
77
78
79 if __name__ == "__main__":
80 parser = argparse.ArgumentParser()
81 a = parser.add_argument
82 a("--di", action="store_true")
83 a("--tri", action="store_true")
84 a("--tetra", action="store_true")
85 a("--penta", action="store_true")
86 a("--hexa", action="store_true")
87 a("--mono", action="store_true")
88 a("--dimin", default=2, type=int)
89 a("--trimin", default=2, type=int)
90 a("--tetramin", default=2, type=int)
91 a("--pentamin", default=2, type=int)
92 a("--hexamin", default=2, type=int)
93 a("--monomin", default=2, type=int)
94 a("-f", "--fasta", default="humsamp.fa")
95 a("-b", "--bed", default="humsamp.bed")
96 a("--specific", default=None)
97 a("--minreps", default=2, type=int)
98 args = parser.parse_args()
99 write_ssrs(args)