comparison find_str.py @ 7:f27be15cc58d draft

Uploaded
author fubar
date Sun, 14 Jul 2024 23:34:26 +0000
parents 4ff60fb9ca4d
children 01c16e8fbc91
comparison
equal deleted inserted replaced
6:c5324bf8a52c 7:f27be15cc58d
1 import argparse
2
3 import pytrf # 1.3.0
4 from pyfastx import Fastx # 0.5.2
5
6 """
7 Allows all STR or those for a subset of motifs to be written to a bed file
8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
9 """
10
11
12 def write_ssrs(args):
13 """
14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
17 The reversed pair STRs are about as common in the documentation sample.
18 Sequence read bias might be influenced by GC density or some other specific motif.
19 """
20 bed = []
21 specific = None
22 if args.specific:
23 specific = args.specific.upper().split(",")
24 fa = Fastx(args.fasta, uppercase=True)
25 for name, seq in fa:
26 if args.specific:
27 ssrs = pytrf.STRFinder(
28 name,
29 seq,
30 args.minreps,
31 args.minreps,
32 args.minreps,
33 args.minreps,
34 args.minreps,
35 args.minreps,
36 )
37 else:
38 ssrs = pytrf.STRFinder(
39 name,
40 seq,
41 args.monomin,
42 args.dimin,
43 args.trimin,
44 args.tetramin,
45 args.pentamin,
46 args.hexamin,
47 )
48 for ssr in ssrs:
49 row = (
50 ssr.chrom,
51 ssr.start - 1,
52 ssr.end,
53 ssr.motif,
54 ssr.repeat,
55 ssr.length,
56 )
57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
58 if args.specific and ssr.motif in specific:
59 bed.append(row)
60 elif args.mono and len(ssr.motif) == 1:
61 bed.append(row)
62 elif args.di and len(ssr.motif) == 2:
63 bed.append(row)
64 elif args.tri and len(ssr.motif) == 3:
65 bed.append(row)
66 elif args.tetra and len(ssr.motif) == 4:
67 bed.append(row)
68 elif args.penta and len(ssr.motif) == 5:
69 bed.append(row)
70 elif args.hexa and len(ssr.motif) == 6:
71 bed.append(row)
72 bed.sort()
73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed]
74 with open(args.bed, "w") as outbed:
75 outbed.write("\n".join(obed))
76 outbed.write("\n")
77
78
79 if __name__ == "__main__":
80 parser = argparse.ArgumentParser()
81 a = parser.add_argument
82 a("--di", action="store_true")
83 a("--tri", action="store_true")
84 a("--tetra", action="store_true")
85 a("--penta", action="store_true")
86 a("--hexa", action="store_true")
87 a("--mono", action="store_true")
88 a("--dimin", default=2, type=int)
89 a("--trimin", default=2, type=int)
90 a("--tetramin", default=2, type=int)
91 a("--pentamin", default=2, type=int)
92 a("--hexamin", default=2, type=int)
93 a("--monomin", default=2, type=int)
94 a("-f", "--fasta", default="humsamp.fa")
95 a("-b", "--bed", default="humsamp.bed")
96 a("--specific", default=None)
97 a("--minreps", default=2, type=int)
98 args = parser.parse_args()
99 write_ssrs(args)