microsatbedfubar: find_str.py comparison

comparison find_str.py @ 7:f27be15cc58d draft

Uploaded

author	fubar
date	Sun, 14 Jul 2024 23:34:26 +0000
parents	4ff60fb9ca4d
children	01c16e8fbc91

comparison

equal deleted inserted replaced

-:c5324bf8a52c
+:f27be15cc58d
+import argparse
+import pytrf  # 1.3.0
+from pyfastx import Fastx  # 0.5.2
+"""
+Allows all STR or those for a subset of motifs to be written to a bed file
+Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP.
+"""
def write_ssrs(args):
    """
    Find short tandem repeats (STRs) in a fasta file and write them to a sorted bed file.

    args is the parsed command line namespace and must provide:
      fasta, bed        input fasta path and output bed path
      specific          optional comma separated motif list, e.g. "GA,AG"
      minreps           minimum repeats for every motif length when specific is given
      monomin..hexamin  minimum repeats per motif length when specific is not given
      mono..hexa        flags selecting all STRs with a motif of that length

    An STR is written if its motif is in the specific list or if the flag for its
    motif length is set. Each bed row is chrom, start, end, motif_repeats, length.

    The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats
    ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3)
    NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester.
    The reversed pair STRs are about as common in the documentation sample.
    Sequence read bias might be influenced by GC density or some other specific motif.
    """
    row_format = "%s\t%d\t%d\t%s_%d\t%d"
    # Build the motif lookup once. Whitespace around entries and empty entries
    # are dropped so "GA, AG" selects both motifs instead of "GA" and " AG".
    motifs = frozenset()
    if args.specific:
        motifs = frozenset(
            m.strip() for m in args.specific.upper().split(",") if m.strip()
        )
        # a single threshold applies to every motif length for specific motifs
        minimums = (args.minreps,) * 6
    else:
        minimums = (
            args.monomin,
            args.dimin,
            args.trimin,
            args.tetramin,
            args.pentamin,
            args.hexamin,
        )
    # motif lengths (1..6) whose flag was given on the command line
    length_flags = (args.mono, args.di, args.tri, args.tetra, args.penta, args.hexa)
    lengths = frozenset(n for n, flag in enumerate(length_flags, start=1) if flag)
    bed = []
    fa = Fastx(args.fasta, uppercase=True)
    for name, seq in fa:
        for ssr in pytrf.STRFinder(name, seq, *minimums):
            motif = ssr.motif
            if motif in motifs or len(motif) in lengths:
                # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths
                bed.append(
                    (
                        ssr.chrom,
                        ssr.start - 1,
                        ssr.end,
                        motif,
                        ssr.repeat,
                        ssr.length,
                    )
                )
    bed.sort()
    with open(args.bed, "w") as outbed:
        # one line per row; nothing at all is written when no STR was selected,
        # so an empty result is an empty file rather than a single blank line
        outbed.writelines(row_format % row + "\n" for row in bed)
if __name__ == "__main__":
    # Command line entry point: declare the options, parse them and hand the
    # resulting namespace to write_ssrs.
    parser = argparse.ArgumentParser()
    # order is kept as di..hexa then mono so the generated help is unchanged
    units = ("di", "tri", "tetra", "penta", "hexa", "mono")
    for unit in units:
        # boolean switch: report every STR whose motif has this length
        parser.add_argument("--" + unit, action="store_true")
    for unit in units:
        # minimum repeat count for this motif length
        parser.add_argument("--" + unit + "min", default=2, type=int)
    parser.add_argument("-f", "--fasta", default="humsamp.fa")
    parser.add_argument("-b", "--bed", default="humsamp.bed")
    # comma separated motifs; when given, --minreps applies to all motif lengths
    parser.add_argument("--specific", default=None)
    parser.add_argument("--minreps", default=2, type=int)
    write_ssrs(parser.parse_args())

Mercurial > repos > fubar > microsatbedfubar

comparison find_str.py @ 7:f27be15cc58d draft