Mercurial > repos > fubar > rosstest
comparison microsatbed/find_str.py @ 0:50a1636fde68 draft default tip
Uploaded
| author | fubar |
|---|---|
| date | Sun, 14 Jul 2024 02:32:13 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:50a1636fde68 |
|---|---|
| 1 import argparse | |
| 2 | |
| 3 import pytrf # 1.3.0 | |
| 4 from pyfastx import Fastx # 0.5.2 | |
| 5 | |
| 6 """ | |
| 7 Allows all STR or those for a subset of motifs to be written to a bed file | |
| 8 Designed to build some of the microsatellite tracks from https://github.com/arangrhie/T2T-Polish/tree/master/pattern for the VGP. | |
| 9 """ | |
| 10 | |
| 11 | |
| 12 def write_ssrs(args): | |
| 13 """ | |
| 14 The integers in the call change the minimum repeats for mono-, di-, tri-, tetra-, penta-, hexa-nucleotide repeats | |
| 15 ssrs = pytrf.STRFinder(name, seq, 10, 6, 4, 3, 3, 3) | |
| 16 NOTE: Dinucleotides GA and AG are reported separately by https://github.com/marbl/seqrequester. | |
| 17 The reversed pair STRs are about as common in the documentation sample. | |
| 18 Sequence read bias might be influenced by GC density or some other specific motif. | |
| 19 """ | |
| 20 bed = [] | |
| 21 specific = None | |
| 22 if args.specific: | |
| 23 specific = args.specific.upper().split(",") | |
| 24 fa = Fastx(args.fasta, uppercase=True) | |
| 25 for name, seq in fa: | |
| 26 if args.specific: | |
| 27 ssrs = pytrf.STRFinder( | |
| 28 name, | |
| 29 seq, | |
| 30 args.minreps, | |
| 31 args.minreps, | |
| 32 args.minreps, | |
| 33 args.minreps, | |
| 34 args.minreps, | |
| 35 args.minreps, | |
| 36 ) | |
| 37 else: | |
| 38 ssrs = pytrf.STRFinder( | |
| 39 name, | |
| 40 seq, | |
| 41 args.monomin, | |
| 42 args.dimin, | |
| 43 args.trimin, | |
| 44 args.tetramin, | |
| 45 args.pentamin, | |
| 46 args.hexamin, | |
| 47 ) | |
| 48 for ssr in ssrs: | |
| 49 row = ( | |
| 50 ssr.chrom, | |
| 51 ssr.start - 1, | |
| 52 ssr.end, | |
| 53 ssr.motif, | |
| 54 ssr.repeat, | |
| 55 ssr.length, | |
| 56 ) | |
| 57 # pytrf reports a 1 based start position so start-1 fixes the bed interval lengths | |
| 58 if args.specific and ssr.motif in specific: | |
| 59 bed.append(row) | |
| 60 elif args.mono and len(ssr.motif) == 1: | |
| 61 bed.append(row) | |
| 62 elif args.di and len(ssr.motif) == 2: | |
| 63 bed.append(row) | |
| 64 elif args.tri and len(ssr.motif) == 3: | |
| 65 bed.append(row) | |
| 66 elif args.tetra and len(ssr.motif) == 4: | |
| 67 bed.append(row) | |
| 68 elif args.penta and len(ssr.motif) == 5: | |
| 69 bed.append(row) | |
| 70 elif args.hexa and len(ssr.motif) == 6: | |
| 71 bed.append(row) | |
| 72 bed.sort() | |
| 73 obed = ["%s\t%d\t%d\t%s_%d\t%d" % x for x in bed] | |
| 74 with open(args.bed, "w") as outbed: | |
| 75 outbed.write("\n".join(obed)) | |
| 76 outbed.write("\n") | |
| 77 | |
| 78 | |
| 79 if __name__ == "__main__": | |
| 80 parser = argparse.ArgumentParser() | |
| 81 a = parser.add_argument | |
| 82 a("--di", action="store_true") | |
| 83 a("--tri", action="store_true") | |
| 84 a("--tetra", action="store_true") | |
| 85 a("--penta", action="store_true") | |
| 86 a("--hexa", action="store_true") | |
| 87 a("--mono", action="store_true") | |
| 88 a("--dimin", default=2, type=int) | |
| 89 a("--trimin", default=2, type=int) | |
| 90 a("--tetramin", default=2, type=int) | |
| 91 a("--pentamin", default=2, type=int) | |
| 92 a("--hexamin", default=2, type=int) | |
| 93 a("--monomin", default=2, type=int) | |
| 94 a("-f", "--fasta", default="humsamp.fa") | |
| 95 a("-b", "--bed", default="humsamp.bed") | |
| 96 a("--specific", default=None) | |
| 97 a("--minreps", default=2, type=int) | |
| 98 args = parser.parse_args() | |
| 99 write_ssrs(args) |
