Mercurial > repos > fubar > jbrowse2
comparison filter_multihit_paf.py @ 135:21bb464c1d53 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 78bf7abb931bf3d348837c7211cd3cff32486691
author | fubar |
---|---|
date | Sun, 15 Dec 2024 23:47:40 +0000 |
parents | 69c6ea16c148 |
children |
comparison
equal
deleted
inserted
replaced
134:ed3a21033188 | 135:21bb464c1d53 |
---|---|
1 # idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py | |
2 # adds filter for more than minMatch denovo repeated paf read locations: >1 gives ~13%, >3 gives 3% of all hits in testing a big and small paf. | |
3 # A haplotype paf filtered of denovo repeats might give cleaner dotplots, because they add visual noise and repeats are less informative for sequence similarity. | |
4 # Alternative to use repeatmasked haplotype as the mashmap reference might also be useful. | |
5 # Also outputs a bed with the multimatch regions filtered from the paf with their repeat count as the score | |
6 # ross lazarus october 6 2024 | |
7 | |
8 import argparse | |
9 | |
10 from collections import OrderedDict | |
11 | |
12 | |
13 def pafDeDupe(inPaf, outDeNovoBed, outPaf, minMatch): | |
14 """ | |
15 Usage notes: | |
16 if a segment appears more than once on the left of a paf row, it can be considered a denovo repeat! | |
17 Contig and start offset are used as the id so approximate-ish. | |
18 Seems a consistent 13% 1+ denovo repeats with different levels of identity and match length | |
19 3+ finds a much lower 3% of hits | |
20 | |
21 With a 181217 row squirrelhaps1k.paf at 1k 99%, get about 13% denovo repeat rows | |
22 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py --inpaf squirrelhaps1k.paf --outbed sh1k.bed --outpaf sh1kdedupe.paf --minMatch 1 | |
23 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1kdedupe.paf | |
24 109179 sh1kdedupe.paf | |
25 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1k.bed | |
26 23015 sh1k.bed | |
27 | |
28 venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py --inpaf squirrelhaps1k.paf --outbed sh1k.bed --outpaf sh1kdedupe.paf --minMatch 3 | |
29 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1k.bed | |
30 7996 sh1k.bed | |
31 | |
32 with a smaller 26610 10k 95%, get about | |
33 (venv3row 11) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l squirrelhaps.paf | |
34 26610 squirrelhaps.paf | |
35 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py squirrelhaps.paf sh.bed shdedupe.paf | |
36 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l shdedupe.paf | |
37 18332 shdedupe.paf | |
38 (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh.bed | |
39 3663 sh.bed | |
40 """ | |
41 | |
42 CHROMOSOMECOL1 = 0 | |
43 STARTCOL1 = 2 | |
44 ENDCOL1 = 3 | |
45 hitTable1 = OrderedDict() | |
46 hitTable1_lens = {} | |
47 filterLen = minMatch | |
48 minMatch = int(minMatch) | |
49 filterMe = {} | |
50 | |
51 with open(inPaf) as f: | |
52 for i, line in enumerate(f): | |
53 rowElements = line.split() | |
54 chromosome1 = rowElements[CHROMOSOMECOL1] | |
55 start1 = int(rowElements[STARTCOL1]) | |
56 end1 = int(rowElements[ENDCOL1]) | |
57 h1key = "%s~%d" % (chromosome1, start1) | |
58 if hitTable1.get(h1key, None): | |
59 hitTable1[h1key].append(i) | |
60 if len(hitTable1[h1key]) > minMatch: | |
61 filterMe[i] = i | |
62 else: | |
63 hitTable1[h1key] = [ | |
64 i, | |
65 ] | |
66 hitTable1_lens[h1key] = abs(end1 - start1) | |
67 with open(outDeNovoBed, "w") as f: | |
68 for k in hitTable1.keys(): | |
69 # OrderedDict so input paf order preserved - if that's wrong so is the bed. | |
70 nk = len(hitTable1[k]) | |
71 if nk > minMatch: | |
72 (chr, start) = k.split("~") | |
73 end = int(start) + hitTable1_lens.get(k, 0) | |
74 name= '_'.join([chr,start]) | |
75 row = (chr, start, "%d" % end, name, "%d" % nk) | |
76 f.write("\t".join(row)) | |
77 f.write("\n") | |
78 with open(outPaf, "w") as f: | |
79 f.writelines( | |
80 line for i, line in enumerate(open(inPaf)) if not filterMe.get(i, None) | |
81 ) | |
82 | |
83 | |
84 if __name__ == "__main__": | |
85 VERS = 0.01 | |
86 parser = argparse.ArgumentParser() | |
87 a = parser.add_argument | |
88 a("--inpaf", default=None) | |
89 a("--outbed", default=None) | |
90 a("--outpaf", default=None) | |
91 a("--minmatch", default=1, type=int) | |
92 a('--version', action='version', version='%(prog)s 0.1') | |
93 args = parser.parse_args() | |
94 pafDeDupe(args.inpaf, args.outbed, args.outpaf, args.minmatch) |