diff filter_multihit_paf.py @ 131:69c6ea16c148 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 3cf9ec268c0719caf060b7d6bf5c0159909c348a
author fubar
date Sat, 12 Oct 2024 23:11:22 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_multihit_paf.py	Sat Oct 12 23:11:22 2024 +0000
@@ -0,0 +1,94 @@
+# idea from https://github.com/marbl/MashMap/blob/master/scripts/denovo_repeat_annotation.py
+# adds filter for more than minMatch denovo repeated paf read locations: >1 gives ~13%, >3 gives 3% of all hits in testing a big and small paf.
+# A haplotype paf filtered of denovo repeats might give cleaner dotplots, because they add visual noise and repeats are less informative for sequence similarity.
+# Alternative to use repeatmasked haplotype as the mashmap reference might also be useful.
+# Also outputs a bed with the multimatch regions filtered from the paf with their repeat count as the score
+# ross lazarus october 6 2024
+
+import argparse
+
+from collections import OrderedDict
+
+
+def pafDeDupe(inPaf, outDeNovoBed, outPaf, minMatch):
+    """
+    Usage notes:
+    if a segment appears more than once on the left of a paf row, it can be considered a denovo repeat!
+    Contig and start offset are used as the id so approximate-ish.
+    Seems a consistent 13% 1+ denovo repeats with different levels of identity and match length
+    3+ finds a much lower 3% of hits
+
+    With a 181217 row squirrelhaps1k.paf at 1k 99%, get about 13% denovo repeat rows
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py --inpaf squirrelhaps1k.paf --outbed sh1k.bed --outpaf sh1kdedupe.paf --minMatch 1
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1kdedupe.paf
+    109179 sh1kdedupe.paf
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1k.bed
+    23015 sh1k.bed
+
+    venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py --inpaf squirrelhaps1k.paf --outbed sh1k.bed --outpaf sh1kdedupe.paf --minMatch 3
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh1k.bed
+    7996 sh1k.bed
+
+    with a smaller 26610 10k 95%, get about
+    (venv3row 11) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l squirrelhaps.paf
+    26610 squirrelhaps.paf
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ python filter_multihit_paf.py squirrelhaps.paf sh.bed shdedupe.paf
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l shdedupe.paf
+    18332 shdedupe.paf
+    (venv311) ross@pn50:~/rossgit/galaxytools/tools/jbrowse2$ wc -l sh.bed
+    3663 sh.bed
+    """
+
+    CHROMOSOMECOL1 = 0
+    STARTCOL1 = 2
+    ENDCOL1 = 3
+    hitTable1 = OrderedDict()
+    hitTable1_lens = {}
+    filterLen = minMatch
+    minMatch = int(minMatch)
+    filterMe = {}
+
+    with open(inPaf) as f:
+        for i, line in enumerate(f):
+            rowElements = line.split()
+            chromosome1 = rowElements[CHROMOSOMECOL1]
+            start1 = int(rowElements[STARTCOL1])
+            end1 = int(rowElements[ENDCOL1])
+            h1key = "%s~%d" % (chromosome1, start1)
+            if hitTable1.get(h1key, None):
+                hitTable1[h1key].append(i)
+                if len(hitTable1[h1key]) > minMatch:
+                    filterMe[i] = i
+            else:
+                hitTable1[h1key] = [
+                    i,
+                ]
+                hitTable1_lens[h1key] = abs(end1 - start1)
+    with open(outDeNovoBed, "w") as f:
+        for k in hitTable1.keys():
+            # OrderedDict so input paf order preserved - if that's wrong so is the bed.
+            nk = len(hitTable1[k])
+            if nk > minMatch:
+                (chr, start) = k.split("~")
+                end = int(start) + hitTable1_lens.get(k, 0)
+                name= '_'.join([chr,start])
+                row = (chr, start, "%d" % end, name, "%d" % nk)
+                f.write("\t".join(row))
+                f.write("\n")
+    with open(outPaf, "w") as f:
+        f.writelines(
+        line for i, line in enumerate(open(inPaf)) if not filterMe.get(i, None)
+    )
+
+
+if __name__ == "__main__":
+    VERS = 0.01
+    parser = argparse.ArgumentParser()
+    a = parser.add_argument
+    a("--inpaf", default=None)
+    a("--outbed", default=None)
+    a("--outpaf", default=None)
+    a("--minmatch", default=1, type=int)
+    a('--version', action='version', version='%(prog)s 0.1')
+    args = parser.parse_args()
+    pafDeDupe(args.inpaf, args.outbed, args.outpaf, args.minmatch)