diff cherry_pick_fasta.py @ 7:6c0aefd9fee3 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit 849d6d2087dadb81f1b790e3bcb5bda40c3c83af
author artbio
date Thu, 29 Dec 2022 11:53:05 +0000
parents d8fa616a228a
children
line wrap: on
line diff
--- a/cherry_pick_fasta.py	Fri Apr 08 16:56:42 2022 +0000
+++ b/cherry_pick_fasta.py	Thu Dec 29 11:53:05 2022 +0000
@@ -1,4 +1,5 @@
 import argparse
+from collections import defaultdict
 
 
 def Parser():
@@ -28,23 +29,34 @@
 
 
 def parse_fasta_dict(query, fasta_dict, mode):
+    """Return the records of fasta_dict whose header matches query.
+
+    query is a keyword or a list of keywords; a single keyword is wrapped
+    in a list. With mode 'includes' a header is kept when it contains at
+    least one keyword as a substring; with mode 'exact' it is kept when it
+    equals a keyword. Any other mode keeps nothing. The result is a new
+    dict that preserves the iteration order of fasta_dict.
+    """
     if not isinstance(query, list):
         query = [query]
-    accumulator = []
+    accumulator = set()
     if mode == 'includes':
-        for seq_id in fasta_dict:
-            for string in query:
-                if string in seq_id:
-                    accumulator.append(seq_id)
-                    continue
+        # Index each header under all of its substrings of every keyword
+        # length, so that a keyword is resolved by one dictionary lookup
+        # instead of a scan over all headers.
+        index = defaultdict(set)
+        for size in {len(word) for word in query}:
+            for header in fasta_dict:
+                # the range is empty when the header is shorter than size
+                for i in range(len(header) - size + 1):
+                    index[header[i:i + size]].add(header)
+        for keyword in query:
+            # .get() does not insert empty entries for absent keywords
+            accumulator.update(index.get(keyword, ()))
     elif mode == 'exact':
-        for seq_id in fasta_dict:
-            for string in query:
-                if string == seq_id:
-                    accumulator.append(seq_id)
-                    continue
+        accumulator.update(word for word in query if word in fasta_dict)
     res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
     return res_dict
 
 
 def complement_fasta_dict(fasta_dict, subfasta_dict):