comparison cherry_pick_fasta.py @ 7:6c0aefd9fee3 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit 849d6d2087dadb81f1b790e3bcb5bda40c3c83af
author artbio
date Thu, 29 Dec 2022 11:53:05 +0000
parents d8fa616a228a
children
comparison
legend: equal | deleted | inserted | replaced
left column: 6:d8fa616a228a (parent) | right column: 7:6c0aefd9fee3 (this changeset)
1 import argparse 1 import argparse
2 from collections import defaultdict
2 3
3 4
4 def Parser(): 5 def Parser():
5 the_parser = argparse.ArgumentParser( 6 the_parser = argparse.ArgumentParser(
6 description='Cherry pick fasta sequences') 7 description='Cherry pick fasta sequences')
26 args = the_parser.parse_args() 27 args = the_parser.parse_args()
27 return args 28 return args
28 29
29 30
30 def parse_fasta_dict(query, fasta_dict, mode): 31 def parse_fasta_dict(query, fasta_dict, mode):
32
31 if not isinstance(query, list): 33 if not isinstance(query, list):
32 query = [query] 34 query = [query]
35
36 def kmers(string, ksize, index):
37 if ksize > len(string):
38 return
39 for i in range(len(string) - ksize + 1):
40 kmer = string[i:i+ksize]
41 index[kmer].append(string)
42
43 def consult_index(word, index):
44 accumulator = []
45 print(len(index[word]))
46 for title in index[word]:
47 accumulator.append(title)
48 print(len(accumulator))
49 for title in set(accumulator):
50 print(title)
51
33 accumulator = [] 52 accumulator = []
34 if mode == 'includes': 53 if mode == 'includes':
35 for seq_id in fasta_dict: 54 kmersizes = set([len(word) for word in query])
36 for string in query: 55 index = defaultdict(list)
37 if string in seq_id: 56 for size in kmersizes:
38 accumulator.append(seq_id) 57 for header in fasta_dict:
39 continue 58 kmers(header, size, index)
59 for keyword in query:
60 for header in index[keyword]:
61 accumulator.append(header)
62 accumulator = set(accumulator)
63 res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
64 return res_dict
40 elif mode == 'exact': 65 elif mode == 'exact':
41 for seq_id in fasta_dict: 66 for keyword in query:
42 for string in query: 67 try:
43 if string == seq_id: 68 len(fasta_dict[keyword])
44 accumulator.append(seq_id) 69 accumulator.append(keyword)
45 continue 70 except KeyError:
46 res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator} 71 pass
47 return res_dict 72 accumulator = set(accumulator)
73 res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
74 return res_dict
48 75
49 76
50 def complement_fasta_dict(fasta_dict, subfasta_dict): 77 def complement_fasta_dict(fasta_dict, subfasta_dict):
51 fasta_ids = list(fasta_dict.keys()) 78 fasta_ids = list(fasta_dict.keys())
52 subfasta_ids = list(subfasta_dict.keys()) 79 subfasta_ids = list(subfasta_dict.keys())