comparison butina_clustering.py @ 12:3b14765c22ee draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
author bgruening
date Sat, 25 Sep 2021 19:07:44 +0000
parents 70b071de9bee
children
comparison
equal deleted inserted replaced
11:92c7cdc243e8 12:3b14765c22ee
3 Modified version of code examples from the chemfp project. 3 Modified version of code examples from the chemfp project.
4 http://code.google.com/p/chem-fingerprints/ 4 http://code.google.com/p/chem-fingerprints/
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! 5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
6 """ 6 """
7 7
8 import argparse
9 import os
10 import subprocess
11 import sys
12 import tempfile
13
8 import chemfp 14 import chemfp
9 import sys
10 import os
11 import tempfile
12 import argparse
13 import subprocess
14 from chemfp import search 15 from chemfp import search
16
15 17
16 def unix_sort(results): 18 def unix_sort(results):
17 temp_unsorted = tempfile.NamedTemporaryFile(delete=False) 19 temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
18 for (i,indices) in enumerate( results.iter_indices() ): 20 for (i, indices) in enumerate(results.iter_indices()):
19 temp_unsorted.write('%s %s\n' % (len(indices), i)) 21 temp_unsorted.write("%s %s\n" % (len(indices), i))
20 temp_unsorted.close() 22 temp_unsorted.close()
21 temp_sorted = tempfile.NamedTemporaryFile(delete=False) 23 temp_sorted = tempfile.NamedTemporaryFile(delete=False)
22 temp_sorted.close() 24 temp_sorted.close()
23 p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+')) 25 p = subprocess.Popen(
26 ["sort", "-n", "-r", "-k", "1,1"],
27 stdin=open(temp_unsorted.name),
28 stdout=open(temp_sorted.name, "w+"),
29 )
24 stdout, stderr = p.communicate() 30 stdout, stderr = p.communicate()
25 return_code = p.returncode 31 return_code = p.returncode
26 32
27 if return_code: 33 if return_code:
28 sys.stdout.write(stdout) 34 sys.stdout.write(stdout)
35 size, fp_idx = line.strip().split() 41 size, fp_idx = line.strip().split()
36 yield (int(size), int(fp_idx)) 42 yield (int(size), int(fp_idx))
37 43
38 os.remove(temp_sorted.name) 44 os.remove(temp_sorted.name)
39 45
40 def butina( args ): 46
47 def butina(args):
41 """ 48 """
42 Taylor-Butina clustering from the chemfp help. 49 Taylor-Butina clustering from the chemfp help.
43 """ 50 """
44 out = args.output_path 51 out = args.output_path
45 targets = chemfp.open( args.input_path, format='fps' ) 52 targets = chemfp.open(args.input_path, format="fps")
46 arena = chemfp.load_fingerprints( targets ) 53 arena = chemfp.load_fingerprints(targets)
47 54
48 chemfp.set_num_threads( args.processors ) 55 chemfp.set_num_threads(args.processors)
49 results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold) 56 results = search.threshold_tanimoto_search_symmetric(
57 arena, threshold=args.tanimoto_threshold
58 )
50 results.reorder_all("move-closest-first") 59 results.reorder_all("move-closest-first")
51 60
52 sorted_ids = unix_sort(results) 61 sorted_ids = unix_sort(results)
53 62
54 # Determine the true/false singletons and the clusters 63 # Determine the true/false singletons and the clusters
55 true_singletons = [] 64 true_singletons = []
56 false_singletons = [] 65 false_singletons = []
57 clusters = [] 66 clusters = []
58 67
59 seen = set() 68 seen = set()
60 #for (size, fp_idx, members) in results: 69 # for (size, fp_idx, members) in results:
61 for (size, fp_idx) in sorted_ids: 70 for (size, fp_idx) in sorted_ids:
62 members = results[fp_idx].get_indices() 71 members = results[fp_idx].get_indices()
63 #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members] 72 # print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
64 if fp_idx in seen: 73 if fp_idx in seen:
65 # Can't use a centroid which is already assigned 74 # Can't use a centroid which is already assigned
66 continue 75 continue
67 seen.add(fp_idx) 76 seen.add(fp_idx)
68 77
69 if size == 0: 78 if size == 0:
70 # The only fingerprint in the exclusion sphere is itself 79 # The only fingerprint in the exclusion sphere is itself
71 true_singletons.append( fp_idx ) 80 true_singletons.append(fp_idx)
72 continue 81 continue
73 82
74 # Figure out which ones haven't yet been assigned 83 # Figure out which ones haven't yet been assigned
75 unassigned = set(members) - seen 84 unassigned = set(members) - seen
76 85
77 if not unassigned: 86 if not unassigned:
78 false_singletons.append(fp_idx) 87 false_singletons.append(fp_idx)
79 continue 88 continue
80 89
81 # this is a new cluster 90 # this is a new cluster
82 clusters.append( (fp_idx, unassigned) ) 91 clusters.append((fp_idx, unassigned))
83 seen.update(unassigned) 92 seen.update(unassigned)
84 93
85 len_cluster = len(clusters) 94 len_cluster = len(clusters)
86 #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) ) 95 # out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
87 #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) ) 96 # out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )
88 97
89 out.write( "#%s true singletons\n" % len(true_singletons) ) 98 out.write("#%s true singletons\n" % len(true_singletons))
90 out.write( "#%s false singletons\n" % len(false_singletons) ) 99 out.write("#%s false singletons\n" % len(false_singletons))
91 out.write( "#clusters: %s\n" % len_cluster ) 100 out.write("#clusters: %s\n" % len_cluster)
92 101
93 # Sort so the cluster with the most compounds comes first, 102 # Sort so the cluster with the most compounds comes first,
94 # then by alphabetically smallest id 103 # then by alphabetically smallest id
95 def cluster_sort_key(cluster): 104 def cluster_sort_key(cluster):
96 centroid_idx, members = cluster 105 centroid_idx, members = cluster
98 107
99 clusters.sort(key=cluster_sort_key) 108 clusters.sort(key=cluster_sort_key)
100 109
101 for centroid_idx, members in clusters: 110 for centroid_idx, members in clusters:
102 centroid_name = arena.ids[centroid_idx] 111 centroid_name = arena.ids[centroid_idx]
103 out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members))) 112 out.write(
104 #ToDo: len(members) need to be some biggest top 90% or something ... 113 "%s\t%s\t%s\n"
114 % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members))
115 )
116 # ToDo: len(members) need to be some biggest top 90% or something ...
105 117
106 for idx in true_singletons: 118 for idx in true_singletons:
107 out.write("%s\t%s\n" % (arena.ids[idx], 0)) 119 out.write("%s\t%s\n" % (arena.ids[idx], 0))
108 120
109 out.close() 121 out.close()
110 122
111 123
112 if __name__ == "__main__": 124 if __name__ == "__main__":
113 parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files. 125 parser = argparse.ArgumentParser(
126 description="""Taylor-Butina clustering for fps files.
114 For more details please see the original publication or the chemfp documentation: 127 For more details please see the original publication or the chemfp documentation:
115 http://www.chemomine.co.uk/dbclus-paper.pdf 128 http://www.chemomine.co.uk/dbclus-paper.pdf
116 https://chemfp.readthedocs.org 129 https://chemfp.readthedocs.org
117 """) 130 """
131 )
118 132
119 parser.add_argument("-i", "--input", dest="input_path", 133 parser.add_argument(
120 required=True, 134 "-i",
121 help="Path to the input file.") 135 "--input",
136 dest="input_path",
137 required=True,
138 help="Path to the input file.",
139 )
122 140
123 parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'), 141 parser.add_argument(
124 default=sys.stdout, 142 "-o",
125 help="Path to the output file.") 143 "--output",
144 dest="output_path",
145 type=argparse.FileType("w"),
146 default=sys.stdout,
147 help="Path to the output file.",
148 )
126 149
127 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float, 150 parser.add_argument(
128 default=0.8, 151 "-t",
129 help="Tanimoto threshold [0.8]") 152 "--threshold",
153 dest="tanimoto_threshold",
154 type=float,
155 default=0.8,
156 help="Tanimoto threshold [0.8]",
157 )
130 158
131 parser.add_argument('-p', '--processors', type=int, default=4) 159 parser.add_argument("-p", "--processors", type=int, default=4)
132 160
133 options = parser.parse_args() 161 options = parser.parse_args()
134 butina( options ) 162 butina(options)