Mercurial > repos > bgruening > chemfp
comparison butina_clustering.py @ 12:3b14765c22ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
author | bgruening |
---|---|
date | Sat, 25 Sep 2021 19:07:44 +0000 |
parents | 70b071de9bee |
children |
comparison
equal
deleted
inserted
replaced
11:92c7cdc243e8 | 12:3b14765c22ee |
---|---|
3 Modified version of code examples from the chemfp project. | 3 Modified version of code examples from the chemfp project. |
4 http://code.google.com/p/chem-fingerprints/ | 4 http://code.google.com/p/chem-fingerprints/ |
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! |
6 """ | 6 """ |
7 | 7 |
8 import argparse | |
9 import os | |
10 import subprocess | |
11 import sys | |
12 import tempfile | |
13 | |
8 import chemfp | 14 import chemfp |
9 import sys | |
10 import os | |
11 import tempfile | |
12 import argparse | |
13 import subprocess | |
14 from chemfp import search | 15 from chemfp import search |
16 | |
15 | 17 |
16 def unix_sort(results): | 18 def unix_sort(results): |
17 temp_unsorted = tempfile.NamedTemporaryFile(delete=False) | 19 temp_unsorted = tempfile.NamedTemporaryFile(delete=False) |
18 for (i,indices) in enumerate( results.iter_indices() ): | 20 for (i, indices) in enumerate(results.iter_indices()): |
19 temp_unsorted.write('%s %s\n' % (len(indices), i)) | 21 temp_unsorted.write("%s %s\n" % (len(indices), i)) |
20 temp_unsorted.close() | 22 temp_unsorted.close() |
21 temp_sorted = tempfile.NamedTemporaryFile(delete=False) | 23 temp_sorted = tempfile.NamedTemporaryFile(delete=False) |
22 temp_sorted.close() | 24 temp_sorted.close() |
23 p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+')) | 25 p = subprocess.Popen( |
26 ["sort", "-n", "-r", "-k", "1,1"], | |
27 stdin=open(temp_unsorted.name), | |
28 stdout=open(temp_sorted.name, "w+"), | |
29 ) | |
24 stdout, stderr = p.communicate() | 30 stdout, stderr = p.communicate() |
25 return_code = p.returncode | 31 return_code = p.returncode |
26 | 32 |
27 if return_code: | 33 if return_code: |
28 sys.stdout.write(stdout) | 34 sys.stdout.write(stdout) |
35 size, fp_idx = line.strip().split() | 41 size, fp_idx = line.strip().split() |
36 yield (int(size), int(fp_idx)) | 42 yield (int(size), int(fp_idx)) |
37 | 43 |
38 os.remove(temp_sorted.name) | 44 os.remove(temp_sorted.name) |
39 | 45 |
40 def butina( args ): | 46 |
47 def butina(args): | |
41 """ | 48 """ |
42 Taylor-Butina clustering from the chemfp help. | 49 Taylor-Butina clustering from the chemfp help. |
43 """ | 50 """ |
44 out = args.output_path | 51 out = args.output_path |
45 targets = chemfp.open( args.input_path, format='fps' ) | 52 targets = chemfp.open(args.input_path, format="fps") |
46 arena = chemfp.load_fingerprints( targets ) | 53 arena = chemfp.load_fingerprints(targets) |
47 | 54 |
48 chemfp.set_num_threads( args.processors ) | 55 chemfp.set_num_threads(args.processors) |
49 results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold) | 56 results = search.threshold_tanimoto_search_symmetric( |
57 arena, threshold=args.tanimoto_threshold | |
58 ) | |
50 results.reorder_all("move-closest-first") | 59 results.reorder_all("move-closest-first") |
51 | 60 |
52 sorted_ids = unix_sort(results) | 61 sorted_ids = unix_sort(results) |
53 | 62 |
54 # Determine the true/false singletons and the clusters | 63 # Determine the true/false singletons and the clusters |
55 true_singletons = [] | 64 true_singletons = [] |
56 false_singletons = [] | 65 false_singletons = [] |
57 clusters = [] | 66 clusters = [] |
58 | 67 |
59 seen = set() | 68 seen = set() |
60 #for (size, fp_idx, members) in results: | 69 # for (size, fp_idx, members) in results: |
61 for (size, fp_idx) in sorted_ids: | 70 for (size, fp_idx) in sorted_ids: |
62 members = results[fp_idx].get_indices() | 71 members = results[fp_idx].get_indices() |
63 #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members] | 72 # print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members] |
64 if fp_idx in seen: | 73 if fp_idx in seen: |
65 # Can't use a centroid which is already assigned | 74 # Can't use a centroid which is already assigned |
66 continue | 75 continue |
67 seen.add(fp_idx) | 76 seen.add(fp_idx) |
68 | 77 |
69 if size == 0: | 78 if size == 0: |
70 # The only fingerprint in the exclusion sphere is itself | 79 # The only fingerprint in the exclusion sphere is itself |
71 true_singletons.append( fp_idx ) | 80 true_singletons.append(fp_idx) |
72 continue | 81 continue |
73 | 82 |
74 # Figure out which ones haven't yet been assigned | 83 # Figure out which ones haven't yet been assigned |
75 unassigned = set(members) - seen | 84 unassigned = set(members) - seen |
76 | 85 |
77 if not unassigned: | 86 if not unassigned: |
78 false_singletons.append(fp_idx) | 87 false_singletons.append(fp_idx) |
79 continue | 88 continue |
80 | 89 |
81 # this is a new cluster | 90 # this is a new cluster |
82 clusters.append( (fp_idx, unassigned) ) | 91 clusters.append((fp_idx, unassigned)) |
83 seen.update(unassigned) | 92 seen.update(unassigned) |
84 | 93 |
85 len_cluster = len(clusters) | 94 len_cluster = len(clusters) |
86 #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) ) | 95 # out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) ) |
87 #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) ) | 96 # out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) ) |
88 | 97 |
89 out.write( "#%s true singletons\n" % len(true_singletons) ) | 98 out.write("#%s true singletons\n" % len(true_singletons)) |
90 out.write( "#%s false singletons\n" % len(false_singletons) ) | 99 out.write("#%s false singletons\n" % len(false_singletons)) |
91 out.write( "#clusters: %s\n" % len_cluster ) | 100 out.write("#clusters: %s\n" % len_cluster) |
92 | 101 |
93 # Sort so the cluster with the most compounds comes first, | 102 # Sort so the cluster with the most compounds comes first, |
94 # then by alphabetically smallest id | 103 # then by alphabetically smallest id |
95 def cluster_sort_key(cluster): | 104 def cluster_sort_key(cluster): |
96 centroid_idx, members = cluster | 105 centroid_idx, members = cluster |
98 | 107 |
99 clusters.sort(key=cluster_sort_key) | 108 clusters.sort(key=cluster_sort_key) |
100 | 109 |
101 for centroid_idx, members in clusters: | 110 for centroid_idx, members in clusters: |
102 centroid_name = arena.ids[centroid_idx] | 111 centroid_name = arena.ids[centroid_idx] |
103 out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members))) | 112 out.write( |
104 #ToDo: len(members) need to be some biggest top 90% or something ... | 113 "%s\t%s\t%s\n" |
114 % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members)) | |
115 ) | |
116 # ToDo: len(members) need to be some biggest top 90% or something ... | |
105 | 117 |
106 for idx in true_singletons: | 118 for idx in true_singletons: |
107 out.write("%s\t%s\n" % (arena.ids[idx], 0)) | 119 out.write("%s\t%s\n" % (arena.ids[idx], 0)) |
108 | 120 |
109 out.close() | 121 out.close() |
110 | 122 |
111 | 123 |
112 if __name__ == "__main__": | 124 if __name__ == "__main__": |
113 parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files. | 125 parser = argparse.ArgumentParser( |
126 description="""Taylor-Butina clustering for fps files. | |
114 For more details please see the original publication or the chemfp documentation: | 127 For more details please see the original publication or the chemfp documentation: |
115 http://www.chemomine.co.uk/dbclus-paper.pdf | 128 http://www.chemomine.co.uk/dbclus-paper.pdf |
116 https://chemfp.readthedocs.org | 129 https://chemfp.readthedocs.org |
117 """) | 130 """ |
131 ) | |
118 | 132 |
119 parser.add_argument("-i", "--input", dest="input_path", | 133 parser.add_argument( |
120 required=True, | 134 "-i", |
121 help="Path to the input file.") | 135 "--input", |
136 dest="input_path", | |
137 required=True, | |
138 help="Path to the input file.", | |
139 ) | |
122 | 140 |
123 parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'), | 141 parser.add_argument( |
124 default=sys.stdout, | 142 "-o", |
125 help="Path to the output file.") | 143 "--output", |
144 dest="output_path", | |
145 type=argparse.FileType("w"), | |
146 default=sys.stdout, | |
147 help="Path to the output file.", | |
148 ) | |
126 | 149 |
127 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float, | 150 parser.add_argument( |
128 default=0.8, | 151 "-t", |
129 help="Tanimoto threshold [0.8]") | 152 "--threshold", |
153 dest="tanimoto_threshold", | |
154 type=float, | |
155 default=0.8, | |
156 help="Tanimoto threshold [0.8]", | |
157 ) | |
130 | 158 |
131 parser.add_argument('-p', '--processors', type=int, default=4) | 159 parser.add_argument("-p", "--processors", type=int, default=4) |
132 | 160 |
133 options = parser.parse_args() | 161 options = parser.parse_args() |
134 butina( options ) | 162 butina(options) |