annotate chemfp_clustering/old/butina_clustering_old.py @ 1:43a9e7d9b24f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit a44c0a13283e873a740eabcad04f021208290dfe-dirty
author bgruening
date Sun, 01 Nov 2015 10:27:01 -0500 (2015-11-01)
parents 354d3c6bb894
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
2 """
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
3 Modified version of code examples from the chemfp project.
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
4 http://code.google.com/p/chem-fingerprints/
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
6 """
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
7
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
8 import chemfp
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
9 import sys
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
10 import os
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
11 import tempfile
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
12
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
# Command line: <fingerprint file> <Tanimoto threshold> <output file> <threads>
if len(sys.argv) < 5:
    sys.exit("usage: %s FINGERPRINT_FILE TANIMOTO_THRESHOLD OUTFILE PROCESSORS" % sys.argv[0])

# Convert the numeric arguments before touching the filesystem, so that a
# malformed threshold or thread count cannot leave a stray symlink behind.
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])

# Reserve a unique temporary name, then expose the input file under that name
# with an ".fps" suffix (presumably so chemfp detects the format from the
# file extension -- TODO confirm).
temp_file = tempfile.NamedTemporaryFile()
temp_link = "%s.%s" % (temp_file.name, 'fps')
temp_file.close()

# os.symlink bypasses the shell: paths containing spaces or shell
# metacharacters work, and a failure raises OSError instead of being ignored.
os.symlink(os.path.realpath(sys.argv[1]), temp_link)

chemfp_fingerprint_file = temp_link
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
23
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
24
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
def get_hit_indicies(hits):
    """Return the target indices from an iterable of (index, score) pairs.

    The scores are discarded and the order of ``hits`` is preserved.
    """
    # Named "target_idx" rather than "id" so the builtin id() is not shadowed.
    return [target_idx for (target_idx, _score) in hits]
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
27
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
# Open the report file and load the fingerprints through the ".fps" link.
out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

# Search the dataset against itself, keeping hits at or above the threshold.
chemfp.set_num_threads(processors)
search = dataset.threshold_tanimoto_search_arena(
    dataset,
    threshold=tanimoto_threshold,
)
# Alternative kept for reference:
# search = chemfp.search.threshold_tanimoto_search_symmetric(dataset, threshold=tanimoto_threshold)

# Rank the candidate centroids: one (hit count, fingerprint index, hits)
# tuple per fingerprint, sorted in descending order so that the fingerprint
# with the most hits comes first. Ties on hit count are broken by the
# (arbitrary) fingerprint index.
results = list(
    (len(hits), i, hits)
    for (i, hits) in enumerate(search.iter_indices_and_scores())
)
results.sort(reverse=True)
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
39
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
40
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
41 # Determine the true/false singletons and the clusters
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
# true_singletons:  centroids whose only hit is themselves.
# false_singletons: centroids whose hits were all claimed by earlier clusters.
# clusters:         (centroid index, [newly assigned member indices]) pairs.
true_singletons = []
false_singletons = []
clusters = []

# Indices that are already a centroid or a member of some cluster.
seen = set()

for (size, fp_idx, hits) in results:
    if fp_idx in seen:
        # Can't use a centroid which is already assigned
        continue
    seen.add(fp_idx)

    # Progress/debug line on stdout. Written as a single formatted string so
    # it emits the same text under Python 2 and is valid syntax under Python 3
    # (the former "print size, fp_idx, hits" statement is Python-2-only).
    print("%s %s %s" % (size, fp_idx, hits))

    if size == 1:
        # The only fingerprint in the exclusion sphere is itself
        true_singletons.append(fp_idx)
        continue

    members = get_hit_indicies(hits)
    # Keep only the hits that no earlier cluster has claimed. The centroid
    # was added to "seen" above, so it is never listed among its own members.
    unassigned = [target_idx for target_idx in members if target_idx not in seen]

    if not unassigned:
        false_singletons.append(fp_idx)
        continue

    # this is a new cluster
    clusters.append((fp_idx, unassigned))
    seen.update(unassigned)
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
70
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
len_cluster = len(clusters)

# Three "#"-prefixed summary lines precede the per-cluster rows.
# (An earlier variant also listed the singleton ids on the first two lines.)
out.writelines((
    "#%s true singletons\n" % len(true_singletons),
    "#%s false singletons\n" % len(false_singletons),
    "#clusters: %s\n" % len_cluster,
))
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
78
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
79 # Sort so the cluster with the most compounds comes first,
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
80 # then by alphabetically smallest id
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
def cluster_sort_key(cluster):
    """Sort key for a (centroid index, members) pair.

    Returns (negated member count, centroid id), so an ascending sort puts
    the cluster with the most members first and orders equally sized
    clusters by the centroid's id taken from the module-level ``dataset``.
    """
    (center, assigned) = cluster
    neg_size = -len(assigned)
    return neg_size, dataset.ids[center]
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
84
354d3c6bb894 Uploaded
bgruening
parents:
diff changeset
# Write the report body. The cleanup runs in "finally" so the output file is
# closed and the temporary ".fps" symlink is removed even if sorting or
# writing raises.
try:
    clusters.sort(key=cluster_sort_key)

    # One row per cluster: centroid id, member count, space-separated member ids.
    for centroid_idx, members in clusters:
        centroid_name = dataset.ids[centroid_idx]
        member_names = sorted(dataset.ids[idx] for idx in members)
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(member_names)))
        # TODO: len(members) needs to be restricted to the biggest top 90% or
        # something similar.

    # True singletons are listed with a member count of 0.
    # NOTE(review): false singletons are counted in the header but not listed
    # here -- confirm this is intended.
    for idx in sorted(true_singletons):
        out.write("%s\t%s\n" % (dataset.ids[idx], 0))
finally:
    try:
        out.close()
    finally:
        os.remove(temp_link)