Mercurial > repos > iuc > clustering_from_distmat
comparison clustering_from_distmat.py @ 0:8192b416f945 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit a34052b87a2d05cabed5001c50f1bb10e74f97ee
author | iuc |
---|---|
date | Thu, 08 Aug 2024 19:34:36 +0000 |
parents | |
children | c0b01c55a0e0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8192b416f945 |
---|---|
1 import argparse | |
2 import sys | |
3 | |
4 import scipy | |
5 | |
6 | |
7 def linkage_as_newick(linkage, tip_names): | |
8 newick_parts = tip_names[::] | |
9 within_cluster_distances = [0] * len(tip_names) | |
10 for step in linkage: | |
11 n1 = int(step[0]) | |
12 n2 = int(step[1]) | |
13 d = float(step[2]) | |
14 d1 = d - within_cluster_distances[n1] | |
15 d2 = d - within_cluster_distances[n2] | |
16 id1 = newick_parts[n1] | |
17 id2 = newick_parts[n2] | |
18 part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})' | |
19 within_cluster_distances.append(d) | |
20 newick_parts.append(part) | |
21 return newick_parts[-1].format(*newick_parts) + ';' | |
22 | |
23 | |
24 if __name__ == "__main__": | |
25 parser = argparse.ArgumentParser() | |
26 parser.add_argument( | |
27 'infile', | |
28 help='Distance matrix input file' | |
29 ) | |
30 parser.add_argument( | |
31 'out_prefix', | |
32 help="Output prefix" | |
33 ) | |
34 parser.add_argument | |
35 parser.add_argument( | |
36 '-m', '--method', default="average", | |
37 choices=[ | |
38 "single", | |
39 "complete", | |
40 "average", | |
41 "weighted", | |
42 "centroid", | |
43 "median", | |
44 "ward" | |
45 ], | |
46 help="Clustering method to use" | |
47 ) | |
48 cut_mode = parser.add_mutually_exclusive_group() | |
49 cut_mode.add_argument( | |
50 "-n", "--n-clusters", nargs="*", type=int | |
51 ) | |
52 cut_mode.add_argument( | |
53 "--height", nargs="*", type=float | |
54 ) | |
55 args = parser.parse_args() | |
56 | |
57 # TO DO: | |
58 # - parse outputs to generate | |
59 | |
60 # read from input and check that | |
61 # we have been passed a symmetric distance matrix | |
62 with open(args.infile) as i: | |
63 col_names = next(i).rstrip("\n\r").split("\t")[1:] | |
64 col_count = len(col_names) | |
65 if not col_count: | |
66 sys.exit( | |
67 'No data columns found. ' | |
68 'This tool expects tabular input with column names on the first line ' | |
69 'and a row name in the first column of each row followed by data columns.' | |
70 ) | |
71 row_count = 0 | |
72 matrix = [] | |
73 for line in i: | |
74 if not line.strip(): | |
75 # skip empty lines | |
76 continue | |
77 row_count += 1 | |
78 if row_count > col_count: | |
79 sys.exit( | |
80 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | |
81 'but got more rows than columns.' | |
82 ) | |
83 row_name, *row_data = line.strip(" \n\r").split("\t") | |
84 col_name = col_names[row_count - 1] | |
85 if not row_name: | |
86 # tolerate omitted row names, use col name instead | |
87 row_name = col_name | |
88 if row_name != col_name: | |
89 sys.exit( | |
90 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' | |
91 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' | |
92 ) | |
93 if len(row_data) != col_count: | |
94 sys.exit( | |
95 'This tool expects a symmetric distance matrix with the same number of columns on each row, ' | |
96 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' | |
97 ) | |
98 try: | |
99 matrix.append([float(x) for x in row_data]) | |
100 except ValueError as e: | |
101 sys.exit(str(e) + f' on row {row_count} ("{row_name}")') | |
102 if row_count < col_count: | |
103 sys.exit( | |
104 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | |
105 'but got more columns than rows.' | |
106 ) | |
107 | |
108 # turn the distance matrix into "condensed" vector form | |
109 # this gives us further checks and raises ValueErrors if: | |
110 # - the values on the diagonal aren't zero | |
111 # - the upper and lower triangle of the matrix aren't identical | |
112 D = scipy.spatial.distance.squareform(matrix) | |
113 | |
114 # perform the requested clustering and retrieve the result as a linkage object | |
115 linkage = scipy.cluster.hierarchy.linkage(D, args.method) | |
116 | |
117 with open(args.out_prefix + '.tree.newick', 'w') as o: | |
118 o.write(linkage_as_newick(linkage, col_names)) | |
119 | |
120 # cut the tree as specified and report sample to cluster assignments | |
121 if args.n_clusters or args.height: | |
122 if args.n_clusters: | |
123 cut_values = args.n_clusters | |
124 colname_template = "cluster_id_n{}" | |
125 else: | |
126 cut_values = args.height | |
127 colname_template = "cluster_id_h{}" | |
128 header_cols = ["sample"] + [ | |
129 colname_template.format(x) for x in cut_values | |
130 ] | |
131 cluster_assignments = [] | |
132 for name, cluster_ids in zip( | |
133 col_names, | |
134 scipy.cluster.hierarchy.cut_tree( | |
135 linkage, | |
136 args.n_clusters, | |
137 args.height | |
138 ) | |
139 ): | |
140 cluster_assignments.append( | |
141 [name] | |
142 + [str(c + 1) for c in cluster_ids] | |
143 ) | |
144 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: | |
145 print("\t".join(header_cols), file=o) | |
146 for ass in cluster_assignments: | |
147 print("\t".join(ass), file=o) |