comparison clustering_from_distmat.py @ 0:8192b416f945 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit a34052b87a2d05cabed5001c50f1bb10e74f97ee
author iuc
date Thu, 08 Aug 2024 19:34:36 +0000
parents
children c0b01c55a0e0
comparison
equal deleted inserted replaced
-1:000000000000 0:8192b416f945
1 import argparse
2 import sys
3
4 import scipy
5
6
7 def linkage_as_newick(linkage, tip_names):
8 newick_parts = tip_names[::]
9 within_cluster_distances = [0] * len(tip_names)
10 for step in linkage:
11 n1 = int(step[0])
12 n2 = int(step[1])
13 d = float(step[2])
14 d1 = d - within_cluster_distances[n1]
15 d2 = d - within_cluster_distances[n2]
16 id1 = newick_parts[n1]
17 id2 = newick_parts[n2]
18 part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})'
19 within_cluster_distances.append(d)
20 newick_parts.append(part)
21 return newick_parts[-1].format(*newick_parts) + ';'
22
23
24 if __name__ == "__main__":
25 parser = argparse.ArgumentParser()
26 parser.add_argument(
27 'infile',
28 help='Distance matrix input file'
29 )
30 parser.add_argument(
31 'out_prefix',
32 help="Output prefix"
33 )
34 parser.add_argument
35 parser.add_argument(
36 '-m', '--method', default="average",
37 choices=[
38 "single",
39 "complete",
40 "average",
41 "weighted",
42 "centroid",
43 "median",
44 "ward"
45 ],
46 help="Clustering method to use"
47 )
48 cut_mode = parser.add_mutually_exclusive_group()
49 cut_mode.add_argument(
50 "-n", "--n-clusters", nargs="*", type=int
51 )
52 cut_mode.add_argument(
53 "--height", nargs="*", type=float
54 )
55 args = parser.parse_args()
56
57 # TO DO:
58 # - parse outputs to generate
59
60 # read from input and check that
61 # we have been passed a symmetric distance matrix
62 with open(args.infile) as i:
63 col_names = next(i).rstrip("\n\r").split("\t")[1:]
64 col_count = len(col_names)
65 if not col_count:
66 sys.exit(
67 'No data columns found. '
68 'This tool expects tabular input with column names on the first line '
69 'and a row name in the first column of each row followed by data columns.'
70 )
71 row_count = 0
72 matrix = []
73 for line in i:
74 if not line.strip():
75 # skip empty lines
76 continue
77 row_count += 1
78 if row_count > col_count:
79 sys.exit(
80 'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
81 'but got more rows than columns.'
82 )
83 row_name, *row_data = line.strip(" \n\r").split("\t")
84 col_name = col_names[row_count - 1]
85 if not row_name:
86 # tolerate omitted row names, use col name instead
87 row_name = col_name
88 if row_name != col_name:
89 sys.exit(
90 'This tool expects a symmetric distance matrix with identical names for rows and columns, '
91 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.'
92 )
93 if len(row_data) != col_count:
94 sys.exit(
95 'This tool expects a symmetric distance matrix with the same number of columns on each row, '
96 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.'
97 )
98 try:
99 matrix.append([float(x) for x in row_data])
100 except ValueError as e:
101 sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
102 if row_count < col_count:
103 sys.exit(
104 'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
105 'but got more columns than rows.'
106 )
107
108 # turn the distance matrix into "condensed" vector form
109 # this gives us further checks and raises ValueErrors if:
110 # - the values on the diagonal aren't zero
111 # - the upper and lower triangle of the matrix aren't identical
112 D = scipy.spatial.distance.squareform(matrix)
113
114 # perform the requested clustering and retrieve the result as a linkage object
115 linkage = scipy.cluster.hierarchy.linkage(D, args.method)
116
117 with open(args.out_prefix + '.tree.newick', 'w') as o:
118 o.write(linkage_as_newick(linkage, col_names))
119
120 # cut the tree as specified and report sample to cluster assignments
121 if args.n_clusters or args.height:
122 if args.n_clusters:
123 cut_values = args.n_clusters
124 colname_template = "cluster_id_n{}"
125 else:
126 cut_values = args.height
127 colname_template = "cluster_id_h{}"
128 header_cols = ["sample"] + [
129 colname_template.format(x) for x in cut_values
130 ]
131 cluster_assignments = []
132 for name, cluster_ids in zip(
133 col_names,
134 scipy.cluster.hierarchy.cut_tree(
135 linkage,
136 args.n_clusters,
137 args.height
138 )
139 ):
140 cluster_assignments.append(
141 [name]
142 + [str(c + 1) for c in cluster_ids]
143 )
144 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
145 print("\t".join(header_cols), file=o)
146 for ass in cluster_assignments:
147 print("\t".join(ass), file=o)