comparison clustering_from_distmat.py @ 1:c0b01c55a0e0 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit 65b5c6f177478883ce664aeb6f27d0bec7155fdc
author iuc
date Mon, 19 Aug 2024 15:33:16 +0000
parents 8192b416f945
children
comparison
equal deleted inserted replaced
0:8192b416f945 1:c0b01c55a0e0
1 import argparse 1 import argparse
2 import sys 2 import sys
3 from collections import Counter
3 4
4 import scipy 5 import scipy
5 6
6 7
7 def linkage_as_newick(linkage, tip_names): 8 def linkage_as_newick(linkage, tip_names):
43 "median", 44 "median",
44 "ward" 45 "ward"
45 ], 46 ],
46 help="Clustering method to use" 47 help="Clustering method to use"
47 ) 48 )
49 missing_names = parser.add_mutually_exclusive_group()
50 missing_names.add_argument(
51 "--nc", "--no-colnames", action="store_true",
52 help="Indicate that the distance matrix input does not feature column names"
53 )
54 missing_names.add_argument(
55 "--nr", "--no-rownames", action="store_true",
56 help="Indicate that the distance matrix input does not feature row names"
57 )
48 cut_mode = parser.add_mutually_exclusive_group() 58 cut_mode = parser.add_mutually_exclusive_group()
49 cut_mode.add_argument( 59 cut_mode.add_argument(
50 "-n", "--n-clusters", nargs="*", type=int 60 "-n", "--n-clusters", nargs="*", type=int
51 ) 61 )
52 cut_mode.add_argument( 62 cut_mode.add_argument(
53 "--height", nargs="*", type=float 63 "--height", nargs="*", type=float
54 ) 64 )
65 parser.add_argument("-s", "--min-cluster-size", type=int, default=2)
55 args = parser.parse_args() 66 args = parser.parse_args()
56
57 # TO DO:
58 # - parse outputs to generate
59 67
60 # read from input and check that 68 # read from input and check that
61 # we have been passed a symmetric distance matrix 69 # we have been passed a symmetric distance matrix
62 with open(args.infile) as i: 70 with open(args.infile) as i:
63 col_names = next(i).rstrip("\n\r").split("\t")[1:] 71 col_count = None
64 col_count = len(col_names)
65 if not col_count:
66 sys.exit(
67 'No data columns found. '
68 'This tool expects tabular input with column names on the first line '
69 'and a row name in the first column of each row followed by data columns.'
70 )
71 row_count = 0 72 row_count = 0
72 matrix = [] 73 matrix = []
74 if args.nc:
75 col_names = col_count = None
76 else:
77 while True:
78 # skip leading empty lines
79 line = next(i).rstrip("\n\r")
80 if line:
81 break
82 if args.nr:
83 col_names = line.split("\t")
84 else:
85 # first column is for row names, rest are column names
86 col_names = line.split("\t")[1:]
87 col_count = len(col_names)
88 if not col_count:
89 sys.exit(
90 'No data columns found. '
91 'By default, this tool expects tabular input with column names on the first line '
92 'and a row name in the first column of each row followed by data columns. '
93 'Use --no-colnames or --no-rownames to modify the expected format.'
94 )
73 for line in i: 95 for line in i:
74 if not line.strip(): 96 if not line.strip():
75 # skip empty lines 97 # skip empty lines
76 continue 98 continue
77 row_count += 1 99 row_count += 1
78 if row_count > col_count: 100 if col_count is not None and row_count > col_count:
79 sys.exit( 101 sys.exit(
80 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' 102 'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
81 'but got more rows than columns.' 103 'but got more rows than columns.'
82 ) 104 )
83 row_name, *row_data = line.strip(" \n\r").split("\t") 105 if args.nr:
106 row_name = None
107 row_data = line.rstrip("\n\r").split("\t")
108 else:
109 row_name, *row_data = line.rstrip("\n\r").split("\t")
110 if col_count is None:
111 col_count = len(row_data)
112 col_names = [None] * col_count
84 col_name = col_names[row_count - 1] 113 col_name = col_names[row_count - 1]
85 if not row_name: 114 if not row_name and col_name:
86 # tolerate omitted row names, use col name instead 115 # tolerate omitted row names, use col name instead
87 row_name = col_name 116 row_name = col_name
117 elif row_name and not col_name:
118 # likewise for column names
119 # plus update list of col names with row name
120 col_name = col_names[row_count - 1] = row_name
121 elif not row_name and not col_name:
122 sys.exit(
123 'Each sample in the distance matrix must have its name specified via a row name, a column name, or both, '
124 f'but found no name for sample number {row_count}'
125 )
88 if row_name != col_name: 126 if row_name != col_name:
89 sys.exit( 127 sys.exit(
90 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' 128 'This tool expects a symmetric distance matrix with identical names for rows and columns, '
91 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' 129 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.'
92 ) 130 )
96 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' 134 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.'
97 ) 135 )
98 try: 136 try:
99 matrix.append([float(x) for x in row_data]) 137 matrix.append([float(x) for x in row_data])
100 except ValueError as e: 138 except ValueError as e:
101 sys.exit(str(e) + f' on row {row_count} ("{row_name}")') 139 if args.nr:
140 sys.exit(str(e) + f' on row {row_count}')
141 else:
142 sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
102 if row_count < col_count: 143 if row_count < col_count:
103 sys.exit( 144 sys.exit(
104 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' 145 'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
105 'but got more columns than rows.' 146 'but got more columns than rows.'
106 ) 147 )
126 cut_values = args.height 167 cut_values = args.height
127 colname_template = "cluster_id_h{}" 168 colname_template = "cluster_id_h{}"
128 header_cols = ["sample"] + [ 169 header_cols = ["sample"] + [
129 colname_template.format(x) for x in cut_values 170 colname_template.format(x) for x in cut_values
130 ] 171 ]
172 cut_result = scipy.cluster.hierarchy.cut_tree(
173 linkage,
174 args.n_clusters,
175 args.height
176 )
177
178 # Go through the cut results once to determine cluster sizes
179
180 # In the final report, the ids of clusters with fewer members than
181 # args.min_cluster_size will be masked with "-".
182 # The remaining cluster ids will be renumbered to start fom 1.
183 # This has to be done for each clustering resulting from the
184 # user-specified cut_values.
185 cluster_member_counts = [Counter() for _ in cut_values]
186 effective_cluster_ids = [{} for _ in cut_values]
187 for cluster_ids in cut_result:
188 for cl_count, cl_id, eff_id in zip(cluster_member_counts, cluster_ids, effective_cluster_ids):
189 cl_count[cl_id] += 1
190 for counter, eff_ids in zip(cluster_member_counts, effective_cluster_ids):
191 eff_id = 1
192 for item, count in counter.items():
193 # Since Python 3.7, Counter objects (like dicts) preserve
194 # insertion order so we can be sure that in the mapping
195 # constructed below, clusters will get renumbered in
196 # the order they will be reported later.
197 if count >= args.min_cluster_size:
198 eff_ids[item] = str(eff_id)
199 eff_id += 1
200 else:
201 eff_ids[item] = "-"
202
203 # build and write the cluster assignment report
204 # with remapped cluster ids
131 cluster_assignments = [] 205 cluster_assignments = []
132 for name, cluster_ids in zip( 206 for name, cluster_ids in zip(col_names, cut_result):
133 col_names,
134 scipy.cluster.hierarchy.cut_tree(
135 linkage,
136 args.n_clusters,
137 args.height
138 )
139 ):
140 cluster_assignments.append( 207 cluster_assignments.append(
141 [name] 208 [name]
142 + [str(c + 1) for c in cluster_ids] 209 + [
210 eff_ids[c]
211 for c, eff_ids in zip(cluster_ids, effective_cluster_ids)
212 ]
143 ) 213 )
144 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: 214 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
145 print("\t".join(header_cols), file=o) 215 print("\t".join(header_cols), file=o)
146 for ass in cluster_assignments: 216 for ass in cluster_assignments:
147 print("\t".join(ass), file=o) 217 print("\t".join(ass), file=o)