comparison cdhit_analysis.py @ 4:e64af72e1b8f draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:44:40 +0000
parents c6981ea453ae
children
3:c6981ea453ae 4:e64af72e1b8f
1 """ 1 """
2 This script processes cluster output files from cd-hit-est for use in Galaxy. 2 This script processes cluster output files from cd-hit-est for use in Galaxy.
3 It extracts cluster information, associates taxa and e-values from annotation files, 3 It extracts cluster information, associates taxa from annotation files,
4 performs statistical calculations, and generates text and plot outputs 4 performs statistical calculations, and generates text and plot outputs
5 summarizing similarity and taxonomic distributions. 5 summarizing similarity and taxonomic distributions.
6 6
7
8 Main steps: 7 Main steps:
9 1. Parse cd-hit-est cluster file and (optional) annotation file. 8 1. Parse cd-hit-est cluster file and (optional) annotation Excel file.
10 2. Process each cluster to extract similarity, taxa, and e-value information. 9 2. Process each cluster to extract similarity and taxa information.
11 3. Aggregate results across clusters. 10 3. Aggregate results across clusters.
12 4. Generate requested outputs: text summaries, plots, and Excel reports. 11 4. Generate requested outputs: text summaries, plots, and Excel reports.
13 """ 12 """
14 13
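For orientation: the rewritten parser expects the annotation workbook to carry an ``Individual_Reads`` sheet with the four columns it validates (``header``, ``seq_id``, ``source``, ``taxa``). A minimal sketch of producing such a file with pandas, with purely illustrative file name and values:

    import pandas as pd

    # Hypothetical annotation rows; the column names are the ones that
    # required_cols checks for in parse_cluster_file below.
    rows = [
        {"header": "read_001", "seq_id": "AB123456", "source": "BOLD",
         "taxa": "Animalia / Chordata / Aves / Passeriformes / Paridae / Parus / Parus major"},
    ]
    pd.DataFrame(rows).to_excel(
        "annotations.xlsx", sheet_name="Individual_Reads", index=False
    )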
15 import argparse 14 import argparse
16 from collections import Counter, defaultdict 15 from collections import Counter, defaultdict
17 import os 16 import os
18 import re 17 import re
18 from math import sqrt
19
19 import matplotlib.pyplot as plt 20 import matplotlib.pyplot as plt
20 import pandas as pd 21 import pandas as pd
21 from math import sqrt
22 import openpyxl 22 import openpyxl
23 23
24 def log_message(messages, text: str):
25 """Append a message to the log list."""
26 messages.append(text)
24 27
25 28
26 def parse_arguments(args_list=None): 29 def parse_arguments(args_list=None):
27 """Parse command-line arguments for the script.""" 30 """Parse command-line arguments for the script."""
28 parser = argparse.ArgumentParser( 31 parser = argparse.ArgumentParser(description="Create taxa analysis from cd-hit cluster files")
29 description='Create taxa analysis from cd-hit cluster files') 32 parser.add_argument("--input_cluster", type=str, required=True, help="Input cluster file (.clstr)")
30 parser.add_argument('--input_cluster', type=str, required=True, 33 parser.add_argument("--input_annotation", type=str, required=False, help="Input annotation Excel file (header annotations)")
31 help='Input cluster file (.clstr)') 34
32 parser.add_argument('--input_annotation', type=str, required=False, 35 parser.add_argument("--output_similarity_txt", type=str, help="Similarity text output file")
33 help='Input annotation file (.out)') 36 parser.add_argument("--output_similarity_plot", type=str, help="Similarity plot output file (PNG)")
34 37 parser.add_argument("--output_count", type=str, help="Count summary output file")
35 # Galaxy output files 38 parser.add_argument("--output_excel", type=str, help="Combined taxa Excel report output file")
36 parser.add_argument('--output_similarity_txt', type=str, 39 parser.add_argument("--output_taxa_clusters", action="store_true", default=False, help="Excel output: include raw taxa per cluster sheet")
37 help='Similarity text output file') 40 parser.add_argument("--output_taxa_processed", action="store_true", default=False, help="Excel output: include processed/LCA taxa per cluster sheet")
38 parser.add_argument('--output_similarity_plot', type=str, 41 parser.add_argument("--log_file", type=str, help="Optional log file with run statistics")
39 help='Similarity plot output file') 42
40 parser.add_argument('--output_evalue_txt', type=str, 43 parser.add_argument("--simi_plot_y_min", type=float, default=95.0, help="Minimum value of the y-axis in the similarity plot")
41 help='E-value text output file') 44 parser.add_argument("--simi_plot_y_max", type=float, default=100.0, help="Maximum value of the y-axis in the similarity plot")
42 parser.add_argument('--output_evalue_plot', type=str, 45
43 help='E-value plot output file') 46 parser.add_argument("--uncertain_taxa_use_ratio", type=float, default=0.5, help="Ratio at which uncertain taxa count toward the dominant taxa")
44 parser.add_argument('--output_count', type=str, 47 parser.add_argument("--min_to_split", type=float, default=0.45, help=("Minimum fraction (0–0.5) for splitting a cluster taxonomically. "
45 help='Count summary output file') 48 "ratio = count(second) / (count(first) + count(second))"))
46 parser.add_argument('--output_taxa_clusters', type=str, 49 parser.add_argument("--min_count_to_split", type=int, default=10, help=("Minimum combined count of the two top taxa to avoid being forced "
47 help='Taxa per cluster output file') 50 "into an 'uncertain' split at this level."))
48 parser.add_argument('--output_taxa_processed', type=str, 51
49 help='Processed taxa output file') 52 parser.add_argument("--min_cluster_support", type=int, default=1, help=("Minimum number of annotated reads required for a cluster to be "
50 # Plot parameters 53 "included in the processed/LCA taxa sheet. Unannotated reads do not count toward this threshold."))
51 parser.add_argument('--simi_plot_y_min', type=float, default=95.0,
52 help='Minimum value of the y-axis in the similarity plot')
53 parser.add_argument('--simi_plot_y_max', type=float, default=100.0,
54 help='Maximum value of the y-axis in the similarity plot')
55
56 # Uncertain taxa configuration
57 parser.add_argument('--uncertain_taxa_use_ratio', type=float, default=0.5,
58 help='Ratio at which uncertain taxa count toward the correct taxa')
59 parser.add_argument('--min_to_split', type=float, default=0.45,
60 help='Minimum percentage for taxonomic split')
61 parser.add_argument('--min_count_to_split', type=int, default=10,
62 help='Minimum count for taxonomic split')
63
64 # Processing options
65 parser.add_argument('--show_unannotated_clusters', action='store_true', default=False,
66 help='Show unannotated clusters in output')
67 parser.add_argument('--make_taxa_in_cluster_split', action='store_true', default=False,
68 help='Split clusters with multiple taxa')
69 parser.add_argument('--print_empty_files', action='store_true', default=False,
70 help='Print messages about empty annotation files')
71 54
72 return parser.parse_args(args_list) 55 return parser.parse_args(args_list)
73 56
74 57
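As a usage sketch (paths hypothetical), the new flag set can be exercised directly, the same way the tests drive ``main``; note that the two taxa-sheet options are now store_true switches rather than file paths:

    # Hypothetical invocation of the new argument parser.
    args = parse_arguments([
        "--input_cluster", "clusters.clstr",
        "--input_annotation", "annotations.xlsx",
        "--output_count", "counts.tsv",
        "--output_excel", "taxa_report.xlsx",
        "--output_taxa_clusters",
        "--output_taxa_processed",
        "--log_file", "run.log",
    ])
    assert args.min_to_split == 0.45  # default split threshold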
75 # Color map for plots
76 COLORMAP = [ 58 COLORMAP = [
77 # List of RGBA tuples for bar coloring in plots 59 # List of RGBA tuples for bar coloring in plots
78 (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0), 60 (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0),
79 (1.0, 0.4980392156862745, 0.054901960784313725, 1.0), 61 (1.0, 0.4980392156862745, 0.054901960784313725, 1.0),
80 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0), 62 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0),
81 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0), 63 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0),
82 (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0), 64 (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0),
83 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0), 65 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0),
84 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0), 66 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0),
85 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0), 67 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0),
86 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0), 68 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0),
87 (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0) 69 (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0),
88 ] 70 ]
89 71
90 72
91 def parse_cluster_file(cluster_file, annotation_file=None, print_empty=False, raise_on_error=False): 73 def parse_cluster_file(cluster_file, annotation_file, raise_on_error=False, log_messages=None):
92 """ 74 """Parse CD-HIT cluster output and attach taxonomic annotations.
93 Parse the cd-hit-est cluster file (.clstr) and (optionally) an Excel annotation file. 75
94 76 Cluster entries are parsed for read headers, counts, and similarities.
95 It extracts cluster information (header, read count, similarity) and associates 77 If an annotation Excel file is supplied, taxa, seq_id and source fields
96 taxonomic information and E-values from the annotation file based on the read header. 78 are mapped per read from sheet ``Individual_Reads``.
97 79
98 :param cluster_file: Path to cd-hit cluster file (.clstr). 80 :param cluster_file: Path to the CD-HIT cluster file.
99 :type cluster_file: str 81 :type cluster_file: str
100 :param annotation_file: Path to Excel annotation file with taxa and e-values. 82 :param annotation_file: Optional Excel annotation file.
101 :type annotation_file: str, optional 83 :type annotation_file: str or None
102 :param print_empty: Print a message if the annotation file is not found or empty. 84 :param raise_on_error: Whether to raise parsing errors directly.
103 :type print_empty: bool
104 :param raise_on_error: Raise parsing errors instead of printing warnings.
105 :type raise_on_error: bool 85 :type raise_on_error: bool
106 :return: List of clusters, where each cluster is a dict mapping read header to a dict of read info. 86 :param log_messages: Optional message collector for logging.
107 :rtype: list[dict[str, dict]] 87 :type log_messages: list[str] or None
108 :raises ValueError: If similarity cannot be parsed from a cluster line. 88 :return: List of clusters, where each cluster is a dict keyed by header.
109 :raises UnboundLocalError: If an error occurs during annotation processing. 89 :rtype: list[dict]
110 """ 90 """
111 clusters = [] 91 clusters = []
112 current_cluster = {} 92 current_cluster = {}
113 93
114 # Load annotations if provided
115 annotations = {} 94 annotations = {}
116 if annotation_file and os.path.exists(annotation_file): 95 if annotation_file and os.path.exists(annotation_file):
117 # Read the Excel file 96 try:
118 df = pd.read_excel(annotation_file, sheet_name='Individual_Reads', engine='openpyxl') 97 df = pd.read_excel(
119 98 annotation_file,
120 # Iterate over the rows 100 engine="openpyxl",
121 for _, row in df.iterrows(): 100 engine="openpyxl",
122 header = row['header'] # column name as in the Excel sheet 101 )
123 evalue = row['e_value'] # or whichever column name you want to use 102 required_cols = {"header", "seq_id", "source", "taxa"}
124 taxa = row['taxa'] # depending on how the taxa are stored 103 missing = required_cols - set(df.columns)
125 104 if missing:
126 annotations[header] = {'evalue': evalue, 'taxa': taxa} 105 msg = (
127 elif annotation_file and print_empty: 106 f"Annotation file missing columns {missing}, "
128 print(f"Annotation file {annotation_file} not found or empty") 107 "continuing without annotations."
129 with open(cluster_file, 'r') as f: 108 )
109 if log_messages is not None:
110 log_message(log_messages, msg)
111 else:
112 for _, row in df.iterrows():
113 header = str(row["header"])
114 seq_id = str(row["seq_id"])
115 source = str(row["source"])
116 taxa = str(row["taxa"])
117 annotations[header] = {
118 "seq_id": seq_id,
119 "source": source,
120 "taxa": taxa,
121 }
122 if log_messages is not None:
123 log_message(
124 log_messages,
125 f"Loaded {len(annotations)} annotated headers",
126 )
127 except Exception as exc:
128 msg = (
129 "Failed to read annotation file; proceeding without annotations. "
130 f"Details: {exc}"
131 )
132 if log_messages is not None:
133 log_message(log_messages, msg)
134 else:
135 if log_messages is not None:
136 if annotation_file:
137 log_message(
138 log_messages,
139 "Annotation file not found; proceeding without annotations.",
140 )
141 else:
142 log_message(
143 log_messages,
144 "No annotation file provided; proceeding without annotations.",
145 )
146
147 with open(cluster_file, "r") as f:
130 for line in f: 148 for line in f:
131 line = line.strip() 149 line = line.strip()
132 if line.startswith('>Cluster'): 150 if not line:
151 continue
152
153 if line.startswith(">Cluster"):
133 # Start of new cluster, save previous if exists 154 # Start of new cluster, save previous if exists
134 if current_cluster: 155 if current_cluster:
135 clusters.append(current_cluster) 156 clusters.append(current_cluster)
136 current_cluster = {} 157 current_cluster = {}
137 else: 158 else:
138 # Parse sequence line
139 parts = line.split() 159 parts = line.split()
140 if len(parts) >= 2: 160 if len(parts) < 3:
141 # Extract header and count 161 continue
142 header_part = parts[2].strip('>.') 162
143 header_parts = header_part.split('(') 163 header_part = parts[2].strip(">.")
144 if len(header_parts) > 1: 164 header_parts = header_part.split("(")
145 last_part = header_parts[-1].strip(')') 165 header = header_parts[0]
146 header = header_parts[0] 166 count = 0
147 if last_part: 167 if len(header_parts) > 1:
168 last_part = header_parts[-1].strip(")")
169 if last_part:
170 try:
148 count = int(last_part) 171 count = int(last_part)
149 else: 172 except ValueError:
150 print('no count')
151 count = 0 173 count = 0
152 header = '' 174
153 # Extract similarity 175 similarity_part = parts[-1].strip()
154 similarity_part = parts[-1].strip() 176 if "*" in similarity_part:
155 if '*' in similarity_part: 177 similarity = 100.0
156 similarity = 100.0 # Representative sequence 178 else:
179 matches = re.findall(r"[\d.]+", similarity_part)
180 if matches:
181 similarity = float(matches[-1])
157 else: 182 else:
158 matches = re.findall(r'[\d.]+', similarity_part) 183 msg = f"Could not parse similarity from: '{similarity_part}'"
159 if matches:
160 similarity = float(matches[-1])
161 else:
162 raise ValueError(f"Could not parse similarity from: '{similarity_part}'")
163 # Get annotation info
164 try:
165 if header in annotations:
166 taxa = annotations[header]['taxa']
167 evalue = annotations[header]['evalue']
168 else:
169 taxa = 'Unannotated read'
170 evalue = 'Unannotated read'
171
172 current_cluster[header] = {
173 'count': count,
174 'similarity': similarity,
175 'taxa': taxa,
176 'evalue': evalue
177 }
178 except UnboundLocalError as e:
179 if raise_on_error: 184 if raise_on_error:
180 raise UnboundLocalError(str(e)) 185 raise ValueError(msg)
181 print(f"Error: {e}, No annotations found") 186 if log_messages is not None:
182 187 log_message(log_messages, f"WARNING: {msg}")
183 # Add the last cluster 188 similarity = 0.0
189
190 if header in annotations:
191 taxa = annotations[header]["taxa"]
192 seq_id = annotations[header]["seq_id"]
193 source = annotations[header]["source"]
194 else:
195 taxa = "Unannotated read"
196 seq_id = "Unannotated read"
197 source = "Unannotated read"
198
199 current_cluster[header] = {
200 "count": count,
201 "similarity": similarity,
202 "taxa": taxa,
203 "seq_id": seq_id,
204 "source": source,
205 }
206
184 if current_cluster: 207 if current_cluster:
185 clusters.append(current_cluster) 208 clusters.append(current_cluster)
186 209
210 if log_messages is not None:
211 log_message(log_messages, f"Parsed {len(clusters)} clusters")
212
187 return clusters 213 return clusters
188 214
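For reference, a cd-hit-est cluster block has the shape below; under the parsing rules above, the representative line (``*``) gets similarity 100.0, and the ``(25)`` suffix on the header is read as the dereplicated read count (headers hypothetical):

    >Cluster 0
    0	141nt, >read_001(25)... *
    1	138nt, >read_002(3)... at +/98.58%

    # Expected shape of the resulting cluster dict (annotation fields
    # filled from the workbook, or "Unannotated read" placeholders):
    # {"read_001": {"count": 25, "similarity": 100.0, "taxa": ..., "seq_id": ..., "source": ...},
    #  "read_002": {"count": 3, "similarity": 98.58, ...}}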
189 215
190 def process_cluster_data(cluster): 216 def process_cluster_data(cluster: dict):
191 """ 217 """Convert a raw cluster into similarity lists and aggregated taxa counts.
192 Process a single cluster to extract E-value, similarity, and taxa data for all reads. 218
193 219 Similarity values are expanded based on read count. Taxa are grouped
194 Aggregates information from all reads in the cluster, storing read counts, 220 by unique taxonomic labels, tracking supporting seq_ids and sources.
195 E-values, similarities, and taxa in lists and a dictionary. 221
196 222 :param cluster: Mapping of read header to cluster metadata.
197 :param cluster: Cluster data mapping read headers to read info.
198 :type cluster: dict 223 :type cluster: dict
199 :return: A tuple containing: (list of E-values, list of similarity values, dict of taxa -> counts). 224 :return: similarity_list, taxa_map, annotated_count, unannotated_count
200 The first element of the E-value list ([0]) stores the count of unannotated reads. 225 :rtype: tuple[list[float], dict, int, int]
201 :rtype: tuple[list[float | int], list[float], dict[str, int]] 226 """
202 """ 227 similarity_list: list[float] = []
203 eval_list = [0.0] # First element for unannotated count 228 taxa_map: dict[str, dict] = {}
204 simi_list = [] 229 annotated_count = 0
205 taxa_dict = {'Unannotated read': 0} 230 unannotated_count = 0
206 231
207 for header, info in cluster.items(): 232 for _header, info in cluster.items():
208 count = info['count'] 233 count = info["count"]
209 similarity = info['similarity'] 234 similarity = info["similarity"]
210 taxa = info['taxa'] 235 taxa = info["taxa"]
211 evalue = info['evalue'] 236 seq_id = info["seq_id"]
212 237 source = info["source"]
213 if evalue == 'Unannotated read': 238
214 eval_list[0] += count 239 similarity_list.extend([similarity] * count)
215 taxa_dict['Unannotated read'] += count 240
241 key = taxa if seq_id != "Unannotated read" else "Unannotated read"
242 if key not in taxa_map:
243 taxa_map[key] = {
244 "count": 0,
245 "seq_ids": set(),
246 "sources": set(),
247 }
248
249 taxa_map[key]["count"] += count
250 taxa_map[key]["seq_ids"].add(seq_id)
251 taxa_map[key]["sources"].add(source)
252
253 if seq_id == "Unannotated read":
254 unannotated_count += count
216 else: 255 else:
217 try: 256 annotated_count += count
218 eval_val = float(evalue) 257
219 for _ in range(count): 258 return similarity_list, taxa_map, annotated_count, unannotated_count
220 eval_list.append(eval_val) 259
221 except ValueError:
222 eval_list[0] += count
223
224 if taxa not in taxa_dict:
225 taxa_dict[taxa] = 0
226 taxa_dict[taxa] += count
227
228 # Add similarity values
229 for _ in range(count):
230 simi_list.append(similarity)
231 return eval_list, simi_list, taxa_dict
232 260
233 261
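A toy illustration of the new return values, assuming one annotated and one unannotated read:

    cluster = {
        "read_001": {"count": 3, "similarity": 100.0,
                     "taxa": "Animalia / Chordata / Aves / Passeriformes / Paridae / Parus / Parus major",
                     "seq_id": "AB123456", "source": "BOLD"},
        "read_002": {"count": 2, "similarity": 97.5,
                     "taxa": "Unannotated read",
                     "seq_id": "Unannotated read", "source": "Unannotated read"},
    }
    simi, taxa_map, annotated, unannotated = process_cluster_data(cluster)
    # simi == [100.0, 100.0, 100.0, 97.5, 97.5]
    # annotated == 3, unannotated == 2
    # taxa_map keys: the full taxa string and "Unannotated read"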
234 def calculate_cluster_taxa(taxa_dict, args): 262 def calculate_cluster_taxa(taxa_dict, args):
235 """ 263 """Resolve cluster-level taxa using weighted recursive LCA splitting.
236 Calculate the most likely taxa for a cluster based on read counts. 264
237 265 Unannotated reads are treated as fully uncertain taxa but contribute
238 This function applies the 'uncertain taxa use ratio' for unannotated reads 266 partially via ``uncertain_taxa_use_ratio`` when determining dominant levels.
239 and uses a recursive approach to potentially split a cluster into sub-clusters 267
240 if taxonomic dominance is not strong enough (based on ``min_to_split`` 268 :param taxa_dict: Mapping of taxa string to total read count.
241 and ``min_count_to_split``).
242
243 :param taxa_dict: Mapping of taxa (full string) -> read counts.
244 :type taxa_dict: dict[str, int] 269 :type taxa_dict: dict[str, int]
245 :param args: Parsed script arguments, including parameters for taxa calculation. 270 :param args: Argument namespace containing resolution parameters.
246 :type args: argparse.Namespace 271 :type args: argparse.Namespace
247 :return: A list of refined taxa assignments (dictionaries), where each dictionary 272 :return: List of taxonomic groups after recursive resolution.
248 represents a potentially split sub-cluster. 273 :rtype: list[dict[str, int]]
249 :rtype: list[dict[str, float | int]] 274 """
250 """
251 # Replace 'Unannotated read' with uncertain taxa format
252 processed_dict = {} 275 processed_dict = {}
253 for taxa, count in taxa_dict.items(): 276 for taxa, count in taxa_dict.items():
254 if taxa == 'Unannotated read': 277 if taxa == "Unannotated read":
255 uncertain_taxa = ' / '.join(['Uncertain taxa'] * 7) 278 uncertain_taxa = " / ".join(["Uncertain taxa"] * 7)
256 processed_dict[uncertain_taxa] = count 279 processed_dict[uncertain_taxa] = count
257 else: 280 else:
258 processed_dict[taxa] = count 281 processed_dict[taxa] = count
259 282
260 return _recursive_taxa_calculation(processed_dict, args, 0) 283 return _recursive_taxa_calculation(processed_dict, args, 0)
261 284
262 285
263 def _recursive_taxa_calculation(taxa_dict, args, level): 286 def _recursive_taxa_calculation(taxa_dict, args, level):
264 """ 287 """Recursive helper performing level-by-level weighted LCA resolution.
265 Recursive helper to calculate and potentially split taxa at each taxonomic level. 288
266 289 At each taxonomic rank, taxa are grouped and compared. If multiple
267 :param taxa_dict: Taxa counts at the current level (or sub-group). 290 taxa exceed the configured split thresholds, the cluster is divided
268 :type taxa_dict: dict[str, float | int] 291 and recursion continues deeper.
269 :param args: Parsed script arguments. 292
293 :param taxa_dict: Mapping of taxa to count for this recursion stage.
294 :type taxa_dict: dict[str, int]
295 :param args: Parameter namespace controlling uncertainty and split rules.
270 :type args: argparse.Namespace 296 :type args: argparse.Namespace
271 :param level: Index of the current taxonomic level (0=kingdom, max 6=species). 297 :param level: Current taxonomy depth (0–6).
272 :type level: int 298 :type level: int
273 :return: List of refined taxa dictionaries. 299 :return: One or more resolved taxon groups at deeper levels.
274 :rtype: list[dict[str, float | int]] 300 :rtype: list[dict[str, int]]
275 """ 301 """
276 if level >= 7: # Max 7 taxonomic levels 302 if level >= 7:
277 return [taxa_dict] 303 return [taxa_dict]
278 304
279 level_dict = defaultdict(float) 305 level_dict = defaultdict(float)
280
281 # Group by taxonomic level
282 for taxa, count in taxa_dict.items(): 306 for taxa, count in taxa_dict.items():
283 taxa_parts = taxa.split(' / ') 307 taxa_parts = taxa.split(" / ")
284 if level < len(taxa_parts): 308 if level < len(taxa_parts):
285 level_taxon = taxa_parts[level] 309 level_taxon = taxa_parts[level]
286 if level_taxon == 'Uncertain taxa': 310 if level_taxon == "Uncertain taxa":
287 level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio 311 level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio
288 else: 312 else:
289 level_dict[level_taxon] += count 313 level_dict[level_taxon] += count
290 314
291 if len(level_dict) <= 1: 315 if len(level_dict) <= 1:
292 # Only one taxon at this level, continue to next level
293 return _recursive_taxa_calculation(taxa_dict, args, level + 1) 316 return _recursive_taxa_calculation(taxa_dict, args, level + 1)
294 317
295 # Sort by abundance
296 sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True) 318 sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True)
297
298 result = [] 319 result = []
299 dominant_taxon = sorted_taxa[0][0] 320 dominant_taxon = sorted_taxa[0][0]
300 321
301 # Check if we should split
302 for i in range(1, len(sorted_taxa)): 322 for i in range(1, len(sorted_taxa)):
303 secondary_taxon = sorted_taxa[i][0] 323 secondary_taxon = sorted_taxa[i][0]
304 total_count = sorted_taxa[0][1] + sorted_taxa[i][1] 324 total_count = sorted_taxa[0][1] + sorted_taxa[i][1]
305 ratio = sorted_taxa[i][1] / total_count 325 ratio = sorted_taxa[i][1] / total_count
306 326
307 if ratio >= args.min_to_split or total_count <= args.min_count_to_split: 327 if ratio >= args.min_to_split or total_count <= args.min_count_to_split:
308 # Split off this taxon 328 split_dict = {
309 split_dict = {taxa: count for taxa, count in taxa_dict.items() 329 taxa: count
310 if taxa.split(' / ')[level] == secondary_taxon} 330 for taxa, count in taxa_dict.items()
331 if taxa.split(" / ")[level] == secondary_taxon
332 }
311 result.extend(_recursive_taxa_calculation(split_dict, args, level + 1)) 333 result.extend(_recursive_taxa_calculation(split_dict, args, level + 1))
312 334
313 # Process the dominant group 335 dominant_dict = {
314 dominant_dict = {taxa: count for taxa, count in taxa_dict.items() 336 taxa: count
315 if taxa.split(' / ')[level] == dominant_taxon} 337 for taxa, count in taxa_dict.items()
338 if taxa.split(" / ")[level] == dominant_taxon
339 }
316 result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1)) 340 result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1))
317 341
318 return result 342 return result
319 343
320 344
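A worked example of the split rule, ``ratio = count(second) / (count(first) + count(second))``: counts 60 and 40 give ratio 0.40, below the default ``min_to_split`` of 0.45, and their combined count 100 exceeds ``min_count_to_split``, so no split occurs and only the dominant branch is carried into deeper levels. With counts 6 and 4 the ratio is the same, but the combined count 10 does not exceed the threshold of 10, so the minority taxon is split off:

    from argparse import Namespace

    # Hypothetical parameter namespace mirroring the argparse defaults.
    params = Namespace(uncertain_taxa_use_ratio=0.5, min_to_split=0.45,
                       min_count_to_split=10)
    groups = calculate_cluster_taxa(
        {"A / B / C / D / E / F / G": 6,
         "A / B / C / D / E / F / H": 4},
        params,
    )
    # Two groups come back: the taxa differ only at the species level and
    # their combined count (10) is <= min_count_to_split, so both are kept
    # as separate sub-clusters instead of collapsing onto the dominant one.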
321 def write_similarity_output(all_simi_data, output_file): 345
322 """ 346 def write_similarity_output(cluster_data_list, output_file, log_messages=None):
323 Write the similarity text output, including the mean and standard deviation, 347 """Write similarity statistics and per-cluster distributions.
324 and a count per similarity percentage. 348
325 349 Output includes mean, standard deviation, and counts of each
326 :param all_simi_data: List of all similarity percentages from all reads. 350 similarity value per cluster.
327 :type all_simi_data: list[float] 351
328 :param output_file: Path to the output file. 352 :param cluster_data_list: List of processed cluster dictionaries.
353 :type cluster_data_list: list[dict]
354 :param output_file: Destination path for the TSV summary.
329 :type output_file: str 355 :type output_file: str
356 :param log_messages: Optional log collector.
357 :type log_messages: list[str] or None
330 :return: None 358 :return: None
331 :rtype: None 359 :rtype: None
332 """ 360 """
333 if not all_simi_data: 361 all_simi = []
362 pair_counter = Counter()
363
364 for cluster_index, cluster_data in enumerate(cluster_data_list):
365 simi_list = cluster_data["similarities"]
366 if not simi_list:
367 continue
368 all_simi.extend(simi_list)
369 for s in simi_list:
370 pair_counter[(cluster_index, s)] += 1
371
372 if not all_simi:
373 if log_messages is not None:
374 log_message(
375 log_messages,
376 "No similarity data found; similarity text file not written.",
377 )
334 return 378 return
335 379
336 avg = sum(all_simi_data) / len(all_simi_data) 380 avg = sum(all_simi) / len(all_simi)
337 variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data) 381 variance = sum((s - avg) ** 2 for s in all_simi) / len(all_simi)
338 st_dev = sqrt(variance) 382 st_dev = sqrt(variance)
339 383
340 simi_counter = Counter(all_simi_data) 384 with open(output_file, "w") as f:
341 simi_sorted = sorted(simi_counter.items(), key=lambda x: -x[0]) 385 f.write(f"# Average similarity (all reads): {avg:.2f}\n")
342 386 f.write(f"# Standard deviation (all reads): {st_dev:.2f}\n")
343 with open(output_file, 'w') as f: 387 f.write("cluster\tsimilarity\tcount\n")
344 f.write(f"# Average similarity: {avg:.2f}\n") 388
345 f.write(f"# Standard deviation: {st_dev:.2f}\n") 389 for (cluster_index, similarity), count in sorted(
346 f.write("similarity\tcount\n") 390 pair_counter.items(), key=lambda x: (x[0][0], -x[0][1])
347 for similarity, count in simi_sorted: 391 ):
348 f.write(f"{similarity}\t{count}\n") 392 f.write(f"{cluster_index}\t{similarity}\t{count}\n")
349 393
350 394 if log_messages is not None:
351 def write_evalue_output(all_eval_data, output_file): 395 log_message(log_messages, "Similarity text summary written successfully")
352 """ 396 log_message(
353 Write the E-value text output, including the count of unannotated reads 397 log_messages, f"Similarity mean = {avg:.2f}, std = {st_dev:.2f}"
354 and a count per E-value. 398 )
355 399
356 :param all_eval_data: List of E-values from all reads. The first element ([0]) is the count of unannotated reads. 400
357 :type all_eval_data: list[float | int] 401 def write_count_output(cluster_data_list, output_file, log_messages=None):
358 :param output_file: Path to the output file. 402 """Write counts of annotated and unannotated reads per cluster.
403
404 A summary row containing total and percentage values is appended.
405
406 :param cluster_data_list: List of processed clusters.
407 :type cluster_data_list: list[dict]
408 :param output_file: Path for the count summary text file.
359 :type output_file: str 409 :type output_file: str
410 :param log_messages: Optional logging accumulator.
411 :type log_messages: list[str] or None
360 :return: None 412 :return: None
361 :rtype: None 413 :rtype: None
362 """ 414 """
363 unanno_count = all_eval_data[0] 415 total_annotated = 0
364 eval_counter = Counter(all_eval_data[1:]) 416 total_unannotated = 0
365 417
366 with open(output_file, 'w') as f: 418 with open(output_file, "w") as f:
367 f.write("evalue\tcount\n") 419 f.write(
368 if unanno_count > 0: 420 "cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n"
369 f.write(f"unannotated\t{unanno_count}\n") 421 )
370 422
371 eval_sorted = sorted(eval_counter.items(), 423 for cluster_index, cluster_data in enumerate(cluster_data_list):
372 key=lambda x: (-x[1], float(x[0]) if isinstance(x[0], (int, float)) else float('inf'))) 424 unannotated = cluster_data["unannotated"]
373 for value, count in eval_sorted: 425 annotated = cluster_data["annotated"]
374 f.write(f"{value}\t{count}\n")
375
376
377 def write_count_output(all_eval_data, cluster_data_list, output_file):
378 """
379 Write a summary of annotated and unannotated read counts per cluster
380 and for the total sample.
381
382 :param all_eval_data: List of E-values from all reads for the total count summary.
383 :type all_eval_data: list[float | int]
384 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
385 :type cluster_data_list: list[tuple]
386 :param output_file: Path to the output file.
387 :type output_file: str
388 :return: None
389 :rtype: None
390 """
391 with open(output_file, 'w') as f:
392 f.write("cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n")
393
394 for cluster_num, (eval_list, _, _) in enumerate(cluster_data_list):
395 unannotated = eval_list[0]
396 annotated = len(eval_list[1:])
397 total = unannotated + annotated 426 total = unannotated + annotated
427
428 total_annotated += annotated
429 total_unannotated += unannotated
398 430
399 if total > 0: 431 if total > 0:
400 perc_annotated = (annotated / total) * 100 432 perc_annotated = (annotated / total) * 100
401 perc_unannotated = (unannotated / total) * 100 433 perc_unannotated = (unannotated / total) * 100
402 else: 434 else:
403 perc_annotated = perc_unannotated = 0 435 perc_annotated = perc_unannotated = 0.0
404 436
405 f.write( 437 f.write(
406 f"{cluster_num}\t{unannotated}\t{annotated}\t{total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n") 438 f"{cluster_index}\t{unannotated}\t{annotated}\t{total}"
407 439 f"\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n"
408 # Add full sample summary 440 )
409 total_unannotated = all_eval_data[0] 441
410 total_annotated = len(all_eval_data[1:]) 442 grand_total = total_annotated + total_unannotated
411 grand_total = total_unannotated + total_annotated
412
413 if grand_total > 0: 443 if grand_total > 0:
414 perc_annotated = (total_annotated / grand_total) * 100 444 perc_annotated = (total_annotated / grand_total) * 100
415 perc_unannotated = (total_unannotated / grand_total) * 100 445 perc_unannotated = (total_unannotated / grand_total) * 100
416 else: 446 else:
417 perc_annotated = perc_unannotated = 0 447 perc_annotated = perc_unannotated = 0.0
418 448
419 f.write( 449 f.write(
420 f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n") 450 f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}"
421 451 f"\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n"
422 452 )
423 def write_taxa_clusters_output(cluster_data_list, output_file): 453
424 """ 454 if log_messages is not None:
425 Write raw taxa information per cluster to an Excel file. 455 log_message(log_messages, "Count summary written successfully")
426 456 log_message(
427 Each row contains the cluster number, read count, the full taxa string, 457 log_messages,
428 and the separate taxonomic levels (kingdom through species). 458 f"TOTAL annotated={total_annotated}, unannotated={total_unannotated}, total={grand_total}",
429 459 )
430 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. 460
431 :type cluster_data_list: list[tuple] 461
432 :param output_file: Path to the output Excel file. 462 def write_taxa_excel(cluster_data_list, args, output_file, write_raw: bool, write_processed: bool, log_messages=None):
463 """Write raw and processed taxa data into a combined Excel report.
464
465 Generates up to three sheets:
466
467 - ``Raw_Taxa_Clusters``: Full taxa per read cluster (if enabled).
468 - ``Processed_Taxa_Clusters``: LCA-resolved taxa (if enabled).
469 - ``Settings``: Parameters used for taxonomic resolution.
470
471 seq_id and source tracking are kept only in the raw sheet.
472
473 :param cluster_data_list: List containing similarity/taxa data per cluster.
474 :type cluster_data_list: list[dict]
475 :param args: Parsed arguments containing LCA configuration.
476 :type args: argparse.Namespace
477 :param output_file: Path to the combined Excel file.
433 :type output_file: str 478 :type output_file: str
479 :param write_raw: Whether to include the raw taxa sheet.
480 :type write_raw: bool
481 :param write_processed: Whether to include the processed/LCA sheet.
482 :type write_processed: bool
483 :param log_messages: Optional log collector.
484 :type log_messages: list[str] or None
434 :return: None 485 :return: None
435 :rtype: None 486 :rtype: None
436 """ 487 """
437 # Create main dataframe 488 raw_rows = []
438 data = [] 489 processed_rows = []
439 for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list): 490
440 for taxa, count in taxa_dict.items(): 491 if write_raw:
441 if count > 0: 492 for cluster_index, cluster_data in enumerate(cluster_data_list):
442 # Split taxa into taxonomic levels 493 taxa_map = cluster_data["taxa_map"]
443 taxa_levels = taxa.split(' / ') if taxa else [] 494 for taxa, info in taxa_map.items():
444 taxa_levels += ['Unannotated read'] * (7 - len(taxa_levels)) 495 count = info["count"]
445 try: 496 if count <= 0:
446 data.append({
447 'cluster': cluster_num,
448 'count': count,
449 'taxa_full': taxa,
450 'kingdom': taxa_levels[0],
451 'phylum': taxa_levels[1],
452 'class': taxa_levels[2],
453 'order': taxa_levels[3],
454 'family': taxa_levels[4],
455 'genus': taxa_levels[5],
456 'species': taxa_levels[6]
457 })
458 except IndexError as e:
459 # Skip entries with incomplete taxonomic data
460 print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}, error: {e}'")
461 continue 497 continue
462 498
463 df = pd.DataFrame(data) 499 taxa_levels = taxa.split(" / ") if taxa else []
464 # Write to Excel 500 while len(taxa_levels) < 7:
465 temp_path = output_file + ".xlsx" 501 taxa_levels.append("Unannotated")
502
503 seq_ids_str = ",".join(sorted(info["seq_ids"]))
504 sources_str = ",".join(sorted(info["sources"]))
505
506 raw_rows.append(
507 {
508 "cluster": cluster_index,
509 "count": count,
510 "seq_id": seq_ids_str,
511 "source": sources_str,
512 "taxa_full": taxa,
513 "kingdom": taxa_levels[0],
514 "phylum": taxa_levels[1],
515 "class": taxa_levels[2],
516 "order": taxa_levels[3],
517 "family": taxa_levels[4],
518 "genus": taxa_levels[5],
519 "species": taxa_levels[6],
520 }
521 )
522
523 if write_processed:
524 for cluster_index, cluster_data in enumerate(cluster_data_list):
525 taxa_map = cluster_data["taxa_map"]
526
527 annotated_support = sum(
528 info["count"]
529 for taxa, info in taxa_map.items()
530 if taxa != "Unannotated read"
531 )
532
533 if annotated_support < args.min_cluster_support:
534 continue
535
536 taxa_counts = {taxa: info["count"] for taxa, info in taxa_map.items()}
537 processed_groups = calculate_cluster_taxa(taxa_counts, args)
538
539 for group in processed_groups:
540 for taxa, count in group.items():
541 if count <= 0:
542 continue
543
544 if "Uncertain taxa / Uncertain taxa / Uncertain taxa" in taxa:
545 continue
546 else:
547 info = taxa_map.get(taxa)
548 seq_ids_set = info["seq_ids"]
549 sources_set = info["sources"]
550 taxa_full = taxa
551 taxa_levels = taxa.split(" / ") if taxa else []
552 while len(taxa_levels) < 7:
553 taxa_levels.append("Unannotated")
554
555 seq_ids_str = ",".join(sorted(seq_ids_set))
556 sources_str = ",".join(sorted(sources_set))
557 processed_rows.append(
558 {
559 "cluster": cluster_index,
560 "count": count,
561 "seq_id": seq_ids_str,
562 "source": sources_str,
563 "taxa_full": taxa_full,
564 "kingdom": taxa_levels[0],
565 "phylum": taxa_levels[1],
566 "class": taxa_levels[2],
567 "order": taxa_levels[3],
568 "family": taxa_levels[4],
569 "genus": taxa_levels[5],
570 "species": taxa_levels[6],
571 }
572 )
573
574 if not raw_rows and not processed_rows:
575 if log_messages is not None:
576 log_message(
577 log_messages,
578 "No taxa data to write; taxa Excel file not created.",
579 )
580 return
581
582 raw_df = pd.DataFrame(raw_rows) if raw_rows else None
583 processed_df = pd.DataFrame(processed_rows) if processed_rows else None
584
585 settings_data = [
586 ["uncertain_taxa_use_ratio", args.uncertain_taxa_use_ratio],
587 ["min_to_split", args.min_to_split],
588 ["min_count_to_split", args.min_count_to_split],
589 ["min_cluster_support", args.min_cluster_support]
590 ]
591 settings_df = pd.DataFrame(settings_data, columns=["Parameter", "Value"])
592
593 temp_path = output_file + ".tmp.xlsx"
466 os.makedirs(os.path.dirname(temp_path), exist_ok=True) 594 os.makedirs(os.path.dirname(temp_path), exist_ok=True)
467 with pd.ExcelWriter(temp_path, engine='openpyxl') as writer: 595 with pd.ExcelWriter(temp_path, engine="openpyxl") as writer:
468 df.to_excel(writer, sheet_name='Raw_Taxa_Clusters', index=False, engine='openpyxl') 596 if raw_df is not None:
469 os.replace(temp_path, output_file) 597 raw_df.to_excel(
470 598 writer, sheet_name="Raw_Taxa_Clusters", index=False
471 def write_taxa_processed_output(cluster_data_list, args, output_file): 599 )
472 """ 600 if processed_df is not None:
473 Write processed (potentially split) taxa information to an Excel file. 601 processed_df.to_excel(
474 602 writer, sheet_name="Processed_Taxa_Clusters", index=False
475 This file contains the resulting sub-clusters from the taxonomic dominance 603 )
476 analysis and a separate sheet documenting the parameters used. 604 settings_df.to_excel(writer, sheet_name="Settings", index=False)
477 605
478 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
479 :type cluster_data_list: list[tuple]
480 :param args: Parsed script arguments, used for taxa calculation and settings documentation.
481 :type args: argparse.Namespace
482 :param output_file: Path to the output Excel file.
483 :type output_file: str
484 :return: None
485 :rtype: None
486 """
487 # Create main dataframe
488 data = []
489 for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
490 processed_taxa = calculate_cluster_taxa(taxa_dict, args)
491
492 for taxa_group in processed_taxa:
493 for taxa, count in taxa_group.items():
494 if 'Uncertain taxa / Uncertain taxa / Uncertain taxa' in taxa:
495 if args.show_unannotated_clusters:
496 data.append({
497 'cluster': cluster_num,
498 'count': count,
499 'taxa_full': 'Unannotated read',
500 'kingdom': 'Unannotated',
501 'phylum': 'Unannotated',
502 'class': 'Unannotated',
503 'order': 'Unannotated',
504 'family': 'Unannotated',
505 'genus': 'Unannotated',
506 'species': 'Unannotated'
507 })
508 else:
509 # Split taxa into taxonomic levels
510 taxa_levels = taxa.split(' / ') if taxa else []
511
512 try:
513 data.append({
514 'cluster': cluster_num,
515 'count': count,
516 'taxa_full': taxa,
517 'kingdom': taxa_levels[0],
518 'phylum': taxa_levels[1],
519 'class': taxa_levels[2],
520 'order': taxa_levels[3],
521 'family': taxa_levels[4],
522 'genus': taxa_levels[5],
523 'species': taxa_levels[6]
524 })
525 except IndexError:
526 # Skip entries with incomplete taxonomic data
527 print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}'")
528 continue
529
530 df = pd.DataFrame(data)
531
532 # Create settings dataframe
533 settings_data = [
534 ['uncertain_taxa_use_ratio', args.uncertain_taxa_use_ratio],
535 ['min_to_split', args.min_to_split],
536 ['min_count_to_split', args.min_count_to_split]
537 ]
538 settings_df = pd.DataFrame(settings_data, columns=['Parameter', 'Value'])
539
540 # Write to Excel with multiple sheets
541 temp_path = output_file + ".xlsx"
542 os.makedirs(os.path.dirname(temp_path), exist_ok=True)
543 with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
544 df.to_excel(writer, sheet_name='Processed_Taxa_Clusters', index=False, engine='openpyxl')
545 settings_df.to_excel(writer, sheet_name='Settings', index=False, engine='openpyxl')
546
547 # Auto-adjust column widths for better readability
548 for sheet_name in writer.sheets: 606 for sheet_name in writer.sheets:
549 worksheet = writer.sheets[sheet_name] 607 worksheet = writer.sheets[sheet_name]
550 for column in worksheet.columns: 608 for column in worksheet.columns:
551 max_length = 0 609 max_length = 0
552 column_letter = column[0].column_letter 610 column_letter = column[0].column_letter
553 for cell in column: 611 for cell in column:
554 try: 612 try:
555 if len(str(cell.value)) > max_length: 613 cell_len = len(str(cell.value))
556 max_length = len(str(cell.value)) 614 if cell_len > max_length:
557 except: 615 max_length = cell_len
616 except AttributeError:
558 pass 617 pass
559 adjusted_width = min(max_length + 2, 50) # Cap at 50 characters 618 adjusted_width = min(max_length + 2, 50)
560 worksheet.column_dimensions[column_letter].width = adjusted_width 619 worksheet.column_dimensions[column_letter].width = adjusted_width
620
561 os.replace(temp_path, output_file) 621 os.replace(temp_path, output_file)
562 622
563 def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file): 623 if log_messages is not None:
564 """ 624 if raw_df is not None:
565 Create a bar plot showing the distribution of intra-cluster similarity values. 625 log_message(log_messages, "Raw taxa per cluster written successfully")
566 626 if processed_df is not None:
567 The plot uses different colors to distinguish reads belonging to different clusters. 627 log_message(
568 628 log_messages,
569 :param all_simi_data: List of all similarity values, sorted descending. 629 "Processed taxa (split clusters) written successfully",
630 )
631
632
633 def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file, log_messages=None):
634 """Create and save a similarity distribution bar plot.
635
636 Bars are colored per cluster using a fixed repeating colormap.
637
638 :param all_simi_data: All similarity values across clusters.
570 :type all_simi_data: list[float] 639 :type all_simi_data: list[float]
571 :param cluster_simi_lengths: List of lengths of similarity data per cluster, used for coloring. 640 :param cluster_simi_lengths: Per-cluster similarity list lengths.
572 :type cluster_simi_lengths: list[int] 641 :type cluster_simi_lengths: list[int]
573 :param args: Parsed script arguments, used for plot y-limits. 642 :param args: Namespace containing y-axis plot limits.
574 :type args: argparse.Namespace 643 :type args: argparse.Namespace
575 :param output_file: Path to the output plot file (e.g., .png). 644 :param output_file: Output PNG file path.
576 :type output_file: str 645 :type output_file: str
646 :param log_messages: Optional log message list.
647 :type log_messages: list[str] or None
577 :return: None 648 :return: None
578 :rtype: None 649 :rtype: None
579 """ 650 """
580 if not all_simi_data: 651 if not all_simi_data:
652 if log_messages is not None:
653 log_message(log_messages, "No similarity data; similarity plot not created.")
581 return 654 return
582 655
583 sorted_simi_list = sorted(all_simi_data, reverse=True) 656 sorted_simi_list = sorted(all_simi_data, reverse=True)
584 bar_positions = list(range(len(sorted_simi_list))) 657 bar_positions = list(range(len(sorted_simi_list)))
585 658
586 # Create colormap for different clusters
587 colormap_full = [] 659 colormap_full = []
588 for i, length in enumerate(cluster_simi_lengths): 660 for i, length in enumerate(cluster_simi_lengths):
589 color = COLORMAP[i % len(COLORMAP)] 661 color = COLORMAP[i % len(COLORMAP)]
590 colormap_full.extend([color] * length) 662 colormap_full.extend([color] * length)
591 663
592 plt.figure(figsize=(12, 6)) 664 plt.figure(figsize=(12, 6))
593 plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full) 665 plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full)
594 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) 666 plt.grid(axis="y", linestyle="--", color="gray", alpha=0.7)
595 plt.ylabel("Similarity (%)") 667 plt.ylabel("Similarity (%)")
596 plt.xlabel("Reads") 668 plt.xlabel("Reads")
597 plt.title("Intra-cluster Similarity Distribution") 669 plt.title("Intra-cluster Similarity Distribution")
598 plt.ylim(ymin=args.simi_plot_y_min, ymax=args.simi_plot_y_max) 670 plt.ylim(ymin=args.simi_plot_y_min, ymax=args.simi_plot_y_max)
599 plt.tight_layout() 671 plt.tight_layout()
600 672
601 plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight') 673 plt.savefig(output_file, format="png", dpi=300, bbox_inches="tight")
602 plt.close() 674 plt.close()
603 675
604 676 if log_messages is not None:
605 def create_evalue_plot(all_eval_data, cluster_eval_lengths, output_file): 677 log_message(log_messages, "Similarity plot written successfully")
606 """ 678
607 Create a bar plot showing the distribution of E-values. 679
608 680 def summarize_and_log(cluster_data_list, all_simi_data, log_messages, args):
609 The y-axis is log-scaled and displays ``1/E-values``. Reads are ordered 681 """Compute global summary statistics and append them to the log.
610 by E-value (ascending). The plot uses different colors to distinguish reads 682
611 belonging to different clusters. 683 Summary includes:
612 684 - total clusters
613 :param all_eval_data: List of E-values from all reads. Assumes E-values start at index 1. 685 - total annotated/unannotated reads
614 :type all_eval_data: list[float | int] 686 - per-cluster annotation presence
615 :param cluster_eval_lengths: List of lengths of annotated E-value data per cluster, used for coloring. 687 - top taxa distribution
616 :type cluster_eval_lengths: list[int] 688 - similarity mean and standard deviation
617 :param output_file: Path to the output plot file (e.g., .png). 689
618 :type output_file: str 690 :param cluster_data_list: List of processed cluster descriptors.
691 :type cluster_data_list: list[dict]
692 :param all_simi_data: All similarity values from all clusters.
693 :type all_simi_data: list[float]
694 :param log_messages: List collecting log output lines.
695 :type log_messages: list[str]
696 :param args: Argument namespace with configuration parameters.
697 :type args: argparse.Namespace
619 :return: None 698 :return: None
620 :rtype: None 699 :rtype: None
621 """ 700 """
622 if len(all_eval_data) <= 1: # Only unannotated reads 701 total_clusters = len(cluster_data_list)
623 return 702 total_annotated = sum(c["annotated"] for c in cluster_data_list)
624 703 total_unannotated = sum(c["unannotated"] for c in cluster_data_list)
625 sorted_eval_list = sorted(all_eval_data[1:]) # Skip unannotated count 704 grand_total = total_annotated + total_unannotated
626 705
627 if not sorted_eval_list: 706 clusters_with_annotations = sum(
628 return 707 1 for c in cluster_data_list if c["annotated"] > 0
629 708 )
630 bar_positions = list(range(len(sorted_eval_list))) 709 clusters_unannotated_only = total_clusters - clusters_with_annotations
631 bar_heights = [1 / e if e > 0 else 0 for e in sorted_eval_list] 710
632 711 log_message(log_messages, "=== SUMMARY ===")
633 # Create colormap for different clusters 712 log_message(log_messages, f"Clusters parsed: {total_clusters}")
634 colormap_full = [] 713 log_message(log_messages, f"Total reads: {grand_total}")
635 for i, length in enumerate(cluster_eval_lengths): 714 log_message(log_messages, f"Annotated reads: {total_annotated}")
636 color = COLORMAP[i % len(COLORMAP)] 715 log_message(log_messages, f"Unannotated reads: {total_unannotated}")
637 colormap_full.extend([color] * length) 716 log_message(log_messages, f"Clusters with annotations: {clusters_with_annotations}")
638 717 log_message(log_messages, f"Clusters fully unannotated: {clusters_unannotated_only}")
639 plt.figure(figsize=(12, 6)) 718 log_message(log_messages, f"Minimum cluster support for processed taxa: {args.min_cluster_support}")
640 plt.bar(bar_positions, bar_heights, width=1, color=colormap_full) 719
641 plt.yscale('log') 720 taxa_counter = Counter()
642 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) 721 for c in cluster_data_list:
643 plt.ylabel("1/E-values") 722 for taxa, info in c["taxa_map"].items():
644 plt.xlabel("Reads") 723 if taxa == "Unannotated read":
645 plt.title("E-value Distribution") 724 continue
646 plt.tight_layout() 725 taxa_counter[taxa] += info["count"]
647 726
648 plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight') 727 if taxa_counter:
649 plt.close() 728 log_message(log_messages, "=== TAXA SUMMARY (top 10) ===")
650 729 for taxa, count in taxa_counter.most_common(10):
651 def prepare_evalue_histogram(evalue_list, unannotated_list): 730 log_message(log_messages, f"{taxa}: {count} reads")
652 """ 731 log_message(log_messages, f"Total unique taxa: {len(taxa_counter)}")
653 Generate histogram data for E-value distributions. 732 else:
654 733 log_message(log_messages, "No annotated taxa found.")
655 This function processes a list of E-values from BLAST or similar search 734
656 results, filters out invalid or zero entries, and computes histogram data 735 if all_simi_data:
657 suitable for plotting. The histogram represents the frequency distribution 736 avg = sum(all_simi_data) / len(all_simi_data)
658 of E-values across all annotated hits. 737 variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data)
659 738 st_dev = sqrt(variance)
660 :param evalue_list: List of E-values from BLAST hits 739 log_message(log_messages, "=== SIMILARITY SUMMARY ===")
661 :type evalue_list: list[float | int] 740 log_message(log_messages, f"Mean similarity: {avg:.2f}")
662 :param unannotated_list: List of unannotated E-values 741 log_message(log_messages, f"Std similarity: {st_dev:.2f}")
663 :type unannotated_list: list 742 else:
664 :return: Tuple containing: 743 log_message(log_messages, "No similarity values available for summary.")
665 - **counts** (*numpy.ndarray*): Number of entries per histogram bin. 744
666 - **bins** (*numpy.ndarray*): Bin edges corresponding to the histogram.
667 Returns ``(None, None)`` if no valid data is available.
668 :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None]
669 :note:
670 - Only positive numeric E-values are included in the histogram.
671 - Uses 50 bins in the range (0, 1) for visualization consistency.
672 """
673 data = [ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0]
674 if not data:
675 return None, None
676
677 counts, bins, _ = plt.hist(data, bins=50, range=(0, 1))
678 plt.close()
679 return counts, bins
680
681 def create_evalue_plot_test(evalue_list, unannotated_list, output_file):
682 """
683 Create and save an E-value distribution plot, returning the computed histogram data.
684
685 This function visualizes the frequency distribution of E-values from BLAST or
686 annotation results. It saves the plot to the specified output file and returns
687 the histogram data (counts and bins) for testing with pytests.
688
689 :param evalue_list: List of numeric E-values to plot
690 :type evalue_list: list[float | int]
691 :param unannotated_list: Optional list of E-values for unannotated sequences.
692 :type unannotated_list: list
693 :param output_file: Path where the histogram image will be saved.
694 :type output_file: str
695
696 :return: Tuple containing:
697 - **counts** (*numpy.ndarray*): Frequency counts per histogram bin.
698 - **bins** (*numpy.ndarray*): Histogram bin edges.
699 Returns ``(None, None)`` if no valid data was available for plotting.
700 :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None]
701 """
702 counts, bins = prepare_evalue_histogram(evalue_list, unannotated_list)
703 if counts is None:
704 return None, None
705
706 plt.hist([ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0],
707 bins=50, range=(0, 1))
708 plt.xlabel("E-value")
709 plt.ylabel("Frequency")
710 plt.title("E-value Distribution")
711 plt.savefig(output_file)
712 plt.close()
713 return counts, bins
714 745
715 746
716 def main(arg_list=None): 747 def main(arg_list=None):
717 """ 748 """Main entry point for CD-HIT cluster processing.
718 Main entry point of the script. 749
719 750 Orchestrates parsing, cluster processing, statistic aggregation,
720 Parses arguments, processes cd-hit cluster data, aggregates results, 751 visualization, and generation of all requested outputs.
721 and generates requested outputs (text summaries, plots, and Excel reports). 752
722 753 :param arg_list: Optional list of arguments (used by tests).
723 :param arg_list: List of arguments for testing purposes. 754 :type arg_list: list[str] or None
724 :type arg_list: list, optional
725 :return: None 755 :return: None
726 :rtype: None 756 :rtype: None
727 """ 757 """
728 args = parse_arguments(arg_list) 758 args = parse_arguments(arg_list)
729 # Parse cluster file 759 log_messages: list[str] = []
760 log_message(log_messages, "=== CD-HIT cluster processing started ===")
761
762 log_message(log_messages, "cluster_file: provided")
763 log_message(
764 log_messages,
765 "annotation_file: provided" if args.input_annotation else "annotation_file: none",
766 )
767
768 log_message(log_messages, "=== PARAMETERS ===")
769 skip_keys = {
770 "output_similarity_txt",
771 "output_similarity_plot",
772 "output_count",
773 "output_excel",
774 "input_cluster",
775 "input_annotation",
776 "log_file",
777 }
778 for key, value in vars(args).items():
779 if key in skip_keys:
780 continue
781 log_message(log_messages, f"{key}: {value}")
782 log_message(log_messages, "=== PARAMETERS END ===")
783
730 clusters = parse_cluster_file( 784 clusters = parse_cluster_file(
731 args.input_cluster, 785 args.input_cluster,
732 args.input_annotation, 786 args.input_annotation,
733 args.print_empty_files 787 raise_on_error=False,
788 log_messages=log_messages,
734 ) 789 )
735 # Process each cluster 790
736 all_eval_data = [0] # For full sample statistics
737 all_simi_data = []
738 cluster_eval_lengths = []
739 cluster_simi_lengths = []
740 cluster_data_list = [] 791 cluster_data_list = []
792 all_simi_data: list[float] = []
793 cluster_simi_lengths: list[int] = []
741 794
742 for cluster in clusters: 795 for cluster in clusters:
743 eval_list, simi_list, taxa_dict = process_cluster_data(cluster) 796 simi_list, taxa_map, annotated_count, unannotated_count = process_cluster_data(
744 cluster_data_list.append((eval_list, simi_list, taxa_dict)) 797 cluster
745 # Collect data for full sample plots 798 )
746 all_eval_data[0] += eval_list[0] 799 cluster_data = {
747 if len(eval_list) > 1: 800 "similarities": simi_list,
748 all_eval_data.extend(sorted(eval_list[1:])) 801 "taxa_map": taxa_map,
749 cluster_eval_lengths.append(len(eval_list[1:])) 802 "annotated": annotated_count,
803 "unannotated": unannotated_count,
804 }
805 cluster_data_list.append(cluster_data)
750 806
751 if simi_list: 807 if simi_list:
752 all_simi_data.extend(sorted(simi_list, reverse=True)) 808 all_simi_data.extend(simi_list)
753 cluster_simi_lengths.append(len(simi_list)) 809 cluster_simi_lengths.append(len(simi_list))
754 810
755 # Generate outputs based on what was requested
756 if args.output_similarity_txt: 811 if args.output_similarity_txt:
757 write_similarity_output(all_simi_data, args.output_similarity_txt) 812 write_similarity_output(
758 813 cluster_data_list, args.output_similarity_txt, log_messages)
759 if args.output_similarity_plot and all_simi_data: 814
760 create_similarity_plot(all_simi_data, cluster_simi_lengths, args, args.output_similarity_plot) 815 if args.output_similarity_plot:
761 816 create_similarity_plot(
762 if args.output_evalue_txt: 817 all_simi_data,
763 write_evalue_output(all_eval_data, args.output_evalue_txt) 818 cluster_simi_lengths,
764 819 args,
765 if args.output_evalue_plot and len(all_eval_data) > 1: 820 args.output_similarity_plot,
766 create_evalue_plot(all_eval_data, cluster_eval_lengths, args.output_evalue_plot) 821 log_messages)
767 822
768 if args.output_count: 823 if args.output_count:
769 write_count_output(all_eval_data, cluster_data_list, args.output_count) 824 write_count_output(cluster_data_list, args.output_count, log_messages)
770 825
771 if args.output_taxa_clusters: 826 if args.output_excel:
772 write_taxa_clusters_output(cluster_data_list, args.output_taxa_clusters) 827 write_raw = bool(args.output_taxa_clusters)
773 828 write_processed = bool(args.output_taxa_processed)
774 if args.output_taxa_processed: 829
775 write_taxa_processed_output(cluster_data_list, args, args.output_taxa_processed) 830 if not write_raw and not write_processed:
776 831 if log_messages is not None:
777 print(f"Processing complete. Processed {len(clusters)} clusters.") 832 log_message(log_messages, "output_excel provided but no taxa sheet flags set; no Excel file written.")
833 else:
834 write_taxa_excel(cluster_data_list,
835 args,
836 args.output_excel,
837 write_raw=write_raw,
838 write_processed=write_processed,
839 log_messages=log_messages)
840 else:
841 if args.output_taxa_clusters or args.output_taxa_processed:
842 if log_messages is not None:
843 log_message(
844 log_messages,
845 "WARNING: Raw/processed taxa output flags set but no --output_excel path provided; skipping Excel output."
846 )
847
848 summarize_and_log(cluster_data_list, all_simi_data, log_messages, args)
849 log_message(log_messages, "=== CD-HIT cluster processing finished ===")
850
851 if args.log_file:
852 os.makedirs(os.path.dirname(args.log_file) or ".", exist_ok=True)
853 with open(args.log_file, "w") as f:
854 f.write("\n".join(log_messages))
778 855
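When ``--log_file`` is given, the collected messages are written as a plain-text trace along these lines (values illustrative, elided with ``...``):

    === CD-HIT cluster processing started ===
    cluster_file: provided
    annotation_file: provided
    === PARAMETERS ===
    simi_plot_y_min: 95.0
    ...
    === PARAMETERS END ===
    Loaded 1200 annotated headers
    Parsed 42 clusters
    ...
    === SUMMARY ===
    Clusters parsed: 42
    ...
    === CD-HIT cluster processing finished ===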
779 856
780 if __name__ == "__main__": 857 if __name__ == "__main__":
781 main() 858 main()