cdhit_analysis.py @ 0:00d56396b32a draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool
commit c944fd5685f295acba06679e85b67973c173b137
author: onnodg
date: Tue, 14 Oct 2025 09:09:46 +0000
#!/usr/bin/env python3

import argparse
import os
import re
from collections import Counter, defaultdict
from math import sqrt

import pandas as pd
import matplotlib

matplotlib.use('Agg')  # Non-interactive backend for Galaxy
import matplotlib.pyplot as plt

14 """
15 This script processes cluster output files from cd-hit-est for use in Galaxy.
16 It extracts cluster information, associates taxa and e-values from annotation files,
17 performs statistical calculations, and generates text and plot outputs
18 summarizing similarity and taxonomic distributions.
19
20
21 Main steps:
22 1. Parse cd-hit-est cluster file and (optional) annotation file.
23 2. Process each cluster to extract similarity, taxa, and e-value information.
24 3. Aggregate results across clusters.
25 4. Generate requested outputs: text summaries, plots, and Excel reports.
26
27
28 Note: Uses a non-interactive matplotlib backend (Agg) for compatibility with Galaxy.
29 """
30
31
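
# Example invocation (file names are illustrative, not part of the tool):
#   python cdhit_analysis.py \
#       --input_cluster clusters.clstr \
#       --input_annotation annotations.xlsx \
#       --output_similarity_txt similarity.txt \
#       --output_similarity_plot similarity.png
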
def parse_arguments(args_list=None):
    """Parse command-line arguments for the script."""
    parser = argparse.ArgumentParser(
        description='Create taxa analysis from cd-hit cluster files')
    parser.add_argument('--input_cluster', type=str, required=True,
                        help='Input cluster file (.clstr)')
    parser.add_argument('--input_annotation', type=str, required=False,
                        help='Input annotation file (.out)')

    # Galaxy output files
    parser.add_argument('--output_similarity_txt', type=str,
                        help='Similarity text output file')
    parser.add_argument('--output_similarity_plot', type=str,
                        help='Similarity plot output file')
    parser.add_argument('--output_evalue_txt', type=str,
                        help='E-value text output file')
    parser.add_argument('--output_evalue_plot', type=str,
                        help='E-value plot output file')
    parser.add_argument('--output_count', type=str,
                        help='Count summary output file')
    parser.add_argument('--output_taxa_clusters', type=str,
                        help='Taxa per cluster output file')
    parser.add_argument('--output_taxa_processed', type=str,
                        help='Processed taxa output file')

    # Plot parameters
    parser.add_argument('--simi_plot_y_min', type=float, default=95.0,
                        help='Minimum value of the y-axis in the similarity plot')
    parser.add_argument('--simi_plot_y_max', type=float, default=100.0,
                        help='Maximum value of the y-axis in the similarity plot')

    # Uncertain taxa configuration
    parser.add_argument('--uncertain_taxa_use_ratio', type=float, default=0.5,
                        help='Ratio at which uncertain taxa count toward the correct taxa')
    parser.add_argument('--min_to_split', type=float, default=0.45,
                        help='Minimum percentage for a taxonomic split')
    parser.add_argument('--min_count_to_split', type=int, default=10,
                        help='Minimum count for a taxonomic split')

    # Processing options
    parser.add_argument('--show_unannotated_clusters', action='store_true', default=False,
                        help='Show unannotated clusters in the output')
    parser.add_argument('--make_taxa_in_cluster_split', action='store_true', default=False,
                        help='Split clusters with multiple taxa')
    parser.add_argument('--print_empty_files', action='store_true', default=False,
                        help='Print messages about empty annotation files')

    return parser.parse_args(args_list)

# Color map for plots: RGBA tuples for bar coloring (matplotlib's tab10 palette)
COLORMAP = [
    (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0),
    (1.0, 0.4980392156862745, 0.054901960784313725, 1.0),
    (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0),
    (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0),
    (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0),
    (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0),
    (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0),
    (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0),
    (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0),
    (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0)
]

def parse_cluster_file(cluster_file, annotation_file=None, print_empty=False, raise_on_error=False):
    """
    Parse the cd-hit-est cluster file (.clstr) and (optionally) an Excel annotation file.

    It extracts cluster information (header, read count, similarity) and associates
    taxonomic information and E-values from the annotation file based on the read header.

    :param cluster_file: Path to cd-hit cluster file (.clstr).
    :type cluster_file: str
    :param annotation_file: Path to Excel annotation file with taxa and e-values.
    :type annotation_file: str, optional
    :param print_empty: Print a message if the annotation file is not found or empty.
    :type print_empty: bool
    :param raise_on_error: Raise parsing errors instead of printing warnings.
    :type raise_on_error: bool
    :return: List of clusters, where each cluster is a dict mapping read header to a dict of read info.
    :rtype: list[dict[str, dict]]
    :raises ValueError: If similarity cannot be parsed from a cluster line.
    :raises UnboundLocalError: If an error occurs during annotation processing.
    """
    clusters = []
    current_cluster = {}

    # Load annotations if provided
    annotations = {}
    if annotation_file and os.path.exists(annotation_file):
        # Read the Excel file
        df = pd.read_excel(annotation_file, sheet_name='Individual_Reads', engine='openpyxl')

        # Iterate over the rows
        for _, row in df.iterrows():
            header = row['header']   # column name as it appears in the Excel sheet
            evalue = row['e_value']  # E-value column
            taxa = row['taxa']       # full taxonomic string

            annotations[header] = {'evalue': evalue, 'taxa': taxa}
    elif annotation_file and print_empty:
        print(f"Annotation file {annotation_file} not found or empty")

    with open(cluster_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>Cluster'):
                # Start of a new cluster; save the previous one if it exists
                if current_cluster:
                    clusters.append(current_cluster)
                current_cluster = {}
            else:
                # Parse a sequence line
                parts = line.split()
                if len(parts) >= 2:
                    # Extract header and count
                    header_part = parts[2].strip('>.')
                    header_parts = header_part.split('(')
                    if len(header_parts) > 1:
                        last_part = header_parts[-1].strip(')')
                        header = header_parts[0]
                        if last_part:
                            count = int(last_part)
                        else:
                            print('no count')
                            count = 0
                            header = ''
                    # Extract similarity
                    similarity_part = parts[-1].strip()
                    if '*' in similarity_part:
                        similarity = 100.0  # Representative sequence
                    else:
                        matches = re.findall(r'[\d.]+', similarity_part)
                        if matches:
                            similarity = float(matches[-1])
                        else:
                            raise ValueError(f"Could not parse similarity from: '{similarity_part}'")
                    # Get annotation info. If the header could not be parsed above,
                    # 'header'/'count' are unbound and the except clause reports it.
                    try:
                        if header in annotations:
                            taxa = annotations[header]['taxa']
                            evalue = annotations[header]['evalue']
                        else:
                            taxa = 'Unannotated read'
                            evalue = 'Unannotated read'

                        current_cluster[header] = {
                            'count': count,
                            'similarity': similarity,
                            'taxa': taxa,
                            'evalue': evalue
                        }
                    except UnboundLocalError as e:
                        if raise_on_error:
                            raise UnboundLocalError(str(e))
                        print(f"Error: {e}, No annotations found")

    # Add the last cluster
    if current_cluster:
        clusters.append(current_cluster)

    return clusters
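
# For reference, a cd-hit-est .clstr file looks roughly like this (read names
# and counts are illustrative; the trailing "(N)" is the per-read abundance
# convention this parser expects in the header):
#
#   >Cluster 0
#   0   451nt, >read_1(120)... *
#   1   450nt, >read_2(35)... at +/99.33%
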
def process_cluster_data(cluster):
    """
    Process a single cluster to extract E-value, similarity, and taxa data for all reads.

    Aggregates information from all reads in the cluster, storing read counts,
    E-values, similarities, and taxa in lists and a dictionary.

    :param cluster: Cluster data mapping read headers to read info.
    :type cluster: dict
    :return: A tuple containing: (list of E-values, list of similarity values, dict of taxa -> counts).
        The first element of the E-value list ([0]) stores the count of unannotated reads.
    :rtype: tuple[list[float | int], list[float], dict[str, int]]
    """
    eval_list = [0.0]  # First element holds the unannotated-read count
    simi_list = []
    taxa_dict = {'Unannotated read': 0}

    for header, info in cluster.items():
        count = info['count']
        similarity = info['similarity']
        taxa = info['taxa']
        evalue = info['evalue']

        if evalue == 'Unannotated read':
            eval_list[0] += count
            taxa_dict['Unannotated read'] += count
        else:
            try:
                eval_val = float(evalue)
                for _ in range(count):
                    eval_list.append(eval_val)
            except ValueError:
                eval_list[0] += count

            if taxa not in taxa_dict:
                taxa_dict[taxa] = 0
            taxa_dict[taxa] += count

        # Add similarity values
        for _ in range(count):
            simi_list.append(similarity)

    return eval_list, simi_list, taxa_dict
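
# Illustrative example (made-up reads): a cluster with one annotated read
# (count 3, e-value 1e-50, similarity 100.0) and one unannotated read
# (count 2, similarity 98.5) yields:
#   eval_list -> [2.0, 1e-50, 1e-50, 1e-50]
#   simi_list -> [100.0, 100.0, 100.0, 98.5, 98.5]
#   taxa_dict -> {'Unannotated read': 2, '<full taxa string>': 3}
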
def calculate_cluster_taxa(taxa_dict, args):
    """
    Calculate the most likely taxa for a cluster based on read counts.

    This function applies the 'uncertain taxa use ratio' for unannotated reads
    and uses a recursive approach to potentially split a cluster into sub-clusters
    if taxonomic dominance is not strong enough (based on ``min_to_split``
    and ``min_count_to_split``).

    :param taxa_dict: Mapping of taxa (full string) -> read counts.
    :type taxa_dict: dict[str, int]
    :param args: Parsed script arguments, including parameters for taxa calculation.
    :type args: argparse.Namespace
    :return: A list of refined taxa assignments (dictionaries), where each dictionary
        represents a potentially split sub-cluster.
    :rtype: list[dict[str, float | int]]
    """
    # Replace 'Unannotated read' with the uncertain-taxa format
    processed_dict = {}
    for taxa, count in taxa_dict.items():
        if taxa == 'Unannotated read':
            uncertain_taxa = ' / '.join(['Uncertain taxa'] * 7)
            processed_dict[uncertain_taxa] = count
        else:
            processed_dict[taxa] = count

    return _recursive_taxa_calculation(processed_dict, args, 0)

def _recursive_taxa_calculation(taxa_dict, args, level):
    """
    Recursive helper to calculate and potentially split taxa at each taxonomic level.

    :param taxa_dict: Taxa counts at the current level (or sub-group).
    :type taxa_dict: dict[str, float | int]
    :param args: Parsed script arguments.
    :type args: argparse.Namespace
    :param level: Index of the current taxonomic level (0=kingdom, max 6=species).
    :type level: int
    :return: List of refined taxa dictionaries.
    :rtype: list[dict[str, float | int]]
    """
    if level >= 7:  # Max 7 taxonomic levels
        return [taxa_dict]

    level_dict = defaultdict(float)

    # Group by taxonomic level
    for taxa, count in taxa_dict.items():
        taxa_parts = taxa.split(' / ')
        if level < len(taxa_parts):
            level_taxon = taxa_parts[level]
            if level_taxon == 'Uncertain taxa':
                level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio
            else:
                level_dict[level_taxon] += count

    if len(level_dict) <= 1:
        # Only one taxon at this level, continue to the next level
        return _recursive_taxa_calculation(taxa_dict, args, level + 1)

    # Sort by abundance
    sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True)

    result = []
    dominant_taxon = sorted_taxa[0][0]

    # Check whether any secondary taxon should be split off
    for i in range(1, len(sorted_taxa)):
        secondary_taxon = sorted_taxa[i][0]
        total_count = sorted_taxa[0][1] + sorted_taxa[i][1]
        ratio = sorted_taxa[i][1] / total_count

        if ratio >= args.min_to_split or total_count <= args.min_count_to_split:
            # Split off this taxon (guard against taxa strings with fewer levels)
            split_dict = {taxa: count for taxa, count in taxa_dict.items()
                          if level < len(taxa.split(' / '))
                          and taxa.split(' / ')[level] == secondary_taxon}
            result.extend(_recursive_taxa_calculation(split_dict, args, level + 1))

    # Process the dominant group
    dominant_dict = {taxa: count for taxa, count in taxa_dict.items()
                     if level < len(taxa.split(' / '))
                     and taxa.split(' / ')[level] == dominant_taxon}
    result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1))

    return result
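
# Worked example (hypothetical counts): suppose a level groups to
# {'Aves': 60, 'Mammalia': 50}. With the default min_to_split=0.45 the
# secondary ratio is 50 / (60 + 50) ~= 0.455 >= 0.45, so the 'Mammalia'
# reads are split into their own sub-cluster and both groups are then
# refined independently at the deeper taxonomic levels.
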
def write_similarity_output(all_simi_data, output_file):
    """
    Write the similarity text output, including the mean and standard deviation,
    and a count per similarity percentage.

    :param all_simi_data: List of all similarity percentages from all reads.
    :type all_simi_data: list[float]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    if not all_simi_data:
        return

    avg = sum(all_simi_data) / len(all_simi_data)
    variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data)
    st_dev = sqrt(variance)

    simi_counter = Counter(all_simi_data)
    simi_sorted = sorted(simi_counter.items(), key=lambda x: -x[0])

    with open(output_file, 'w') as f:
        f.write(f"# Average similarity: {avg:.2f}\n")
        f.write(f"# Standard deviation: {st_dev:.2f}\n")
        f.write("similarity\tcount\n")
        for similarity, count in simi_sorted:
            f.write(f"{similarity}\t{count}\n")
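
# The resulting text file is tab-separated and looks like this (values are
# illustrative):
#   # Average similarity: 99.12
#   # Standard deviation: 0.84
#   similarity  count
#   100.0       523
#   99.33       41
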
def write_evalue_output(all_eval_data, output_file):
    """
    Write the E-value text output, including the count of unannotated reads
    and a count per E-value.

    :param all_eval_data: List of E-values from all reads. The first element ([0]) is the count of unannotated reads.
    :type all_eval_data: list[float | int]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    unanno_count = all_eval_data[0]
    eval_counter = Counter(all_eval_data[1:])

    with open(output_file, 'w') as f:
        f.write("evalue\tcount\n")
        if unanno_count > 0:
            f.write(f"unannotated\t{unanno_count}\n")

        # Sort by count (descending), then by E-value (ascending)
        eval_sorted = sorted(eval_counter.items(),
                             key=lambda x: (-x[1], float(x[0]) if isinstance(x[0], (int, float)) else float('inf')))
        for value, count in eval_sorted:
            f.write(f"{value}\t{count}\n")

def write_count_output(all_eval_data, cluster_data_list, output_file):
    """
    Write a summary of annotated and unannotated read counts per cluster
    and for the total sample.

    :param all_eval_data: List of E-values from all reads for the total count summary.
    :type all_eval_data: list[float | int]
    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    with open(output_file, 'w') as f:
        f.write("cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n")

        for cluster_num, (eval_list, _, _) in enumerate(cluster_data_list):
            unannotated = eval_list[0]
            annotated = len(eval_list[1:])
            total = unannotated + annotated

            if total > 0:
                perc_annotated = (annotated / total) * 100
                perc_unannotated = (unannotated / total) * 100
            else:
                perc_annotated = perc_unannotated = 0

            f.write(
                f"{cluster_num}\t{unannotated}\t{annotated}\t{total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n")

        # Add the full-sample summary
        total_unannotated = all_eval_data[0]
        total_annotated = len(all_eval_data[1:])
        grand_total = total_unannotated + total_annotated

        if grand_total > 0:
            perc_annotated = (total_annotated / grand_total) * 100
            perc_unannotated = (total_unannotated / grand_total) * 100
        else:
            perc_annotated = perc_unannotated = 0

        f.write(
            f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n")
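
# Example output rows (tab-separated, values illustrative):
#   cluster  unannotated  annotated  total  perc_unannotated  perc_annotated
#   0        2            3          5      40.00             60.00
#   TOTAL    2            3          5      40.00             60.00
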
def write_taxa_clusters_output(cluster_data_list, output_file):
    """
    Write raw taxa information per cluster to an Excel file.

    Each row contains the cluster number, read count, the full taxa string,
    and the separate taxonomic levels (kingdom through species).

    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param output_file: Path to the output Excel file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    # Create the main dataframe
    data = []
    for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
        for taxa, count in taxa_dict.items():
            if count > 0:
                # Split taxa into taxonomic levels, padding to 7 levels
                taxa_levels = taxa.split(' / ') if taxa else []
                taxa_levels += ['Unannotated read'] * (7 - len(taxa_levels))
                try:
                    data.append({
                        'cluster': cluster_num,
                        'count': count,
                        'taxa_full': taxa,
                        'kingdom': taxa_levels[0],
                        'phylum': taxa_levels[1],
                        'class': taxa_levels[2],
                        'order': taxa_levels[3],
                        'family': taxa_levels[4],
                        'genus': taxa_levels[5],
                        'species': taxa_levels[6]
                    })
                except IndexError as e:
                    # Skip entries with incomplete taxonomic data
                    print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}', error: {e}")
                    continue

    df = pd.DataFrame(data)
    # Write to a temporary Excel file, then move it into place
    temp_path = output_file + ".xlsx"
    out_dir = os.path.dirname(temp_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Raw_Taxa_Clusters', index=False)
    os.replace(temp_path, output_file)

def write_taxa_processed_output(cluster_data_list, args, output_file):
    """
    Write processed (potentially split) taxa information to an Excel file.

    This file contains the resulting sub-clusters from the taxonomic dominance
    analysis and a separate sheet documenting the parameters used.

    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param args: Parsed script arguments, used for taxa calculation and settings documentation.
    :type args: argparse.Namespace
    :param output_file: Path to the output Excel file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    # Create the main dataframe
    data = []
    for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
        processed_taxa = calculate_cluster_taxa(taxa_dict, args)

        for taxa_group in processed_taxa:
            for taxa, count in taxa_group.items():
                if 'Uncertain taxa / Uncertain taxa / Uncertain taxa' in taxa:
                    if args.show_unannotated_clusters:
                        data.append({
                            'cluster': cluster_num,
                            'count': count,
                            'taxa_full': 'Unannotated read',
                            'kingdom': 'Unannotated',
                            'phylum': 'Unannotated',
                            'class': 'Unannotated',
                            'order': 'Unannotated',
                            'family': 'Unannotated',
                            'genus': 'Unannotated',
                            'species': 'Unannotated'
                        })
                else:
                    # Split taxa into taxonomic levels
                    taxa_levels = taxa.split(' / ') if taxa else []

                    try:
                        data.append({
                            'cluster': cluster_num,
                            'count': count,
                            'taxa_full': taxa,
                            'kingdom': taxa_levels[0],
                            'phylum': taxa_levels[1],
                            'class': taxa_levels[2],
                            'order': taxa_levels[3],
                            'family': taxa_levels[4],
                            'genus': taxa_levels[5],
                            'species': taxa_levels[6]
                        })
                    except IndexError:
                        # Skip entries with incomplete taxonomic data
                        print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}'")
                        continue

    df = pd.DataFrame(data)

    # Create the settings dataframe
    settings_data = [
        ['uncertain_taxa_use_ratio', args.uncertain_taxa_use_ratio],
        ['min_to_split', args.min_to_split],
        ['min_count_to_split', args.min_count_to_split]
    ]
    settings_df = pd.DataFrame(settings_data, columns=['Parameter', 'Value'])

    # Write to a temporary Excel file with multiple sheets, then move it into place
    temp_path = output_file + ".xlsx"
    out_dir = os.path.dirname(temp_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Processed_Taxa_Clusters', index=False)
        settings_df.to_excel(writer, sheet_name='Settings', index=False)

        # Auto-adjust column widths for better readability
        for sheet_name in writer.sheets:
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except Exception:
                        pass
                adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
                worksheet.column_dimensions[column_letter].width = adjusted_width
    os.replace(temp_path, output_file)
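
# To verify the result, the sheets can be read back with pandas (the path is
# a placeholder for whatever --output_taxa_processed was set to):
#   pd.read_excel('processed_taxa.xlsx', sheet_name='Processed_Taxa_Clusters')
#   pd.read_excel('processed_taxa.xlsx', sheet_name='Settings')
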
def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file):
    """
    Create a bar plot showing the distribution of intra-cluster similarity values.

    The plot uses different colors to distinguish reads belonging to different clusters.

    :param all_simi_data: List of all similarity values, sorted descending.
    :type all_simi_data: list[float]
    :param cluster_simi_lengths: List of lengths of similarity data per cluster, used for coloring.
    :type cluster_simi_lengths: list[int]
    :param args: Parsed script arguments, used for plot y-limits.
    :type args: argparse.Namespace
    :param output_file: Path to the output plot file (e.g., .png).
    :type output_file: str
    :return: None
    :rtype: None
    """
    if not all_simi_data:
        return

    sorted_simi_list = sorted(all_simi_data, reverse=True)
    bar_positions = list(range(len(sorted_simi_list)))

    # Create a colormap distinguishing the different clusters
    colormap_full = []
    for i, length in enumerate(cluster_simi_lengths):
        color = COLORMAP[i % len(COLORMAP)]
        colormap_full.extend([color] * length)

    plt.figure(figsize=(12, 6))
    plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full)
    plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7)
    plt.ylabel("Similarity (%)")
    plt.xlabel("Reads")
    plt.title("Intra-cluster Similarity Distribution")
    # bottom/top instead of the removed ymin/ymax kwargs (matplotlib >= 3)
    plt.ylim(bottom=args.simi_plot_y_min, top=args.simi_plot_y_max)
    plt.tight_layout()

    plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight')
    plt.close()

def create_evalue_plot(all_eval_data, cluster_eval_lengths, output_file):
    """
    Create a bar plot showing the distribution of E-values.

    The y-axis is log-scaled and displays ``1/E-values``. Reads are ordered
    by E-value (ascending). The plot uses different colors to distinguish reads
    belonging to different clusters.

    :param all_eval_data: List of E-values from all reads. Assumes E-values start at index 1.
    :type all_eval_data: list[float | int]
    :param cluster_eval_lengths: List of lengths of annotated E-value data per cluster, used for coloring.
    :type cluster_eval_lengths: list[int]
    :param output_file: Path to the output plot file (e.g., .png).
    :type output_file: str
    :return: None
    :rtype: None
    """
    if len(all_eval_data) <= 1:  # Only unannotated reads
        return

    sorted_eval_list = sorted(all_eval_data[1:])  # Skip the unannotated count

    if not sorted_eval_list:
        return

    bar_positions = list(range(len(sorted_eval_list)))
    bar_heights = [1 / e if e > 0 else 0 for e in sorted_eval_list]

    # Create a colormap distinguishing the different clusters
    colormap_full = []
    for i, length in enumerate(cluster_eval_lengths):
        color = COLORMAP[i % len(COLORMAP)]
        colormap_full.extend([color] * length)

    plt.figure(figsize=(12, 6))
    plt.bar(bar_positions, bar_heights, width=1, color=colormap_full)
    plt.yscale('log')
    plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7)
    plt.ylabel("1/E-values")
    plt.xlabel("Reads")
    plt.title("E-value Distribution")
    plt.tight_layout()

    plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight')
    plt.close()

def prepare_evalue_histogram(evalue_list, unannotated_list):
    """
    Generate histogram data for E-value distributions.

    This function processes a list of E-values from BLAST or similar search
    results, filters out invalid or zero entries, and computes histogram data
    suitable for plotting. The histogram represents the frequency distribution
    of E-values across all annotated hits.

    :param evalue_list: List of E-values from BLAST hits.
    :type evalue_list: list[float | int]
    :param unannotated_list: List of unannotated E-values (currently unused).
    :type unannotated_list: list
    :return: Tuple containing:
        - **counts** (*numpy.ndarray*): Number of entries per histogram bin.
        - **bins** (*numpy.ndarray*): Bin edges corresponding to the histogram.
        Returns ``(None, None)`` if no valid data is available.
    :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None]
    :note:
        - Only positive numeric E-values are included in the histogram.
        - Uses 50 bins in the range (0, 1) for visualization consistency.
    """
    data = [ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0]
    if not data:
        return None, None

    counts, bins, _ = plt.hist(data, bins=50, range=(0, 1))
    plt.close()
    return counts, bins

def create_evalue_plot_test(evalue_list, unannotated_list, output_file):
    """
    Create and save an E-value distribution plot, returning the computed histogram data.

    This function visualizes the frequency distribution of E-values from BLAST or
    annotation results. It saves the plot to the specified output file and returns
    the histogram data (counts and bins) for testing with pytest.

    :param evalue_list: List of numeric E-values to plot.
    :type evalue_list: list[float | int]
    :param unannotated_list: Optional list of E-values for unannotated sequences.
    :type unannotated_list: list
    :param output_file: Path where the histogram image will be saved.
    :type output_file: str
    :return: Tuple containing:
        - **counts** (*numpy.ndarray*): Frequency counts per histogram bin.
        - **bins** (*numpy.ndarray*): Histogram bin edges.
        Returns ``(None, None)`` if no valid data was available for plotting.
    :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None]
    """
    counts, bins = prepare_evalue_histogram(evalue_list, unannotated_list)
    if counts is None:
        return None, None

    plt.hist([ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0],
             bins=50, range=(0, 1))
    plt.xlabel("E-value")
    plt.ylabel("Frequency")
    plt.title("E-value Distribution")
    plt.savefig(output_file)
    plt.close()
    return counts, bins

def main(arg_list=None):
    """
    Main entry point of the script.

    Parses arguments, processes cd-hit cluster data, aggregates results,
    and generates the requested outputs (text summaries, plots, and Excel reports).

    :param arg_list: List of arguments for testing purposes.
    :type arg_list: list, optional
    :return: None
    :rtype: None
    """
    args = parse_arguments(arg_list)

    # Parse the cluster file
    clusters = parse_cluster_file(
        args.input_cluster,
        args.input_annotation,
        args.print_empty_files
    )

    # Process each cluster
    all_eval_data = [0]  # For full-sample statistics
    all_simi_data = []
    cluster_eval_lengths = []
    cluster_simi_lengths = []
    cluster_data_list = []

    for cluster in clusters:
        eval_list, simi_list, taxa_dict = process_cluster_data(cluster)
        cluster_data_list.append((eval_list, simi_list, taxa_dict))

        # Collect data for full-sample plots
        all_eval_data[0] += eval_list[0]
        if len(eval_list) > 1:
            all_eval_data.extend(sorted(eval_list[1:]))
        cluster_eval_lengths.append(len(eval_list[1:]))

        if simi_list:
            all_simi_data.extend(sorted(simi_list, reverse=True))
            cluster_simi_lengths.append(len(simi_list))

    # Generate outputs based on what was requested
    if args.output_similarity_txt:
        write_similarity_output(all_simi_data, args.output_similarity_txt)

    if args.output_similarity_plot and all_simi_data:
        create_similarity_plot(all_simi_data, cluster_simi_lengths, args, args.output_similarity_plot)

    if args.output_evalue_txt:
        write_evalue_output(all_eval_data, args.output_evalue_txt)

    if args.output_evalue_plot and len(all_eval_data) > 1:
        create_evalue_plot(all_eval_data, cluster_eval_lengths, args.output_evalue_plot)

    if args.output_count:
        write_count_output(all_eval_data, cluster_data_list, args.output_count)

    if args.output_taxa_clusters:
        write_taxa_clusters_output(cluster_data_list, args.output_taxa_clusters)

    if args.output_taxa_processed:
        write_taxa_processed_output(cluster_data_list, args, args.output_taxa_processed)

    print(f"Processing complete. Processed {len(clusters)} clusters.")
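
# main() also accepts an explicit argument list, which makes it easy to drive
# from tests (paths are placeholders):
#   main(['--input_cluster', 'toy.clstr', '--output_similarity_txt', 'simi.txt'])
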
if __name__ == "__main__":
    main()