cdhit_analysis.py @ 0:00d56396b32a (draft)
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:09:46 +0000 |
| parents | |
| children | ff68835adb2b |
```python
#!/usr/bin/env python3

import argparse
import os
import re
from collections import Counter, defaultdict
from math import sqrt
import pandas as pd
import matplotlib

matplotlib.use('Agg')  # Non-interactive backend for Galaxy
import matplotlib.pyplot as plt

"""
This script processes cluster output files from cd-hit-est for use in Galaxy.
It extracts cluster information, associates taxa and e-values from annotation files,
performs statistical calculations, and generates text and plot outputs
summarizing similarity and taxonomic distributions.

Main steps:
1. Parse cd-hit-est cluster file and (optional) annotation file.
2. Process each cluster to extract similarity, taxa, and e-value information.
3. Aggregate results across clusters.
4. Generate requested outputs: text summaries, plots, and Excel reports.

Note: Uses a non-interactive matplotlib backend (Agg) for compatibility with Galaxy.
"""


def parse_arguments(args_list=None):
    """Parse command-line arguments for the script."""
    parser = argparse.ArgumentParser(
        description='Create taxa analysis from cd-hit cluster files')
    parser.add_argument('--input_cluster', type=str, required=True,
                        help='Input cluster file (.clstr)')
    parser.add_argument('--input_annotation', type=str, required=False,
                        help='Input annotation file (.out)')

    # Galaxy output files
    parser.add_argument('--output_similarity_txt', type=str,
                        help='Similarity text output file')
    parser.add_argument('--output_similarity_plot', type=str,
                        help='Similarity plot output file')
    parser.add_argument('--output_evalue_txt', type=str,
                        help='E-value text output file')
    parser.add_argument('--output_evalue_plot', type=str,
                        help='E-value plot output file')
    parser.add_argument('--output_count', type=str,
                        help='Count summary output file')
    parser.add_argument('--output_taxa_clusters', type=str,
                        help='Taxa per cluster output file')
    parser.add_argument('--output_taxa_processed', type=str,
                        help='Processed taxa output file')

    # Plot parameters
    parser.add_argument('--simi_plot_y_min', type=float, default=95.0,
                        help='Minimum value of the y-axis in the similarity plot')
    parser.add_argument('--simi_plot_y_max', type=float, default=100.0,
                        help='Maximum value of the y-axis in the similarity plot')

    # Uncertain taxa configuration
    parser.add_argument('--uncertain_taxa_use_ratio', type=float, default=0.5,
                        help='Ratio at which uncertain taxa count toward the correct taxa')
    parser.add_argument('--min_to_split', type=float, default=0.45,
                        help='Minimum percentage for taxonomic split')
    parser.add_argument('--min_count_to_split', type=int, default=10,
                        help='Minimum count for taxonomic split')

    # Processing options
    parser.add_argument('--show_unannotated_clusters', action='store_true', default=False,
                        help='Show unannotated clusters in output')
    parser.add_argument('--make_taxa_in_cluster_split', action='store_true', default=False,
                        help='Split clusters with multiple taxa')
    parser.add_argument('--print_empty_files', action='store_true', default=False,
                        help='Print messages about empty annotation files')

    return parser.parse_args(args_list)
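
# Example invocation (an illustrative sketch; the file names below are
# placeholders, not files shipped with the tool):
#
#   python cdhit_analysis.py \
#       --input_cluster sample.clstr \
#       --input_annotation sample_annotation.xlsx \
#       --output_similarity_txt similarity.txt \
#       --output_count counts.txt
#
# Only the outputs requested via --output_* arguments are generated.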


# Color map for plots
COLORMAP = [
    # List of RGBA tuples for bar coloring in plots (matplotlib's tab10 palette)
    (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0),
    (1.0, 0.4980392156862745, 0.054901960784313725, 1.0),
    (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0),
    (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0),
    (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0),
    (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0),
    (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0),
    (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0),
    (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0),
    (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0)
]


def parse_cluster_file(cluster_file, annotation_file=None, print_empty=False, raise_on_error=False):
    """
    Parse the cd-hit-est cluster file (.clstr) and (optionally) an Excel annotation file.

    It extracts cluster information (header, read count, similarity) and associates
    taxonomic information and E-values from the annotation file based on the read header.

    :param cluster_file: Path to cd-hit cluster file (.clstr).
    :type cluster_file: str
    :param annotation_file: Path to Excel annotation file with taxa and e-values.
    :type annotation_file: str, optional
    :param print_empty: Print a message if the annotation file is not found or empty.
    :type print_empty: bool
    :param raise_on_error: Raise parsing errors instead of printing warnings.
    :type raise_on_error: bool
    :return: List of clusters, where each cluster is a dict mapping read header to a dict of read info.
    :rtype: list[dict[str, dict]]
    :raises ValueError: If similarity cannot be parsed from a cluster line.
    :raises UnboundLocalError: If a sequence line lacks a parseable header/count
        (re-raised only when ``raise_on_error`` is True).
    """
    clusters = []
    current_cluster = {}

    # Load annotations if provided
    annotations = {}
    if annotation_file and os.path.exists(annotation_file):
        # Read the Excel file
        df = pd.read_excel(annotation_file, sheet_name='Individual_Reads', engine='openpyxl')

        # Iterate over the rows
        for _, row in df.iterrows():
            header = row['header']    # column name as in the Excel sheet
            evalue = row['e_value']   # or whichever column name you want to use
            taxa = row['taxa']        # depending on how taxa are stored

            annotations[header] = {'evalue': evalue, 'taxa': taxa}
    elif annotation_file and print_empty:
        print(f"Annotation file {annotation_file} not found or empty")

    with open(cluster_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>Cluster'):
                # Start of new cluster, save previous if exists
                if current_cluster:
                    clusters.append(current_cluster)
                    current_cluster = {}
            else:
                # Parse sequence line
                parts = line.split()
                if len(parts) >= 3:
                    # Extract header and count
                    header_part = parts[2].strip('>.')
                    header_parts = header_part.split('(')
                    if len(header_parts) > 1:
                        last_part = header_parts[-1].strip(')')
                        header = header_parts[0]
                        if last_part:
                            count = int(last_part)
                        else:
                            print('no count')
                            count = 0
                            header = ''
                    # If the header lacks a '(count)' suffix, header/count stay
                    # unbound and the UnboundLocalError handler below reports it.

                    # Extract similarity
                    similarity_part = parts[-1].strip()
                    if '*' in similarity_part:
                        similarity = 100.0  # Representative sequence
                    else:
                        matches = re.findall(r'[\d.]+', similarity_part)
                        if matches:
                            similarity = float(matches[-1])
                        else:
                            raise ValueError(f"Could not parse similarity from: '{similarity_part}'")

                    # Get annotation info
                    try:
                        if header in annotations:
                            taxa = annotations[header]['taxa']
                            evalue = annotations[header]['evalue']
                        else:
                            taxa = 'Unannotated read'
                            evalue = 'Unannotated read'

                        current_cluster[header] = {
                            'count': count,
                            'similarity': similarity,
                            'taxa': taxa,
                            'evalue': evalue
                        }
                    except UnboundLocalError as e:
                        if raise_on_error:
                            raise
                        print(f"Error: {e}, No annotations found")

    # Add the last cluster
    if current_cluster:
        clusters.append(current_cluster)

    return clusters
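
# Example (a minimal sketch of the expected cd-hit-est format; read names and
# numbers are hypothetical). Given a .clstr file containing:
#
#   >Cluster 0
#   0   250nt, >read_1(12)... *
#   1   248nt, >read_2(3)... at +/98.40%
#
# parse_cluster_file returns one cluster:
#
#   [{'read_1': {'count': 12, 'similarity': 100.0, 'taxa': ..., 'evalue': ...},
#     'read_2': {'count': 3, 'similarity': 98.4, 'taxa': ..., 'evalue': ...}}]
#
# '(12)' is the per-read abundance (presumably added by an upstream
# dereplication step); '*' marks the representative sequence (100.0).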


def process_cluster_data(cluster):
    """
    Process a single cluster to extract E-value, similarity, and taxa data for all reads.

    Aggregates information from all reads in the cluster, storing read counts,
    E-values, similarities, and taxa in lists and a dictionary.

    :param cluster: Cluster data mapping read headers to read info.
    :type cluster: dict
    :return: A tuple containing: (list of E-values, list of similarity values, dict of taxa -> counts).
        The first element of the E-value list ([0]) stores the count of unannotated reads.
    :rtype: tuple[list[float | int], list[float], dict[str, int]]
    """
    eval_list = [0.0]  # First element for unannotated count
    simi_list = []
    taxa_dict = {'Unannotated read': 0}

    for header, info in cluster.items():
        count = info['count']
        similarity = info['similarity']
        taxa = info['taxa']
        evalue = info['evalue']

        if evalue == 'Unannotated read':
            eval_list[0] += count
            taxa_dict['Unannotated read'] += count
        else:
            try:
                eval_val = float(evalue)
                for _ in range(count):
                    eval_list.append(eval_val)
            except ValueError:
                eval_list[0] += count

            if taxa not in taxa_dict:
                taxa_dict[taxa] = 0
            taxa_dict[taxa] += count

        # Add similarity values
        for _ in range(count):
            simi_list.append(similarity)
    return eval_list, simi_list, taxa_dict
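
# Example (hypothetical values): for a cluster
#
#   {'read_1': {'count': 2, 'similarity': 100.0, 'taxa': 'A / B', 'evalue': 1e-30},
#    'read_2': {'count': 1, 'similarity': 97.5, 'taxa': 'Unannotated read',
#               'evalue': 'Unannotated read'}}
#
# process_cluster_data returns:
#
#   eval_list = [1.0, 1e-30, 1e-30]   # index 0 holds the unannotated count
#   simi_list = [100.0, 100.0, 97.5]  # one entry per read copy
#   taxa_dict = {'Unannotated read': 1, 'A / B': 2}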


def calculate_cluster_taxa(taxa_dict, args):
    """
    Calculate the most likely taxa for a cluster based on read counts.

    This function applies the 'uncertain taxa use ratio' for unannotated reads
    and uses a recursive approach to potentially split a cluster into sub-clusters
    if taxonomic dominance is not strong enough (based on ``min_to_split``
    and ``min_count_to_split``).

    :param taxa_dict: Mapping of taxa (full string) -> read counts.
    :type taxa_dict: dict[str, int]
    :param args: Parsed script arguments, including parameters for taxa calculation.
    :type args: argparse.Namespace
    :return: A list of refined taxa assignments (dictionaries), where each dictionary
        represents a potentially split sub-cluster.
    :rtype: list[dict[str, float | int]]
    """
    # Replace 'Unannotated read' with the uncertain-taxa format
    processed_dict = {}
    for taxa, count in taxa_dict.items():
        if taxa == 'Unannotated read':
            uncertain_taxa = ' / '.join(['Uncertain taxa'] * 7)
            processed_dict[uncertain_taxa] = count
        else:
            processed_dict[taxa] = count

    return _recursive_taxa_calculation(processed_dict, args, 0)
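
# Example (hypothetical counts): {'Unannotated read': 4, 'A / B': 10} becomes
# {'Uncertain taxa / ... / Uncertain taxa': 4, 'A / B': 10} before the
# recursive pass, so unannotated reads compete at each level with an effective
# weight of 4 * args.uncertain_taxa_use_ratio (2.0 at the default ratio of 0.5).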


def _recursive_taxa_calculation(taxa_dict, args, level):
    """
    Recursive helper to calculate and potentially split taxa at each taxonomic level.

    :param taxa_dict: Taxa counts at the current level (or sub-group).
    :type taxa_dict: dict[str, float | int]
    :param args: Parsed script arguments.
    :type args: argparse.Namespace
    :param level: Index of the current taxonomic level (0 = kingdom, up to 6 = species).
    :type level: int
    :return: List of refined taxa dictionaries.
    :rtype: list[dict[str, float | int]]
    """
    if level >= 7:  # Max 7 taxonomic levels
        return [taxa_dict]

    level_dict = defaultdict(float)

    # Group by taxonomic level
    for taxa, count in taxa_dict.items():
        taxa_parts = taxa.split(' / ')
        if level < len(taxa_parts):
            level_taxon = taxa_parts[level]
            if level_taxon == 'Uncertain taxa':
                level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio
            else:
                level_dict[level_taxon] += count

    if len(level_dict) <= 1:
        # Only one taxon at this level, continue to next level
        return _recursive_taxa_calculation(taxa_dict, args, level + 1)

    # Sort by abundance
    sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True)

    result = []
    dominant_taxon = sorted_taxa[0][0]

    # Check if we should split
    for i in range(1, len(sorted_taxa)):
        secondary_taxon = sorted_taxa[i][0]
        total_count = sorted_taxa[0][1] + sorted_taxa[i][1]
        ratio = sorted_taxa[i][1] / total_count

        if ratio >= args.min_to_split or total_count <= args.min_count_to_split:
            # Split off this taxon
            split_dict = {taxa: count for taxa, count in taxa_dict.items()
                          if taxa.split(' / ')[level] == secondary_taxon}
            result.extend(_recursive_taxa_calculation(split_dict, args, level + 1))

    # Process the dominant group
    dominant_dict = {taxa: count for taxa, count in taxa_dict.items()
                     if taxa.split(' / ')[level] == dominant_taxon}
    result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1))

    return result
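
# Worked example (hypothetical counts, default thresholds min_to_split=0.45,
# min_count_to_split=10). If grouping at some level yields
# {'GenusA': 60, 'GenusB': 55}: total = 115, ratio = 55/115 ≈ 0.478 >= 0.45,
# so GenusB is split into its own sub-cluster and both groups recurse one
# level deeper. With {'GenusA': 60, 'GenusB': 20}: ratio = 20/80 = 0.25 < 0.45
# and total = 80 > 10, so GenusB is not split off and only the dominant
# GenusA group is carried forward.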


def write_similarity_output(all_simi_data, output_file):
    """
    Write the similarity text output, including the mean and standard deviation,
    and a count per similarity percentage.

    :param all_simi_data: List of all similarity percentages from all reads.
    :type all_simi_data: list[float]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    if not all_simi_data:
        return

    avg = sum(all_simi_data) / len(all_simi_data)
    variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data)
    st_dev = sqrt(variance)

    simi_counter = Counter(all_simi_data)
    simi_sorted = sorted(simi_counter.items(), key=lambda x: -x[0])

    with open(output_file, 'w') as f:
        f.write(f"# Average similarity: {avg:.2f}\n")
        f.write(f"# Standard deviation: {st_dev:.2f}\n")
        f.write("similarity\tcount\n")
        for similarity, count in simi_sorted:
            f.write(f"{similarity}\t{count}\n")
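
# Example output (hypothetical data; tab-separated, sorted by similarity
# descending):
#
#   # Average similarity: 99.12
#   # Standard deviation: 0.84
#   similarity    count
#   100.0         1503
#   98.4          12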


def write_evalue_output(all_eval_data, output_file):
    """
    Write the E-value text output, including the count of unannotated reads
    and a count per E-value.

    :param all_eval_data: List of E-values from all reads. The first element ([0]) is the count of unannotated reads.
    :type all_eval_data: list[float | int]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    unanno_count = all_eval_data[0]
    eval_counter = Counter(all_eval_data[1:])

    with open(output_file, 'w') as f:
        f.write("evalue\tcount\n")
        if unanno_count > 0:
            f.write(f"unannotated\t{unanno_count}\n")

        eval_sorted = sorted(eval_counter.items(),
                             key=lambda x: (-x[1], float(x[0]) if isinstance(x[0], (int, float)) else float('inf')))
        for value, count in eval_sorted:
            f.write(f"{value}\t{count}\n")
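
# Example output (hypothetical data; tab-separated). The 'unannotated' row is
# written only when unannotated reads were seen; the remaining rows are sorted
# by count descending, ties broken by ascending E-value:
#
#   evalue        count
#   unannotated   7
#   1e-50         120
#   1e-30         35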


def write_count_output(all_eval_data, cluster_data_list, output_file):
    """
    Write a summary of annotated and unannotated read counts per cluster
    and for the total sample.

    :param all_eval_data: List of E-values from all reads for the total count summary.
    :type all_eval_data: list[float | int]
    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    with open(output_file, 'w') as f:
        f.write("cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n")

        for cluster_num, (eval_list, _, _) in enumerate(cluster_data_list):
            unannotated = eval_list[0]
            annotated = len(eval_list[1:])
            total = unannotated + annotated

            if total > 0:
                perc_annotated = (annotated / total) * 100
                perc_unannotated = (unannotated / total) * 100
            else:
                perc_annotated = perc_unannotated = 0

            f.write(
                f"{cluster_num}\t{unannotated}\t{annotated}\t{total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n")

        # Add full sample summary
        total_unannotated = all_eval_data[0]
        total_annotated = len(all_eval_data[1:])
        grand_total = total_unannotated + total_annotated

        if grand_total > 0:
            perc_annotated = (total_annotated / grand_total) * 100
            perc_unannotated = (total_unannotated / grand_total) * 100
        else:
            perc_annotated = perc_unannotated = 0

        f.write(
            f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n")
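
# Example output (hypothetical data; tab-separated):
#
#   cluster  unannotated  annotated  total  perc_unannotated  perc_annotated
#   0        2            48         50     4.00              96.00
#   1        0            10         10     0.00              100.00
#   TOTAL    2            58         60     3.33              96.67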


def write_taxa_clusters_output(cluster_data_list, output_file):
    """
    Write raw taxa information per cluster to an Excel file.

    Each row contains the cluster number, read count, the full taxa string,
    and the separate taxonomic levels (kingdom through species).

    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param output_file: Path to the output Excel file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    # Create main dataframe
    data = []
    for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
        for taxa, count in taxa_dict.items():
            if count > 0:
                # Split taxa into taxonomic levels
                taxa_levels = taxa.split(' / ') if taxa else []
                taxa_levels += ['Unannotated read'] * (7 - len(taxa_levels))
                try:
                    data.append({
                        'cluster': cluster_num,
                        'count': count,
                        'taxa_full': taxa,
                        'kingdom': taxa_levels[0],
                        'phylum': taxa_levels[1],
                        'class': taxa_levels[2],
                        'order': taxa_levels[3],
                        'family': taxa_levels[4],
                        'genus': taxa_levels[5],
                        'species': taxa_levels[6]
                    })
                except IndexError as e:
                    # Skip entries with incomplete taxonomic data
                    print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}', error: {e}")
                    continue

    df = pd.DataFrame(data)
    # Write to Excel via a temporary .xlsx path, then move into place
    temp_path = output_file + ".xlsx"
    os.makedirs(os.path.dirname(temp_path) or '.', exist_ok=True)
    with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Raw_Taxa_Clusters', index=False)
    os.replace(temp_path, output_file)
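
# Example (hypothetical data): a cluster-0 entry with count 12 and taxa string
# 'Animalia / Chordata / Aves / Passeriformes / Turdidae / Turdus / Turdus merula'
# becomes one row of the 'Raw_Taxa_Clusters' sheet with kingdom='Animalia'
# through species='Turdus merula'; shorter strings are padded to the 7 levels
# with 'Unannotated read'.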


def write_taxa_processed_output(cluster_data_list, args, output_file):
    """
    Write processed (potentially split) taxa information to an Excel file.

    This file contains the resulting sub-clusters from the taxonomic dominance
    analysis and a separate sheet documenting the parameters used.

    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param args: Parsed script arguments, used for taxa calculation and settings documentation.
    :type args: argparse.Namespace
    :param output_file: Path to the output Excel file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    # Create main dataframe
    data = []
    for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
        processed_taxa = calculate_cluster_taxa(taxa_dict, args)

        for taxa_group in processed_taxa:
            for taxa, count in taxa_group.items():
                if 'Uncertain taxa / Uncertain taxa / Uncertain taxa' in taxa:
                    if args.show_unannotated_clusters:
                        data.append({
                            'cluster': cluster_num,
                            'count': count,
                            'taxa_full': 'Unannotated read',
                            'kingdom': 'Unannotated',
                            'phylum': 'Unannotated',
                            'class': 'Unannotated',
                            'order': 'Unannotated',
                            'family': 'Unannotated',
                            'genus': 'Unannotated',
                            'species': 'Unannotated'
                        })
                else:
                    # Split taxa into taxonomic levels
                    taxa_levels = taxa.split(' / ') if taxa else []

                    try:
                        data.append({
                            'cluster': cluster_num,
                            'count': count,
                            'taxa_full': taxa,
                            'kingdom': taxa_levels[0],
                            'phylum': taxa_levels[1],
                            'class': taxa_levels[2],
                            'order': taxa_levels[3],
                            'family': taxa_levels[4],
                            'genus': taxa_levels[5],
                            'species': taxa_levels[6]
                        })
                    except IndexError:
                        # Skip entries with incomplete taxonomic data
                        print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}'")
                        continue

    df = pd.DataFrame(data)

    # Create settings dataframe
    settings_data = [
        ['uncertain_taxa_use_ratio', args.uncertain_taxa_use_ratio],
        ['min_to_split', args.min_to_split],
        ['min_count_to_split', args.min_count_to_split]
    ]
    settings_df = pd.DataFrame(settings_data, columns=['Parameter', 'Value'])

    # Write to Excel with multiple sheets, via a temporary .xlsx path
    temp_path = output_file + ".xlsx"
    os.makedirs(os.path.dirname(temp_path) or '.', exist_ok=True)
    with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Processed_Taxa_Clusters', index=False)
        settings_df.to_excel(writer, sheet_name='Settings', index=False)

        # Auto-adjust column widths for better readability
        for sheet_name in writer.sheets:
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except Exception:
                        pass
                adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
                worksheet.column_dimensions[column_letter].width = adjusted_width
    os.replace(temp_path, output_file)


def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file):
    """
    Create a bar plot showing the distribution of intra-cluster similarity values.

    The plot uses different colors to distinguish reads belonging to different clusters.

    :param all_simi_data: List of all similarity values, sorted descending.
    :type all_simi_data: list[float]
    :param cluster_simi_lengths: List of lengths of similarity data per cluster, used for coloring.
    :type cluster_simi_lengths: list[int]
    :param args: Parsed script arguments, used for plot y-limits.
    :type args: argparse.Namespace
    :param output_file: Path to the output plot file (e.g., .png).
    :type output_file: str
    :return: None
    :rtype: None
    """
    if not all_simi_data:
        return

    sorted_simi_list = sorted(all_simi_data, reverse=True)
    bar_positions = list(range(len(sorted_simi_list)))

    # Create colormap for different clusters
    colormap_full = []
    for i, length in enumerate(cluster_simi_lengths):
        color = COLORMAP[i % len(COLORMAP)]
        colormap_full.extend([color] * length)

    plt.figure(figsize=(12, 6))
    plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full)
    plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7)
    plt.ylabel("Similarity (%)")
    plt.xlabel("Reads")
    plt.title("Intra-cluster Similarity Distribution")
    plt.ylim(ymin=args.simi_plot_y_min, ymax=args.simi_plot_y_max)
    plt.tight_layout()

    plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight')
    plt.close()


def create_evalue_plot(all_eval_data, cluster_eval_lengths, output_file):
    """
    Create a bar plot showing the distribution of E-values.

    The y-axis is log-scaled and displays ``1/E-values``. Reads are ordered
    by E-value (ascending). The plot uses different colors to distinguish reads
    belonging to different clusters.

    :param all_eval_data: List of E-values from all reads. Assumes E-values start at index 1.
    :type all_eval_data: list[float | int]
    :param cluster_eval_lengths: List of lengths of annotated E-value data per cluster, used for coloring.
    :type cluster_eval_lengths: list[int]
    :param output_file: Path to the output plot file (e.g., .png).
    :type output_file: str
    :return: None
    :rtype: None
    """
    if len(all_eval_data) <= 1:  # Only unannotated reads
        return

    sorted_eval_list = sorted(all_eval_data[1:])  # Skip unannotated count

    if not sorted_eval_list:
        return

    bar_positions = list(range(len(sorted_eval_list)))
    bar_heights = [1 / e if e > 0 else 0 for e in sorted_eval_list]

    # Create colormap for different clusters
    colormap_full = []
    for i, length in enumerate(cluster_eval_lengths):
        color = COLORMAP[i % len(COLORMAP)]
        colormap_full.extend([color] * length)

    plt.figure(figsize=(12, 6))
    plt.bar(bar_positions, bar_heights, width=1, color=colormap_full)
    plt.yscale('log')
    plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7)
    plt.ylabel("1/E-values")
    plt.xlabel("Reads")
    plt.title("E-value Distribution")
    plt.tight_layout()

    plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight')
    plt.close()


def prepare_evalue_histogram(evalue_list, unannotated_list):
    """
    Generate histogram data for E-value distributions.

    This function processes a list of E-values from BLAST or similar search
    results, filters out invalid or zero entries, and computes histogram data
    suitable for plotting. The histogram represents the frequency distribution
    of E-values across all annotated hits.

    :param evalue_list: List of E-values from BLAST hits.
    :type evalue_list: list[float | int]
    :param unannotated_list: List of unannotated E-values (currently unused).
    :type unannotated_list: list
    :return: Tuple containing:
        - **counts** (*numpy.ndarray*): Number of entries per histogram bin.
        - **bins** (*numpy.ndarray*): Bin edges corresponding to the histogram.
        Returns ``(None, None)`` if no valid data is available.
    :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None]
    :note:
        - Only positive numeric E-values are included in the histogram.
        - Uses 50 bins in the range (0, 1) for visualization consistency.
    """
    data = [ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0]
    if not data:
        return None, None

    counts, bins, _ = plt.hist(data, bins=50, range=(0, 1))
    plt.close()
    return counts, bins
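
# Example (hypothetical values):
#
#   counts, bins = prepare_evalue_histogram([0.2, 0.2, 'n/a', -1.0, 0.9], [])
#
# Non-numeric and non-positive entries are dropped, so the histogram is built
# from [0.2, 0.2, 0.9] over 50 equal-width bins spanning (0, 1):
# counts.sum() == 3 and len(bins) == 51.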


def create_evalue_plot_test(evalue_list, unannotated_list, output_file):
    """
    Create and save an E-value distribution plot, returning the computed histogram data.

    This function visualizes the frequency distribution of E-values from BLAST or
    annotation results. It saves the plot to the specified output file and returns
    the histogram data (counts and bins) so it can be checked in pytest tests.

    :param evalue_list: List of numeric E-values to plot.
    :type evalue_list: list[float | int]
    :param unannotated_list: Optional list of E-values for unannotated sequences (currently unused).
    :type unannotated_list: list
    :param output_file: Path where the histogram image will be saved.
    :type output_file: str
    :return: Tuple containing:
        - **counts** (*numpy.ndarray*): Frequency counts per histogram bin.
        - **bins** (*numpy.ndarray*): Histogram bin edges.
        Returns ``(None, None)`` if no valid data was available for plotting.
    :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None]
    """
    counts, bins = prepare_evalue_histogram(evalue_list, unannotated_list)
    if counts is None:
        return None, None

    # Re-draw the histogram for saving (prepare_evalue_histogram closed its figure)
    plt.hist([ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0],
             bins=50, range=(0, 1))
    plt.xlabel("E-value")
    plt.ylabel("Frequency")
    plt.title("E-value Distribution")
    plt.savefig(output_file)
    plt.close()
    return counts, bins


def main(arg_list=None):
    """
    Main entry point of the script.

    Parses arguments, processes cd-hit cluster data, aggregates results,
    and generates requested outputs (text summaries, plots, and Excel reports).

    :param arg_list: List of arguments for testing purposes.
    :type arg_list: list, optional
    :return: None
    :rtype: None
    """
    args = parse_arguments(arg_list)

    # Parse cluster file
    clusters = parse_cluster_file(
        args.input_cluster,
        args.input_annotation,
        args.print_empty_files
    )

    # Process each cluster
    all_eval_data = [0]  # For full sample statistics
    all_simi_data = []
    cluster_eval_lengths = []
    cluster_simi_lengths = []
    cluster_data_list = []

    for cluster in clusters:
        eval_list, simi_list, taxa_dict = process_cluster_data(cluster)
        cluster_data_list.append((eval_list, simi_list, taxa_dict))

        # Collect data for full sample plots
        all_eval_data[0] += eval_list[0]
        if len(eval_list) > 1:
            all_eval_data.extend(sorted(eval_list[1:]))
            cluster_eval_lengths.append(len(eval_list[1:]))

        if simi_list:
            all_simi_data.extend(sorted(simi_list, reverse=True))
            cluster_simi_lengths.append(len(simi_list))

    # Generate outputs based on what was requested
    if args.output_similarity_txt:
        write_similarity_output(all_simi_data, args.output_similarity_txt)

    if args.output_similarity_plot and all_simi_data:
        create_similarity_plot(all_simi_data, cluster_simi_lengths, args, args.output_similarity_plot)

    if args.output_evalue_txt:
        write_evalue_output(all_eval_data, args.output_evalue_txt)

    if args.output_evalue_plot and len(all_eval_data) > 1:
        create_evalue_plot(all_eval_data, cluster_eval_lengths, args.output_evalue_plot)

    if args.output_count:
        write_count_output(all_eval_data, cluster_data_list, args.output_count)

    if args.output_taxa_clusters:
        write_taxa_clusters_output(cluster_data_list, args.output_taxa_clusters)

    if args.output_taxa_processed:
        write_taxa_processed_output(cluster_data_list, args, args.output_taxa_processed)

    print(f"Processing complete. Processed {len(clusters)} clusters.")


if __name__ == "__main__":
    main()
```