comparison cdhit_analysis.py @ 4:e64af72e1b8f draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
| author | onnodg |
|---|---|
| date | Mon, 15 Dec 2025 16:44:40 +0000 |
| parents | c6981ea453ae |
| children |
| 3:c6981ea453ae | 4:e64af72e1b8f |
|---|---|
| 1 """ | 1 """ |
| 2 This script processes cluster output files from cd-hit-est for use in Galaxy. | 2 This script processes cluster output files from cd-hit-est for use in Galaxy. |
| 3 It extracts cluster information, associates taxa and e-values from annotation files, | 3 It extracts cluster information, associates taxa from annotation files, |
| 4 performs statistical calculations, and generates text and plot outputs | 4 performs statistical calculations, and generates text and plot outputs |
| 5 summarizing similarity and taxonomic distributions. | 5 summarizing similarity and taxonomic distributions. |
| 6 | 6 |
| 7 | |
| 8 Main steps: | 7 Main steps: |
| 9 1. Parse cd-hit-est cluster file and (optional) annotation file. | 8 1. Parse cd-hit-est cluster file and (optional) annotation Excel file. |
| 10 2. Process each cluster to extract similarity, taxa, and e-value information. | 9 2. Process each cluster to extract similarity and taxa information. |
| 11 3. Aggregate results across clusters. | 10 3. Aggregate results across clusters. |
| 12 4. Generate requested outputs: text summaries, plots, and Excel reports. | 11 4. Generate requested outputs: text summaries, plots, and Excel reports. |
| 13 """ | 12 """ |
| 14 | 13 |
| 15 import argparse | 14 import argparse |
| 16 from collections import Counter, defaultdict | 15 from collections import Counter, defaultdict |
| 17 import os | 16 import os |
| 18 import re | 17 import re |
| 18 from math import sqrt | |
| 19 | |
| 19 import matplotlib.pyplot as plt | 20 import matplotlib.pyplot as plt |
| 20 import pandas as pd | 21 import pandas as pd |
| 21 from math import sqrt | |
| 22 import openpyxl | 22 import openpyxl |
| 23 | 23 |
| 24 def log_message(messages, text: str): | |
| 25 """Append a message to the log list.""" | |
| 26 messages.append(text) | |
| 24 | 27 |
| 25 | 28 |
| 26 def parse_arguments(args_list=None): | 29 def parse_arguments(args_list=None): |
| 27 """Parse command-line arguments for the script.""" | 30 """Parse command-line arguments for the script.""" |
| 28 parser = argparse.ArgumentParser( | 31 parser = argparse.ArgumentParser(description="Create taxa analysis from cd-hit cluster files") |
| 29 description='Create taxa analysis from cd-hit cluster files') | 32 parser.add_argument("--input_cluster", type=str, required=True, help="Input cluster file (.clstr)") |
| 30 parser.add_argument('--input_cluster', type=str, required=True, | 33 parser.add_argument("--input_annotation", type=str, required=False, help="Input annotation Excel file (header annotations)") |
| 31 help='Input cluster file (.clstr)') | 34 |
| 32 parser.add_argument('--input_annotation', type=str, required=False, | 35 parser.add_argument("--output_similarity_txt", type=str, help="Similarity text output file") |
| 33 help='Input annotation file (.out)') | 36 parser.add_argument("--output_similarity_plot", type=str, help="Similarity plot output file (PNG)") |
| 34 | 37 parser.add_argument("--output_count", type=str, help="Count summary output file") |
| 35 # Galaxy output files | 38 parser.add_argument("--output_excel", type=str, help="Combined taxa Excel report output file") |
| 36 parser.add_argument('--output_similarity_txt', type=str, | 39 parser.add_argument("--output_taxa_clusters", action="store_true", default=False, help="Excel output: include raw taxa per cluster sheet") |
| 37 help='Similarity text output file') | 40 parser.add_argument("--output_taxa_processed", action="store_true", default=False, help="Excel output: include processed/LCA taxa per cluster sheet") |
| 38 parser.add_argument('--output_similarity_plot', type=str, | 41 parser.add_argument("--log_file", type=str, help="Optional log file with run statistics") |
| 39 help='Similarity plot output file') | 42 |
| 40 parser.add_argument('--output_evalue_txt', type=str, | 43 parser.add_argument("--simi_plot_y_min", type=float, default=95.0, help="Minimum value of the y-axis in the similarity plot") |
| 41 help='E-value text output file') | 44 parser.add_argument("--simi_plot_y_max", type=float, default=100.0, help="Maximum value of the y-axis in the similarity plot") |
| 42 parser.add_argument('--output_evalue_plot', type=str, | 45 |
| 43 help='E-value plot output file') | 46 parser.add_argument("--uncertain_taxa_use_ratio", type=float, default=0.5, help="Ratio at which uncertain taxa count toward the dominant taxa") |
| 44 parser.add_argument('--output_count', type=str, | 47 parser.add_argument("--min_to_split", type=float, default=0.45, help=("Minimum fraction (0–0.5) for splitting a cluster taxonomically. " |
| 45 help='Count summary output file') | 48 "ratio = count(second) / (count(first) + count(second))")) |
| 46 parser.add_argument('--output_taxa_clusters', type=str, | 49 parser.add_argument("--min_count_to_split", type=int, default=10, help=("Minimum combined count of the two top taxa to avoid being forced " |
| 47 help='Taxa per cluster output file') | 50 "into an 'uncertain' split at this level.")) |
| 48 parser.add_argument('--output_taxa_processed', type=str, | 51 |
| 49 help='Processed taxa output file') | 52 parser.add_argument("--min_cluster_support", type=int, default=1, help=("Minimum number of annotated reads required for a cluster to be " |
| 50 # Plot parameters | 53 "included in the processed/LCA taxa sheet. Unannotated reads do not count toward this threshold.")) |
| 51 parser.add_argument('--simi_plot_y_min', type=float, default=95.0, | |
| 52 help='Minimum value of the y-axis in the similarity plot') | |
| 53 parser.add_argument('--simi_plot_y_max', type=float, default=100.0, | |
| 54 help='Maximum value of the y-axis in the similarity plot') | |
| 55 | |
| 56 # Uncertain taxa configuration | |
| 57 parser.add_argument('--uncertain_taxa_use_ratio', type=float, default=0.5, | |
| 58 help='Ratio at which uncertain taxa count toward the correct taxa') | |
| 59 parser.add_argument('--min_to_split', type=float, default=0.45, | |
| 60 help='Minimum percentage for taxonomic split') | |
| 61 parser.add_argument('--min_count_to_split', type=int, default=10, | |
| 62 help='Minimum count for taxonomic split') | |
| 63 | |
| 64 # Processing options | |
| 65 parser.add_argument('--show_unannotated_clusters', action='store_true', default=False, | |
| 66 help='Show unannotated clusters in output') | |
| 67 parser.add_argument('--make_taxa_in_cluster_split', action='store_true', default=False, | |
| 68 help='Split clusters with multiple taxa') | |
| 69 parser.add_argument('--print_empty_files', action='store_true', default=False, | |
| 70 help='Print messages about empty annotation files') | |
| 71 | 54 |
| 72 return parser.parse_args(args_list) | 55 return parser.parse_args(args_list) |
| 73 | 56 |
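The `args_list` parameter lets tests drive the parser without touching `sys.argv`. A minimal sketch of such a call against the new argument set (file paths are placeholders, not from the repository):

```python
# Hypothetical test-style invocation; paths are illustrative only.
args = parse_arguments([
    "--input_cluster", "sample.clstr",
    "--input_annotation", "annotations.xlsx",
    "--output_count", "counts.tsv",
    "--min_to_split", "0.4",
])
assert args.min_count_to_split == 10   # parser default
assert args.min_cluster_support == 1   # parser default
```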
| 74 | 57 |
| 75 # Color map for plots | |
| 76 COLORMAP = [ | 58 COLORMAP = [ |
| 77 # List of RGBA tuples for bar coloring in plots | 59 # List of RGBA tuples for bar coloring in plots |
| 78 (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0), | 60 (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0), |
| 79 (1.0, 0.4980392156862745, 0.054901960784313725, 1.0), | 61 (1.0, 0.4980392156862745, 0.054901960784313725, 1.0), |
| 80 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0), | 62 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0), |
| 81 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0), | 63 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0), |
| 82 (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0), | 64 (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0), |
| 83 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0), | 65 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0), |
| 84 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0), | 66 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0), |
| 85 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0), | 67 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0), |
| 86 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0), | 68 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0), |
| 87 (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0) | 69 (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0), |
| 88 ] | 70 ] |
| 89 | 71 |
| 90 | 72 |
| 91 def parse_cluster_file(cluster_file, annotation_file=None, print_empty=False, raise_on_error=False): | 73 def parse_cluster_file(cluster_file, annotation_file, raise_on_error=False, log_messages=None): |
| 92 """ | 74 """Parse CD-HIT cluster output and attach taxonomic annotations. |
| 93 Parse the cd-hit-est cluster file (.clstr) and (optionally) an Excel annotation file. | 75 |
| 94 | 76 Cluster entries are parsed for read headers, counts, and similarities. |
| 95 It extracts cluster information (header, read count, similarity) and associates | 77 If an annotation Excel file is supplied, taxa, seq_id and source fields |
| 96 taxonomic information and E-values from the annotation file based on the read header. | 78 are mapped per read from sheet ``Individual_Reads``. |
| 97 | 79 |
| 98 :param cluster_file: Path to cd-hit cluster file (.clstr). | 80 :param cluster_file: Path to the CD-HIT cluster file. |
| 99 :type cluster_file: str | 81 :type cluster_file: str |
| 100 :param annotation_file: Path to Excel annotation file with taxa and e-values. | 82 :param annotation_file: Optional Excel annotation file. |
| 101 :type annotation_file: str, optional | 83 :type annotation_file: str or None |
| 102 :param print_empty: Print a message if the annotation file is not found or empty. | 84 :param raise_on_error: Whether to raise parsing errors directly. |
| 103 :type print_empty: bool | |
| 104 :param raise_on_error: Raise parsing errors instead of printing warnings. | |
| 105 :type raise_on_error: bool | 85 :type raise_on_error: bool |
| 106 :return: List of clusters, where each cluster is a dict mapping read header to a dict of read info. | 86 :param log_messages: Optional message collector for logging. |
| 107 :rtype: list[dict[str, dict]] | 87 :type log_messages: list[str] or None |
| 108 :raises ValueError: If similarity cannot be parsed from a cluster line. | 88 :return: List of clusters, where each cluster is a dict keyed by header. |
| 109 :raises UnboundLocalError: If an error occurs during annotation processing. | 89 :rtype: list[dict] |
| 110 """ | 90 """ |
| 111 clusters = [] | 91 clusters = [] |
| 112 current_cluster = {} | 92 current_cluster = {} |
| 113 | 93 |
| 114 # Load annotations if provided | |
| 115 annotations = {} | 94 annotations = {} |
| 116 if annotation_file and os.path.exists(annotation_file): | 95 if annotation_file and os.path.exists(annotation_file): |
| 117 # Read the Excel file | 96 try: |
| 118 df = pd.read_excel(annotation_file, sheet_name='Individual_Reads', engine='openpyxl') | 97 df = pd.read_excel( |
| 119 | 98 annotation_file, |
| 120 # Iterate over the rows | 99 sheet_name="Individual_Reads", |
| 121 for _, row in df.iterrows(): | 100 engine="openpyxl", |
| 122 header = row['header'] # column name as in Excel | 101 ) |
| 123 evalue = row['e_value'] # or whichever column name you want to use | 102 required_cols = {"header", "seq_id", "source", "taxa"} |
| 124 taxa = row['taxa'] # depends on how taxa are stored | 103 missing = required_cols - set(df.columns) |
| 125 | 104 if missing: |
| 126 annotations[header] = {'evalue': evalue, 'taxa': taxa} | 105 msg = ( |
| 127 elif annotation_file and print_empty: | 106 f"Annotation file missing columns {missing}, " |
| 128 print(f"Annotation file {annotation_file} not found or empty") | 107 "continuing without annotations." |
| 129 with open(cluster_file, 'r') as f: | 108 ) |
| 109 if log_messages is not None: | |
| 110 log_message(log_messages, msg) | |
| 111 else: | |
| 112 for _, row in df.iterrows(): | |
| 113 header = str(row["header"]) | |
| 114 seq_id = str(row["seq_id"]) | |
| 115 source = str(row["source"]) | |
| 116 taxa = str(row["taxa"]) | |
| 117 annotations[header] = { | |
| 118 "seq_id": seq_id, | |
| 119 "source": source, | |
| 120 "taxa": taxa, | |
| 121 } | |
| 122 if log_messages is not None: | |
| 123 log_message( | |
| 124 log_messages, | |
| 125 f"Loaded {len(annotations)} annotated headers", | |
| 126 ) | |
| 127 except Exception as exc: | |
| 128 msg = ( | |
| 129 "Failed to read annotation file; proceeding without annotations. " | |
| 130 f"Details: {exc}" | |
| 131 ) | |
| 132 if log_messages is not None: | |
| 133 log_message(log_messages, msg) | |
| 134 else: | |
| 135 if log_messages is not None: | |
| 136 if annotation_file: | |
| 137 log_message( | |
| 138 log_messages, | |
| 139 "Annotation file not found; proceeding without annotations.", | |
| 140 ) | |
| 141 else: | |
| 142 log_message( | |
| 143 log_messages, | |
| 144 "No annotation file provided; proceeding without annotations.", | |
| 145 ) | |
| 146 | |
| 147 with open(cluster_file, "r") as f: | |
| 130 for line in f: | 148 for line in f: |
| 131 line = line.strip() | 149 line = line.strip() |
| 132 if line.startswith('>Cluster'): | 150 if not line: |
| 151 continue | |
| 152 | |
| 153 if line.startswith(">Cluster"): | |
| 133 # Start of new cluster, save previous if exists | 154 # Start of new cluster, save previous if exists |
| 134 if current_cluster: | 155 if current_cluster: |
| 135 clusters.append(current_cluster) | 156 clusters.append(current_cluster) |
| 136 current_cluster = {} | 157 current_cluster = {} |
| 137 else: | 158 else: |
| 138 # Parse sequence line | |
| 139 parts = line.split() | 159 parts = line.split() |
| 140 if len(parts) >= 2: | 160 if len(parts) < 3: |
| 141 # Extract header and count | 161 continue |
| 142 header_part = parts[2].strip('>.') | 162 |
| 143 header_parts = header_part.split('(') | 163 header_part = parts[2].strip(">.") |
| 144 if len(header_parts) > 1: | 164 header_parts = header_part.split("(") |
| 145 last_part = header_parts[-1].strip(')') | 165 header = header_parts[0] |
| 146 header = header_parts[0] | 166 count = 0 |
| 147 if last_part: | 167 if len(header_parts) > 1: |
| 168 last_part = header_parts[-1].strip(")") | |
| 169 if last_part: | |
| 170 try: | |
| 148 count = int(last_part) | 171 count = int(last_part) |
| 149 else: | 172 except ValueError: |
| 150 print('no count') | |
| 151 count = 0 | 173 count = 0 |
| 152 header = '' | 174 |
| 153 # Extract similarity | 175 similarity_part = parts[-1].strip() |
| 154 similarity_part = parts[-1].strip() | 176 if "*" in similarity_part: |
| 155 if '*' in similarity_part: | 177 similarity = 100.0 |
| 156 similarity = 100.0 # Representative sequence | 178 else: |
| 179 matches = re.findall(r"[\d.]+", similarity_part) | |
| 180 if matches: | |
| 181 similarity = float(matches[-1]) | |
| 157 else: | 182 else: |
| 158 matches = re.findall(r'[\d.]+', similarity_part) | 183 msg = f"Could not parse similarity from: '{similarity_part}'" |
| 159 if matches: | |
| 160 similarity = float(matches[-1]) | |
| 161 else: | |
| 162 raise ValueError(f"Could not parse similarity from: '{similarity_part}'") | |
| 163 # Get annotation info | |
| 164 try: | |
| 165 if header in annotations: | |
| 166 taxa = annotations[header]['taxa'] | |
| 167 evalue = annotations[header]['evalue'] | |
| 168 else: | |
| 169 taxa = 'Unannotated read' | |
| 170 evalue = 'Unannotated read' | |
| 171 | |
| 172 current_cluster[header] = { | |
| 173 'count': count, | |
| 174 'similarity': similarity, | |
| 175 'taxa': taxa, | |
| 176 'evalue': evalue | |
| 177 } | |
| 178 except UnboundLocalError as e: | |
| 179 if raise_on_error: | 184 if raise_on_error: |
| 180 raise UnboundLocalError(str(e)) | 185 raise ValueError(msg) |
| 181 print(f"Error: {e}, No annotations found") | 186 if log_messages is not None: |
| 182 | 187 log_message(log_messages, f"WARNING: {msg}") |
| 183 # Add the last cluster | 188 similarity = 0.0 |
| 189 | |
| 190 if header in annotations: | |
| 191 taxa = annotations[header]["taxa"] | |
| 192 seq_id = annotations[header]["seq_id"] | |
| 193 source = annotations[header]["source"] | |
| 194 else: | |
| 195 taxa = "Unannotated read" | |
| 196 seq_id = "Unannotated read" | |
| 197 source = "Unannotated read" | |
| 198 | |
| 199 current_cluster[header] = { | |
| 200 "count": count, | |
| 201 "similarity": similarity, | |
| 202 "taxa": taxa, | |
| 203 "seq_id": seq_id, | |
| 204 "source": source, | |
| 205 } | |
| 206 | |
| 184 if current_cluster: | 207 if current_cluster: |
| 185 clusters.append(current_cluster) | 208 clusters.append(current_cluster) |
| 186 | 209 |
| 210 if log_messages is not None: | |
| 211 log_message(log_messages, f"Parsed {len(clusters)} clusters") | |
| 212 | |
| 187 return clusters | 213 return clusters |
| 188 | 214 |
| 189 | 215 |
| 190 def process_cluster_data(cluster): | 216 def process_cluster_data(cluster: dict): |
| 191 """ | 217 """Convert a raw cluster into similarity lists and aggregated taxa counts. |
| 192 Process a single cluster to extract E-value, similarity, and taxa data for all reads. | 218 |
| 193 | 219 Similarity values are expanded based on read count. Taxa are grouped |
| 194 Aggregates information from all reads in the cluster, storing read counts, | 220 by unique taxonomic labels, tracking supporting seq_ids and sources. |
| 195 E-values, similarities, and taxa in lists and a dictionary. | 221 |
| 196 | 222 :param cluster: Mapping of read header to cluster metadata. |
| 197 :param cluster: Cluster data mapping read headers to read info. | |
| 198 :type cluster: dict | 223 :type cluster: dict |
| 199 :return: A tuple containing: (list of E-values, list of similarity values, dict of taxa -> counts). | 224 :return: similarity_list, taxa_map, annotated_count, unannotated_count |
| 200 The first element of the E-value list ([0]) stores the count of unannotated reads. | 225 :rtype: tuple[list[float], dict, int, int] |
| 201 :rtype: tuple[list[float | int], list[float], dict[str, int]] | 226 """ |
| 202 """ | 227 similarity_list: list[float] = [] |
| 203 eval_list = [0.0] # First element for unannotated count | 228 taxa_map: dict[str, dict] = {} |
| 204 simi_list = [] | 229 annotated_count = 0 |
| 205 taxa_dict = {'Unannotated read': 0} | 230 unannotated_count = 0 |
| 206 | 231 |
| 207 for header, info in cluster.items(): | 232 for _header, info in cluster.items(): |
| 208 count = info['count'] | 233 count = info["count"] |
| 209 similarity = info['similarity'] | 234 similarity = info["similarity"] |
| 210 taxa = info['taxa'] | 235 taxa = info["taxa"] |
| 211 evalue = info['evalue'] | 236 seq_id = info["seq_id"] |
| 212 | 237 source = info["source"] |
| 213 if evalue == 'Unannotated read': | 238 |
| 214 eval_list[0] += count | 239 similarity_list.extend([similarity] * count) |
| 215 taxa_dict['Unannotated read'] += count | 240 |
| 241 key = taxa if seq_id != "Unannotated read" else "Unannotated read" | |
| 242 if key not in taxa_map: | |
| 243 taxa_map[key] = { | |
| 244 "count": 0, | |
| 245 "seq_ids": set(), | |
| 246 "sources": set(), | |
| 247 } | |
| 248 | |
| 249 taxa_map[key]["count"] += count | |
| 250 taxa_map[key]["seq_ids"].add(seq_id) | |
| 251 taxa_map[key]["sources"].add(source) | |
| 252 | |
| 253 if seq_id == "Unannotated read": | |
| 254 unannotated_count += count | |
| 216 else: | 255 else: |
| 217 try: | 256 annotated_count += count |
| 218 eval_val = float(evalue) | 257 |
| 219 for _ in range(count): | 258 return similarity_list, taxa_map, annotated_count, unannotated_count |
| 220 eval_list.append(eval_val) | 259 |
| 221 except ValueError: | |
| 222 eval_list[0] += count | |
| 223 | |
| 224 if taxa not in taxa_dict: | |
| 225 taxa_dict[taxa] = 0 | |
| 226 taxa_dict[taxa] += count | |
| 227 | |
| 228 # Add similarity values | |
| 229 for _ in range(count): | |
| 230 simi_list.append(similarity) | |
| 231 return eval_list, simi_list, taxa_dict | |
| 232 | 260 |
| 233 | 261 |
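A toy round-trip through the new `process_cluster_data`, assuming the script is importable as a module (the module name `cdhit_analysis` is an assumption based on the file name):

```python
from cdhit_analysis import process_cluster_data  # assumed module name

# Toy cluster; all headers, taxa, and ids are illustrative.
cluster = {
    "read_1": {"count": 3, "similarity": 100.0, "taxa": "Animalia / Chordata",
               "seq_id": "AB1", "source": "db"},
    "read_2": {"count": 2, "similarity": 98.5, "taxa": "Unannotated read",
               "seq_id": "Unannotated read", "source": "Unannotated read"},
}
simi, taxa_map, annotated, unannotated = process_cluster_data(cluster)
# simi == [100.0, 100.0, 100.0, 98.5, 98.5]; annotated == 3; unannotated == 2
# taxa_map["Animalia / Chordata"]["count"] == 3
```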
| 234 def calculate_cluster_taxa(taxa_dict, args): | 262 def calculate_cluster_taxa(taxa_dict, args): |
| 235 """ | 263 """Resolve cluster-level taxa using weighted recursive LCA splitting. |
| 236 Calculate the most likely taxa for a cluster based on read counts. | 264 |
| 237 | 265 Unannotated reads are treated as fully uncertain taxa but contribute |
| 238 This function applies the 'uncertain taxa use ratio' for unannotated reads | 266 partially via ``uncertain_taxa_use_ratio`` when determining dominant levels. |
| 239 and uses a recursive approach to potentially split a cluster into sub-clusters | 267 |
| 240 if taxonomic dominance is not strong enough (based on ``min_to_split`` | 268 :param taxa_dict: Mapping of taxa string to total read count. |
| 241 and ``min_count_to_split``). | |
| 242 | |
| 243 :param taxa_dict: Mapping of taxa (full string) -> read counts. | |
| 244 :type taxa_dict: dict[str, int] | 269 :type taxa_dict: dict[str, int] |
| 245 :param args: Parsed script arguments, including parameters for taxa calculation. | 270 :param args: Argument namespace containing resolution parameters. |
| 246 :type args: argparse.Namespace | 271 :type args: argparse.Namespace |
| 247 :return: A list of refined taxa assignments (dictionaries), where each dictionary | 272 :return: List of taxonomic groups after recursive resolution. |
| 248 represents a potentially split sub-cluster. | 273 :rtype: list[dict[str, int]] |
| 249 :rtype: list[dict[str, float | int]] | 274 """ |
| 250 """ | |
| 251 # Replace 'Unannotated read' with uncertain taxa format | |
| 252 processed_dict = {} | 275 processed_dict = {} |
| 253 for taxa, count in taxa_dict.items(): | 276 for taxa, count in taxa_dict.items(): |
| 254 if taxa == 'Unannotated read': | 277 if taxa == "Unannotated read": |
| 255 uncertain_taxa = ' / '.join(['Uncertain taxa'] * 7) | 278 uncertain_taxa = " / ".join(["Uncertain taxa"] * 7) |
| 256 processed_dict[uncertain_taxa] = count | 279 processed_dict[uncertain_taxa] = count |
| 257 else: | 280 else: |
| 258 processed_dict[taxa] = count | 281 processed_dict[taxa] = count |
| 259 | 282 |
| 260 return _recursive_taxa_calculation(processed_dict, args, 0) | 283 return _recursive_taxa_calculation(processed_dict, args, 0) |
| 261 | 284 |
| 262 | 285 |
| 263 def _recursive_taxa_calculation(taxa_dict, args, level): | 286 def _recursive_taxa_calculation(taxa_dict, args, level): |
| 264 """ | 287 """Recursive helper performing level-by-level weighted LCA resolution. |
| 265 Recursive helper to calculate and potentially split taxa at each taxonomic level. | 288 |
| 266 | 289 At each taxonomic rank, taxa are grouped and compared. If multiple |
| 267 :param taxa_dict: Taxa counts at the current level (or sub-group). | 290 taxa exceed the configured split thresholds, the cluster is divided |
| 268 :type taxa_dict: dict[str, float | int] | 291 and recursion continues deeper. |
| 269 :param args: Parsed script arguments. | 292 |
| 293 :param taxa_dict: Mapping of taxa to count for this recursion stage. | |
| 294 :type taxa_dict: dict[str, int] | |
| 295 :param args: Parameter namespace controlling uncertainty and split rules. | |
| 270 :type args: argparse.Namespace | 296 :type args: argparse.Namespace |
| 271 :param level: Index of the current taxonomic level (0=kingdom, max 6=species). | 297 :param level: Current taxonomy depth (0–6). |
| 272 :type level: int | 298 :type level: int |
| 273 :return: List of refined taxa dictionaries. | 299 :return: One or more resolved taxon groups at deeper levels. |
| 274 :rtype: list[dict[str, float | int]] | 300 :rtype: list[dict[str, int]] |
| 275 """ | 301 """ |
| 276 if level >= 7: # Max 7 taxonomic levels | 302 if level >= 7: |
| 277 return [taxa_dict] | 303 return [taxa_dict] |
| 278 | 304 |
| 279 level_dict = defaultdict(float) | 305 level_dict = defaultdict(float) |
| 280 | |
| 281 # Group by taxonomic level | |
| 282 for taxa, count in taxa_dict.items(): | 306 for taxa, count in taxa_dict.items(): |
| 283 taxa_parts = taxa.split(' / ') | 307 taxa_parts = taxa.split(" / ") |
| 284 if level < len(taxa_parts): | 308 if level < len(taxa_parts): |
| 285 level_taxon = taxa_parts[level] | 309 level_taxon = taxa_parts[level] |
| 286 if level_taxon == 'Uncertain taxa': | 310 if level_taxon == "Uncertain taxa": |
| 287 level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio | 311 level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio |
| 288 else: | 312 else: |
| 289 level_dict[level_taxon] += count | 313 level_dict[level_taxon] += count |
| 290 | 314 |
| 291 if len(level_dict) <= 1: | 315 if len(level_dict) <= 1: |
| 292 # Only one taxon at this level, continue to next level | |
| 293 return _recursive_taxa_calculation(taxa_dict, args, level + 1) | 316 return _recursive_taxa_calculation(taxa_dict, args, level + 1) |
| 294 | 317 |
| 295 # Sort by abundance | |
| 296 sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True) | 318 sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True) |
| 297 | |
| 298 result = [] | 319 result = [] |
| 299 dominant_taxon = sorted_taxa[0][0] | 320 dominant_taxon = sorted_taxa[0][0] |
| 300 | 321 |
| 301 # Check if we should split | |
| 302 for i in range(1, len(sorted_taxa)): | 322 for i in range(1, len(sorted_taxa)): |
| 303 secondary_taxon = sorted_taxa[i][0] | 323 secondary_taxon = sorted_taxa[i][0] |
| 304 total_count = sorted_taxa[0][1] + sorted_taxa[i][1] | 324 total_count = sorted_taxa[0][1] + sorted_taxa[i][1] |
| 305 ratio = sorted_taxa[i][1] / total_count | 325 ratio = sorted_taxa[i][1] / total_count |
| 306 | 326 |
| 307 if ratio >= args.min_to_split or total_count <= args.min_count_to_split: | 327 if ratio >= args.min_to_split or total_count <= args.min_count_to_split: |
| 308 # Split off this taxon | 328 split_dict = { |
| 309 split_dict = {taxa: count for taxa, count in taxa_dict.items() | 329 taxa: count |
| 310 if taxa.split(' / ')[level] == secondary_taxon} | 330 for taxa, count in taxa_dict.items() |
| 331 if taxa.split(" / ")[level] == secondary_taxon | |
| 332 } | |
| 311 result.extend(_recursive_taxa_calculation(split_dict, args, level + 1)) | 333 result.extend(_recursive_taxa_calculation(split_dict, args, level + 1)) |
| 312 | 334 |
| 313 # Process the dominant group | 335 dominant_dict = { |
| 314 dominant_dict = {taxa: count for taxa, count in taxa_dict.items() | 336 taxa: count |
| 315 if taxa.split(' / ')[level] == dominant_taxon} | 337 for taxa, count in taxa_dict.items() |
| 338 if taxa.split(" / ")[level] == dominant_taxon | |
| 339 } | |
| 316 result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1)) | 340 result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1)) |
| 317 | 341 |
| 318 return result | 342 return result |
| 319 | 343 |
| 320 | 344 |
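To make the split rule concrete: with the defaults `min_to_split=0.45` and `min_count_to_split=10`, level counts of 60 vs. 55 give `ratio = 55 / 115 ≈ 0.478`, which triggers a split, while 60 vs. 5 gives `ratio ≈ 0.077` with `total = 65 > 10`, so the minority taxon is absorbed into the dominant branch. A small sketch mirroring the decision above:

```python
def should_split(dominant, secondary, min_to_split=0.45, min_count_to_split=10):
    """Illustrative mirror of the branch test in _recursive_taxa_calculation."""
    total = dominant + secondary
    return secondary / total >= min_to_split or total <= min_count_to_split

print(should_split(60, 55))  # True: ratio 0.478 >= 0.45
print(should_split(60, 5))   # False: ratio 0.077 and total 65 > 10
print(should_split(6, 2))    # True: total 8 <= 10 forces a split at low support
```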
| 321 def write_similarity_output(all_simi_data, output_file): | 345 |
| 322 """ | 346 def write_similarity_output(cluster_data_list, output_file, log_messages=None): |
| 323 Write the similarity text output, including the mean and standard deviation, | 347 """Write similarity statistics and per-cluster distributions. |
| 324 and a count per similarity percentage. | 348 |
| 325 | 349 Output includes mean, standard deviation, and counts of each |
| 326 :param all_simi_data: List of all similarity percentages from all reads. | 350 similarity value per cluster. |
| 327 :type all_simi_data: list[float] | 351 |
| 328 :param output_file: Path to the output file. | 352 :param cluster_data_list: List of processed cluster dictionaries. |
| 353 :type cluster_data_list: list[dict] | |
| 354 :param output_file: Destination path for the TSV summary. | |
| 329 :type output_file: str | 355 :type output_file: str |
| 356 :param log_messages: Optional log collector. | |
| 357 :type log_messages: list[str] or None | |
| 330 :return: None | 358 :return: None |
| 331 :rtype: None | 359 :rtype: None |
| 332 """ | 360 """ |
| 333 if not all_simi_data: | 361 all_simi = [] |
| 362 pair_counter = Counter() | |
| 363 | |
| 364 for cluster_index, cluster_data in enumerate(cluster_data_list): | |
| 365 simi_list = cluster_data["similarities"] | |
| 366 if not simi_list: | |
| 367 continue | |
| 368 all_simi.extend(simi_list) | |
| 369 for s in simi_list: | |
| 370 pair_counter[(cluster_index, s)] += 1 | |
| 371 | |
| 372 if not all_simi: | |
| 373 if log_messages is not None: | |
| 374 log_message( | |
| 375 log_messages, | |
| 376 "No similarity data found; similarity text file not written.", | |
| 377 ) | |
| 334 return | 378 return |
| 335 | 379 |
| 336 avg = sum(all_simi_data) / len(all_simi_data) | 380 avg = sum(all_simi) / len(all_simi) |
| 337 variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data) | 381 variance = sum((s - avg) ** 2 for s in all_simi) / len(all_simi) |
| 338 st_dev = sqrt(variance) | 382 st_dev = sqrt(variance) |
| 339 | 383 |
| 340 simi_counter = Counter(all_simi_data) | 384 with open(output_file, "w") as f: |
| 341 simi_sorted = sorted(simi_counter.items(), key=lambda x: -x[0]) | 385 f.write(f"# Average similarity (all reads): {avg:.2f}\n") |
| 342 | 386 f.write(f"# Standard deviation (all reads): {st_dev:.2f}\n") |
| 343 with open(output_file, 'w') as f: | 387 f.write("cluster\tsimilarity\tcount\n") |
| 344 f.write(f"# Average similarity: {avg:.2f}\n") | 388 |
| 345 f.write(f"# Standard deviation: {st_dev:.2f}\n") | 389 for (cluster_index, similarity), count in sorted( |
| 346 f.write("similarity\tcount\n") | 390 pair_counter.items(), key=lambda x: (x[0][0], -x[0][1]) |
| 347 for similarity, count in simi_sorted: | 391 ): |
| 348 f.write(f"{similarity}\t{count}\n") | 392 f.write(f"{cluster_index}\t{similarity}\t{count}\n") |
| 349 | 393 |
| 350 | 394 if log_messages is not None: |
| 351 def write_evalue_output(all_eval_data, output_file): | 395 log_message(log_messages, "Similarity text summary written successfully") |
| 352 """ | 396 log_message( |
| 353 Write the E-value text output, including the count of unannotated reads | 397 log_messages, f"Similarity mean = {avg:.2f}, std = {st_dev:.2f}" |
| 354 and a count per E-value. | 398 ) |
| 355 | 399 |
| 356 :param all_eval_data: List of E-values from all reads. The first element ([0]) is the count of unannotated reads. | 400 |
| 357 :type all_eval_data: list[float | int] | 401 def write_count_output(cluster_data_list, output_file, log_messages=None): |
| 358 :param output_file: Path to the output file. | 402 """Write counts of annotated and unannotated reads per cluster. |
| 403 | |
| 404 A summary row containing total and percentage values is appended. | |
| 405 | |
| 406 :param cluster_data_list: List of processed clusters. | |
| 407 :type cluster_data_list: list[dict] | |
| 408 :param output_file: Path for the count summary text file. | |
| 359 :type output_file: str | 409 :type output_file: str |
| 410 :param log_messages: Optional logging accumulator. | |
| 411 :type log_messages: list[str] or None | |
| 360 :return: None | 412 :return: None |
| 361 :rtype: None | 413 :rtype: None |
| 362 """ | 414 """ |
| 363 unanno_count = all_eval_data[0] | 415 total_annotated = 0 |
| 364 eval_counter = Counter(all_eval_data[1:]) | 416 total_unannotated = 0 |
| 365 | 417 |
| 366 with open(output_file, 'w') as f: | 418 with open(output_file, "w") as f: |
| 367 f.write("evalue\tcount\n") | 419 f.write( |
| 368 if unanno_count > 0: | 420 "cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n" |
| 369 f.write(f"unannotated\t{unanno_count}\n") | 421 ) |
| 370 | 422 |
| 371 eval_sorted = sorted(eval_counter.items(), | 423 for cluster_index, cluster_data in enumerate(cluster_data_list): |
| 372 key=lambda x: (-x[1], float(x[0]) if isinstance(x[0], (int, float)) else float('inf'))) | 424 unannotated = cluster_data["unannotated"] |
| 373 for value, count in eval_sorted: | 425 annotated = cluster_data["annotated"] |
| 374 f.write(f"{value}\t{count}\n") | |
| 375 | |
| 376 | |
| 377 def write_count_output(all_eval_data, cluster_data_list, output_file): | |
| 378 """ | |
| 379 Write a summary of annotated and unannotated read counts per cluster | |
| 380 and for the total sample. | |
| 381 | |
| 382 :param all_eval_data: List of E-values from all reads for the total count summary. | |
| 383 :type all_eval_data: list[float | int] | |
| 384 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. | |
| 385 :type cluster_data_list: list[tuple] | |
| 386 :param output_file: Path to the output file. | |
| 387 :type output_file: str | |
| 388 :return: None | |
| 389 :rtype: None | |
| 390 """ | |
| 391 with open(output_file, 'w') as f: | |
| 392 f.write("cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n") | |
| 393 | |
| 394 for cluster_num, (eval_list, _, _) in enumerate(cluster_data_list): | |
| 395 unannotated = eval_list[0] | |
| 396 annotated = len(eval_list[1:]) | |
| 397 total = unannotated + annotated | 426 total = unannotated + annotated |
| 427 | |
| 428 total_annotated += annotated | |
| 429 total_unannotated += unannotated | |
| 398 | 430 |
| 399 if total > 0: | 431 if total > 0: |
| 400 perc_annotated = (annotated / total) * 100 | 432 perc_annotated = (annotated / total) * 100 |
| 401 perc_unannotated = (unannotated / total) * 100 | 433 perc_unannotated = (unannotated / total) * 100 |
| 402 else: | 434 else: |
| 403 perc_annotated = perc_unannotated = 0 | 435 perc_annotated = perc_unannotated = 0.0 |
| 404 | 436 |
| 405 f.write( | 437 f.write( |
| 406 f"{cluster_num}\t{unannotated}\t{annotated}\t{total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n") | 438 f"{cluster_index}\t{unannotated}\t{annotated}\t{total}" |
| 407 | 439 f"\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n" |
| 408 # Add full sample summary | 440 ) |
| 409 total_unannotated = all_eval_data[0] | 441 |
| 410 total_annotated = len(all_eval_data[1:]) | 442 grand_total = total_annotated + total_unannotated |
| 411 grand_total = total_unannotated + total_annotated | |
| 412 | |
| 413 if grand_total > 0: | 443 if grand_total > 0: |
| 414 perc_annotated = (total_annotated / grand_total) * 100 | 444 perc_annotated = (total_annotated / grand_total) * 100 |
| 415 perc_unannotated = (total_unannotated / grand_total) * 100 | 445 perc_unannotated = (total_unannotated / grand_total) * 100 |
| 416 else: | 446 else: |
| 417 perc_annotated = perc_unannotated = 0 | 447 perc_annotated = perc_unannotated = 0.0 |
| 418 | 448 |
| 419 f.write( | 449 f.write( |
| 420 f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n") | 450 f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}" |
| 421 | 451 f"\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n" |
| 422 | 452 ) |
| 423 def write_taxa_clusters_output(cluster_data_list, output_file): | 453 |
| 424 """ | 454 if log_messages is not None: |
| 425 Write raw taxa information per cluster to an Excel file. | 455 log_message(log_messages, "Count summary written successfully") |
| 426 | 456 log_message( |
| 427 Each row contains the cluster number, read count, the full taxa string, | 457 log_messages, |
| 428 and the separate taxonomic levels (kingdom through species). | 458 f"TOTAL annotated={total_annotated}, unannotated={total_unannotated}, total={grand_total}", |
| 429 | 459 ) |
| 430 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. | 460 |
| 431 :type cluster_data_list: list[tuple] | 461 |
| 432 :param output_file: Path to the output Excel file. | 462 def write_taxa_excel(cluster_data_list, args, output_file, write_raw: bool, write_processed: bool, log_messages=None): |
| 463 """Write raw and processed taxa data into a combined Excel report. | |
| 464 | |
| 465 Generates up to three sheets: | |
| 466 | |
| 467 - ``Raw_Taxa_Clusters``: Full taxa per read cluster (if enabled). | |
| 468 - ``Processed_Taxa_Clusters``: LCA-resolved taxa (if enabled). | |
| 469 - ``Settings``: Parameters used for taxonomic resolution. | |
| 470 | |
| 471 seq_id and source tracking are kept only in the raw sheet. | |
| 472 | |
| 473 :param cluster_data_list: List containing similarity/taxa data per cluster. | |
| 474 :type cluster_data_list: list[dict] | |
| 475 :param args: Parsed arguments containing LCA configuration. | |
| 476 :type args: argparse.Namespace | |
| 477 :param output_file: Path to the combined Excel file. | |
| 433 :type output_file: str | 478 :type output_file: str |
| 479 :param write_raw: Whether to include the raw taxa sheet. | |
| 480 :type write_raw: bool | |
| 481 :param write_processed: Whether to include the processed/LCA sheet. | |
| 482 :type write_processed: bool | |
| 483 :param log_messages: Optional log collector. | |
| 484 :type log_messages: list[str] or None | |
| 434 :return: None | 485 :return: None |
| 435 :rtype: None | 486 :rtype: None |
| 436 """ | 487 """ |
| 437 # Create main dataframe | 488 raw_rows = [] |
| 438 data = [] | 489 processed_rows = [] |
| 439 for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list): | 490 |
| 440 for taxa, count in taxa_dict.items(): | 491 if write_raw: |
| 441 if count > 0: | 492 for cluster_index, cluster_data in enumerate(cluster_data_list): |
| 442 # Split taxa into taxonomic levels | 493 taxa_map = cluster_data["taxa_map"] |
| 443 taxa_levels = taxa.split(' / ') if taxa else [] | 494 for taxa, info in taxa_map.items(): |
| 444 taxa_levels += ['Unannotated read'] * (7 - len(taxa_levels)) | 495 count = info["count"] |
| 445 try: | 496 if count <= 0: |
| 446 data.append({ | |
| 447 'cluster': cluster_num, | |
| 448 'count': count, | |
| 449 'taxa_full': taxa, | |
| 450 'kingdom': taxa_levels[0], | |
| 451 'phylum': taxa_levels[1], | |
| 452 'class': taxa_levels[2], | |
| 453 'order': taxa_levels[3], | |
| 454 'family': taxa_levels[4], | |
| 455 'genus': taxa_levels[5], | |
| 456 'species': taxa_levels[6] | |
| 457 }) | |
| 458 except IndexError as e: | |
| 459 # Skip entries with incomplete taxonomic data | |
| 460 print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}, error: {e}'") | |
| 461 continue | 497 continue |
| 462 | 498 |
| 463 df = pd.DataFrame(data) | 499 taxa_levels = taxa.split(" / ") if taxa else [] |
| 464 # Write to Excel | 500 while len(taxa_levels) < 7: |
| 465 temp_path = output_file + ".xlsx" | 501 taxa_levels.append("Unannotated") |
| 502 | |
| 503 seq_ids_str = ",".join(sorted(info["seq_ids"])) | |
| 504 sources_str = ",".join(sorted(info["sources"])) | |
| 505 | |
| 506 raw_rows.append( | |
| 507 { | |
| 508 "cluster": cluster_index, | |
| 509 "count": count, | |
| 510 "seq_id": seq_ids_str, | |
| 511 "source": sources_str, | |
| 512 "taxa_full": taxa, | |
| 513 "kingdom": taxa_levels[0], | |
| 514 "phylum": taxa_levels[1], | |
| 515 "class": taxa_levels[2], | |
| 516 "order": taxa_levels[3], | |
| 517 "family": taxa_levels[4], | |
| 518 "genus": taxa_levels[5], | |
| 519 "species": taxa_levels[6], | |
| 520 } | |
| 521 ) | |
| 522 | |
| 523 if write_processed: | |
| 524 for cluster_index, cluster_data in enumerate(cluster_data_list): | |
| 525 taxa_map = cluster_data["taxa_map"] | |
| 526 | |
| 527 annotated_support = sum( | |
| 528 info["count"] | |
| 529 for taxa, info in taxa_map.items() | |
| 530 if taxa != "Unannotated read" | |
| 531 ) | |
| 532 | |
| 533 if annotated_support < args.min_cluster_support: | |
| 534 continue | |
| 535 | |
| 536 taxa_counts = {taxa: info["count"] for taxa, info in taxa_map.items()} | |
| 537 processed_groups = calculate_cluster_taxa(taxa_counts, args) | |
| 538 | |
| 539 for group in processed_groups: | |
| 540 for taxa, count in group.items(): | |
| 541 if count <= 0: | |
| 542 continue | |
| 543 | |
| 544 if "Uncertain taxa / Uncertain taxa / Uncertain taxa" in taxa: | |
| 545 continue | |
| 546 else: | |
| 547 info = taxa_map.get(taxa) | |
| 548 seq_ids_set = info["seq_ids"] | |
| 549 sources_set = info["sources"] | |
| 550 taxa_full = taxa | |
| 551 taxa_levels = taxa.split(" / ") if taxa else [] | |
| 552 while len(taxa_levels) < 7: | |
| 553 taxa_levels.append("Unannotated") | |
| 554 | |
| 555 seq_ids_str = ",".join(sorted(seq_ids_set)) | |
| 556 sources_str = ",".join(sorted(sources_set)) | |
| 557 processed_rows.append( | |
| 558 { | |
| 559 "cluster": cluster_index, | |
| 560 "count": count, | |
| 561 "seq_id": seq_ids_str, | |
| 562 "source": sources_str, | |
| 563 "taxa_full": taxa_full, | |
| 564 "kingdom": taxa_levels[0], | |
| 565 "phylum": taxa_levels[1], | |
| 566 "class": taxa_levels[2], | |
| 567 "order": taxa_levels[3], | |
| 568 "family": taxa_levels[4], | |
| 569 "genus": taxa_levels[5], | |
| 570 "species": taxa_levels[6], | |
| 571 } | |
| 572 ) | |
| 573 | |
| 574 if not raw_rows and not processed_rows: | |
| 575 if log_messages is not None: | |
| 576 log_message( | |
| 577 log_messages, | |
| 578 "No taxa data to write; taxa Excel file not created.", | |
| 579 ) | |
| 580 return | |
| 581 | |
| 582 raw_df = pd.DataFrame(raw_rows) if raw_rows else None | |
| 583 processed_df = pd.DataFrame(processed_rows) if processed_rows else None | |
| 584 | |
| 585 settings_data = [ | |
| 586 ["uncertain_taxa_use_ratio", args.uncertain_taxa_use_ratio], | |
| 587 ["min_to_split", args.min_to_split], | |
| 588 ["min_count_to_split", args.min_count_to_split], | |
| 589 ["min_cluster_support", args.min_cluster_support] | |
| 590 ] | |
| 591 settings_df = pd.DataFrame(settings_data, columns=["Parameter", "Value"]) | |
| 592 | |
| 593 temp_path = output_file + ".tmp.xlsx" | |
| 466 os.makedirs(os.path.dirname(temp_path), exist_ok=True) | 594 os.makedirs(os.path.dirname(temp_path), exist_ok=True) |
| 467 with pd.ExcelWriter(temp_path, engine='openpyxl') as writer: | 595 with pd.ExcelWriter(temp_path, engine="openpyxl") as writer: |
| 468 df.to_excel(writer, sheet_name='Raw_Taxa_Clusters', index=False, engine='openpyxl') | 596 if raw_df is not None: |
| 469 os.replace(temp_path, output_file) | 597 raw_df.to_excel( |
| 470 | 598 writer, sheet_name="Raw_Taxa_Clusters", index=False |
| 471 def write_taxa_processed_output(cluster_data_list, args, output_file): | 599 ) |
| 472 """ | 600 if processed_df is not None: |
| 473 Write processed (potentially split) taxa information to an Excel file. | 601 processed_df.to_excel( |
| 474 | 602 writer, sheet_name="Processed_Taxa_Clusters", index=False |
| 475 This file contains the resulting sub-clusters from the taxonomic dominance | 603 ) |
| 476 analysis and a separate sheet documenting the parameters used. | 604 settings_df.to_excel(writer, sheet_name="Settings", index=False) |
| 477 | 605 |
| 478 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. | |
| 479 :type cluster_data_list: list[tuple] | |
| 480 :param args: Parsed script arguments, used for taxa calculation and settings documentation. | |
| 481 :type args: argparse.Namespace | |
| 482 :param output_file: Path to the output Excel file. | |
| 483 :type output_file: str | |
| 484 :return: None | |
| 485 :rtype: None | |
| 486 """ | |
| 487 # Create main dataframe | |
| 488 data = [] | |
| 489 for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list): | |
| 490 processed_taxa = calculate_cluster_taxa(taxa_dict, args) | |
| 491 | |
| 492 for taxa_group in processed_taxa: | |
| 493 for taxa, count in taxa_group.items(): | |
| 494 if 'Uncertain taxa / Uncertain taxa / Uncertain taxa' in taxa: | |
| 495 if args.show_unannotated_clusters: | |
| 496 data.append({ | |
| 497 'cluster': cluster_num, | |
| 498 'count': count, | |
| 499 'taxa_full': 'Unannotated read', | |
| 500 'kingdom': 'Unannotated', | |
| 501 'phylum': 'Unannotated', | |
| 502 'class': 'Unannotated', | |
| 503 'order': 'Unannotated', | |
| 504 'family': 'Unannotated', | |
| 505 'genus': 'Unannotated', | |
| 506 'species': 'Unannotated' | |
| 507 }) | |
| 508 else: | |
| 509 # Split taxa into taxonomic levels | |
| 510 taxa_levels = taxa.split(' / ') if taxa else [] | |
| 511 | |
| 512 try: | |
| 513 data.append({ | |
| 514 'cluster': cluster_num, | |
| 515 'count': count, | |
| 516 'taxa_full': taxa, | |
| 517 'kingdom': taxa_levels[0], | |
| 518 'phylum': taxa_levels[1], | |
| 519 'class': taxa_levels[2], | |
| 520 'order': taxa_levels[3], | |
| 521 'family': taxa_levels[4], | |
| 522 'genus': taxa_levels[5], | |
| 523 'species': taxa_levels[6] | |
| 524 }) | |
| 525 except IndexError: | |
| 526 # Skip entries with incomplete taxonomic data | |
| 527 print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}'") | |
| 528 continue | |
| 529 | |
| 530 df = pd.DataFrame(data) | |
| 531 | |
| 532 # Create settings dataframe | |
| 533 settings_data = [ | |
| 534 ['uncertain_taxa_use_ratio', args.uncertain_taxa_use_ratio], | |
| 535 ['min_to_split', args.min_to_split], | |
| 536 ['min_count_to_split', args.min_count_to_split] | |
| 537 ] | |
| 538 settings_df = pd.DataFrame(settings_data, columns=['Parameter', 'Value']) | |
| 539 | |
| 540 # Write to Excel with multiple sheets | |
| 541 temp_path = output_file + ".xlsx" | |
| 542 os.makedirs(os.path.dirname(temp_path), exist_ok=True) | |
| 543 with pd.ExcelWriter(temp_path, engine='openpyxl') as writer: | |
| 544 df.to_excel(writer, sheet_name='Processed_Taxa_Clusters', index=False, engine='openpyxl') | |
| 545 settings_df.to_excel(writer, sheet_name='Settings', index=False, engine='openpyxl') | |
| 546 | |
| 547 # Auto-adjust column widths for better readability | |
| 548 for sheet_name in writer.sheets: | 606 for sheet_name in writer.sheets: |
| 549 worksheet = writer.sheets[sheet_name] | 607 worksheet = writer.sheets[sheet_name] |
| 550 for column in worksheet.columns: | 608 for column in worksheet.columns: |
| 551 max_length = 0 | 609 max_length = 0 |
| 552 column_letter = column[0].column_letter | 610 column_letter = column[0].column_letter |
| 553 for cell in column: | 611 for cell in column: |
| 554 try: | 612 try: |
| 555 if len(str(cell.value)) > max_length: | 613 cell_len = len(str(cell.value)) |
| 556 max_length = len(str(cell.value)) | 614 if cell_len > max_length: |
| 557 except: | 615 max_length = cell_len |
| 616 except AttributeError: | |
| 558 pass | 617 pass |
| 559 adjusted_width = min(max_length + 2, 50) # Cap at 50 characters | 618 adjusted_width = min(max_length + 2, 50) |
| 560 worksheet.column_dimensions[column_letter].width = adjusted_width | 619 worksheet.column_dimensions[column_letter].width = adjusted_width |
| 620 | |
| 561 os.replace(temp_path, output_file) | 621 os.replace(temp_path, output_file) |
| 562 | 622 |
| 563 def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file): | 623 if log_messages is not None: |
| 564 """ | 624 if raw_df is not None: |
| 565 Create a bar plot showing the distribution of intra-cluster similarity values. | 625 log_message(log_messages, "Raw taxa per cluster written successfully") |
| 566 | 626 if processed_df is not None: |
| 567 The plot uses different colors to distinguish reads belonging to different clusters. | 627 log_message( |
| 568 | 628 log_messages, |
| 569 :param all_simi_data: List of all similarity values, sorted descending. | 629 "Processed taxa (split clusters) written successfully", |
| 630 ) | |
| 631 | |
| 632 | |
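Note that the `min_cluster_support` gate above counts only annotated reads, so a cluster dominated by unannotated reads can still be dropped from the processed sheet. A minimal illustration of the same filter (values are made up):

```python
# taxa_map mirrors the shape produced by process_cluster_data.
taxa_map = {
    "Animalia / Chordata": {"count": 4, "seq_ids": {"AB1"}, "sources": {"db"}},
    "Unannotated read": {"count": 50, "seq_ids": {"Unannotated read"},
                         "sources": {"Unannotated read"}},
}
min_cluster_support = 5
annotated_support = sum(
    info["count"] for taxa, info in taxa_map.items() if taxa != "Unannotated read"
)
print(annotated_support >= min_cluster_support)  # False: skipped despite 54 reads
```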
| 633 def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file, log_messages=None): | |
| 634 """Create and save a similarity distribution bar plot. | |
| 635 | |
| 636 Bars are colored per cluster using a fixed repeating colormap. | |
| 637 | |
| 638 :param all_simi_data: All similarity values across clusters. | |
| 570 :type all_simi_data: list[float] | 639 :type all_simi_data: list[float] |
| 571 :param cluster_simi_lengths: List of lengths of similarity data per cluster, used for coloring. | 640 :param cluster_simi_lengths: Per-cluster similarity list lengths. |
| 572 :type cluster_simi_lengths: list[int] | 641 :type cluster_simi_lengths: list[int] |
| 573 :param args: Parsed script arguments, used for plot y-limits. | 642 :param args: Namespace containing y-axis plot limits. |
| 574 :type args: argparse.Namespace | 643 :type args: argparse.Namespace |
| 575 :param output_file: Path to the output plot file (e.g., .png). | 644 :param output_file: Output PNG file path. |
| 576 :type output_file: str | 645 :type output_file: str |
| 646 :param log_messages: Optional log message list. | |
| 647 :type log_messages: list[str] or None | |
| 577 :return: None | 648 :return: None |
| 578 :rtype: None | 649 :rtype: None |
| 579 """ | 650 """ |
| 580 if not all_simi_data: | 651 if not all_simi_data: |
| 652 if log_messages is not None: | |
| 653 log_message(log_messages, "No similarity data; similarity plot not created.") | |
| 581 return | 654 return |
| 582 | 655 |
| 583 sorted_simi_list = sorted(all_simi_data, reverse=True) | 656 sorted_simi_list = sorted(all_simi_data, reverse=True) |
| 584 bar_positions = list(range(len(sorted_simi_list))) | 657 bar_positions = list(range(len(sorted_simi_list))) |
| 585 | 658 |
| 586 # Create colormap for different clusters | |
| 587 colormap_full = [] | 659 colormap_full = [] |
| 588 for i, length in enumerate(cluster_simi_lengths): | 660 for i, length in enumerate(cluster_simi_lengths): |
| 589 color = COLORMAP[i % len(COLORMAP)] | 661 color = COLORMAP[i % len(COLORMAP)] |
| 590 colormap_full.extend([color] * length) | 662 colormap_full.extend([color] * length) |
| 591 | 663 |
| 592 plt.figure(figsize=(12, 6)) | 664 plt.figure(figsize=(12, 6)) |
| 593 plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full) | 665 plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full) |
| 594 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) | 666 plt.grid(axis="y", linestyle="--", color="gray", alpha=0.7) |
| 595 plt.ylabel("Similarity (%)") | 667 plt.ylabel("Similarity (%)") |
| 596 plt.xlabel("Reads") | 668 plt.xlabel("Reads") |
| 597 plt.title("Intra-cluster Similarity Distribution") | 669 plt.title("Intra-cluster Similarity Distribution") |
| 598 plt.ylim(ymin=args.simi_plot_y_min, ymax=args.simi_plot_y_max) | 670 plt.ylim(ymin=args.simi_plot_y_min, ymax=args.simi_plot_y_max) |
| 599 plt.tight_layout() | 671 plt.tight_layout() |
| 600 | 672 |
| 601 plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight') | 673 plt.savefig(output_file, format="png", dpi=300, bbox_inches="tight") |
| 602 plt.close() | 674 plt.close() |
| 603 | 675 |
| 604 | 676 if log_messages is not None: |
| 605 def create_evalue_plot(all_eval_data, cluster_eval_lengths, output_file): | 677 log_message(log_messages, "Similarity plot written successfully") |
| 606 """ | 678 |
| 607 Create a bar plot showing the distribution of E-values. | 679 |
| 608 | 680 def summarize_and_log(cluster_data_list, all_simi_data, log_messages, args): |
| 609 The y-axis is log-scaled and displays ``1/E-values``. Reads are ordered | 681 """Compute global summary statistics and append them to the log. |
| 610 by E-value (ascending). The plot uses different colors to distinguish reads | 682 |
| 611 belonging to different clusters. | 683 Summary includes: |
| 612 | 684 - total clusters |
| 613 :param all_eval_data: List of E-values from all reads. Assumes E-values start at index 1. | 685 - total annotated/unannotated reads |
| 614 :type all_eval_data: list[float | int] | 686 - per-cluster annotation presence |
| 615 :param cluster_eval_lengths: List of lengths of annotated E-value data per cluster, used for coloring. | 687 - top taxa distribution |
| 616 :type cluster_eval_lengths: list[int] | 688 - similarity mean and standard deviation |
| 617 :param output_file: Path to the output plot file (e.g., .png). | 689 |
| 618 :type output_file: str | 690 :param cluster_data_list: List of processed cluster descriptors. |
| 691 :type cluster_data_list: list[dict] | |
| 692 :param all_simi_data: All similarity values from all clusters. | |
| 693 :type all_simi_data: list[float] | |
| 694 :param log_messages: List collecting log output lines. | |
| 695 :type log_messages: list[str] | |
| 696 :param args: Argument namespace with configuration parameters. | |
| 697 :type args: argparse.Namespace | |
| 619 :return: None | 698 :return: None |
| 620 :rtype: None | 699 :rtype: None |
| 621 """ | 700 """ |
| 622 if len(all_eval_data) <= 1: # Only unannotated reads | 701 total_clusters = len(cluster_data_list) |
| 623 return | 702 total_annotated = sum(c["annotated"] for c in cluster_data_list) |
| 624 | 703 total_unannotated = sum(c["unannotated"] for c in cluster_data_list) |
| 625 sorted_eval_list = sorted(all_eval_data[1:]) # Skip unannotated count | 704 grand_total = total_annotated + total_unannotated |
| 626 | 705 |
| 627 if not sorted_eval_list: | 706 clusters_with_annotations = sum( |
| 628 return | 707 1 for c in cluster_data_list if c["annotated"] > 0 |
| 629 | 708 ) |
| 630 bar_positions = list(range(len(sorted_eval_list))) | 709 clusters_unannotated_only = total_clusters - clusters_with_annotations |
| 631 bar_heights = [1 / e if e > 0 else 0 for e in sorted_eval_list] | 710 |
| 632 | 711 log_message(log_messages, "=== SUMMARY ===") |
| 633 # Create colormap for different clusters | 712 log_message(log_messages, f"Clusters parsed: {total_clusters}") |
| 634 colormap_full = [] | 713 log_message(log_messages, f"Total reads: {grand_total}") |
| 635 for i, length in enumerate(cluster_eval_lengths): | 714 log_message(log_messages, f"Annotated reads: {total_annotated}") |
| 636 color = COLORMAP[i % len(COLORMAP)] | 715 log_message(log_messages, f"Unannotated reads: {total_unannotated}") |
| 637 colormap_full.extend([color] * length) | 716 log_message(log_messages, f"Clusters with annotations: {clusters_with_annotations}") |
| 638 | 717 log_message(log_messages, f"Clusters fully unannotated: {clusters_unannotated_only}") |
| 639 plt.figure(figsize=(12, 6)) | 718 log_message(log_messages, f"Minimum cluster support for processed taxa: {args.min_cluster_support}") |
| 640 plt.bar(bar_positions, bar_heights, width=1, color=colormap_full) | 719 |
| 641 plt.yscale('log') | 720 taxa_counter = Counter() |
| 642 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) | 721 for c in cluster_data_list: |
| 643 plt.ylabel("1/E-values") | 722 for taxa, info in c["taxa_map"].items(): |
| 644 plt.xlabel("Reads") | 723 if taxa == "Unannotated read": |
| 645 plt.title("E-value Distribution") | 724 continue |
| 646 plt.tight_layout() | 725 taxa_counter[taxa] += info["count"] |
| 647 | 726 |
| 648 plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight') | 727 if taxa_counter: |
| 649 plt.close() | 728 log_message(log_messages, "=== TAXA SUMMARY (top 10) ===") |
| 650 | 729 for taxa, count in taxa_counter.most_common(10): |
| 651 def prepare_evalue_histogram(evalue_list, unannotated_list): | 730 log_message(log_messages, f"{taxa}: {count} reads") |
| 652 """ | 731 log_message(log_messages, f"Total unique taxa: {len(taxa_counter)}") |
| 653 Generate histogram data for E-value distributions. | 732 else: |
| 654 | 733 log_message(log_messages, "No annotated taxa found.") |
| 655 This function processes a list of E-values from BLAST or similar search | 734 |
| 656 results, filters out invalid or zero entries, and computes histogram data | 735 if all_simi_data: |
| 657 suitable for plotting. The histogram represents the frequency distribution | 736 avg = sum(all_simi_data) / len(all_simi_data) |
| 658 of E-values across all annotated hits. | 737 variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data) |
| 659 | 738 st_dev = sqrt(variance) |
| 660 :param evalue_list: List of E-values from BLAST hits | 739 log_message(log_messages, "=== SIMILARITY SUMMARY ===") |
| 661 :type evalue_list: list[float | int] | 740 log_message(log_messages, f"Mean similarity: {avg:.2f}") |
| 662 :param unannotated_list: List of unannotated E-values | 741 log_message(log_messages, f"Std similarity: {st_dev:.2f}") |
| 663 :type unannotated_list: list | 742 else: |
| 664 :return: Tuple containing: | 743 log_message(log_messages, "No similarity values available for summary.") |
| 665 - **counts** (*numpy.ndarray*): Number of entries per histogram bin. | 744 |
| 666 - **bins** (*numpy.ndarray*): Bin edges corresponding to the histogram. | |
| 667 Returns ``(None, None)`` if no valid data is available. | |
| 668 :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None] | |
| 669 :note: | |
| 670 - Only positive numeric E-values are included in the histogram. | |
| 671 - Uses 50 bins in the range (0, 1) for visualization consistency. | |
| 672 """ | |
| 673 data = [ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0] | |
| 674 if not data: | |
| 675 return None, None | |
| 676 | |
| 677 counts, bins, _ = plt.hist(data, bins=50, range=(0, 1)) | |
| 678 plt.close() | |
| 679 return counts, bins | |
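A hypothetical usage of this (removed) helper, illustrating the documented return shapes; the E-values are fabricated, and the zero entry is dropped by the filter:

```python
# bins=50 over (0, 1) yields 50 counts and 51 bin edges from plt.hist.
counts, bins = prepare_evalue_histogram([1e-30, 0.002, 0.5, 0], unannotated_list=[])
if counts is not None:
    assert len(counts) == 50 and len(bins) == 51
    assert counts.sum() == 3  # the zero E-value was filtered out
```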
| 680 | |
| 681 def create_evalue_plot_test(evalue_list, unannotated_list, output_file): | |
| 682 """ | |
| 683 Create and save an E-value distribution plot, returning the computed histogram data. | |
| 684 | |
| 685 This function visualizes the frequency distribution of E-values from BLAST or | |
| 686 annotation results. It saves the plot to the specified output file and returns | |
| 687 the histogram data (counts and bins) for testing with pytest. |
| 688 | |
| 689 :param evalue_list: List of numeric E-values to plot | |
| 690 :type evalue_list: list[float | int] | |
| 691 :param unannotated_list: Optional list of E-values for unannotated sequences. | |
| 692 :type unannotated_list: list | |
| 693 :param output_file: Path where the histogram image will be saved. | |
| 694 :type output_file: str | |
| 695 | |
| 696 :return: Tuple containing: | |
| 697 - **counts** (*numpy.ndarray*): Frequency counts per histogram bin. | |
| 698 - **bins** (*numpy.ndarray*): Histogram bin edges. | |
| 699 Returns ``(None, None)`` if no valid data was available for plotting. | |
| 700 :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None] | |
| 701 """ | |
| 702 counts, bins = prepare_evalue_histogram(evalue_list, unannotated_list) | |
| 703 if counts is None: | |
| 704 return None, None | |
| 705 | |
| 706 plt.hist([ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0], | |
| 707 bins=50, range=(0, 1)) | |
| 708 plt.xlabel("E-value") | |
| 709 plt.ylabel("Frequency") | |
| 710 plt.title("E-value Distribution") | |
| 711 plt.savefig(output_file) | |
| 712 plt.close() | |
| 713 return counts, bins | |
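Since the docstring mentions pytest, here is a sketch of how this removed helper could be exercised in a test (path and values hypothetical):

```python
# pytest-style check: the helper writes a PNG and returns histogram data.
def test_create_evalue_plot(tmp_path):
    out = tmp_path / "evalues.png"
    counts, bins = create_evalue_plot_test([0.001, 0.01, 0.5], [], str(out))
    assert out.exists()
    assert counts.sum() == 3  # all three positive E-values were binned
```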
| 714 | 745 |
| 715 | 746 |
| 716 def main(arg_list=None): | 747 def main(arg_list=None): |
| 717 """ | 748 """Main entry point for CD-HIT cluster processing. |
| 718 Main entry point of the script. | 749 |
| 719 | 750 Orchestrates parsing, cluster processing, statistic aggregation, |
| 720 Parses arguments, processes cd-hit cluster data, aggregates results, | 751 visualization, and generation of all requested outputs. |
| 721 and generates requested outputs (text summaries, plots, and Excel reports). | 752 |
| 722 | 753 :param arg_list: Optional list of arguments (used by tests). |
| 723 :param arg_list: List of arguments for testing purposes. | 754 :type arg_list: list[str] or None |
| 724 :type arg_list: list, optional | |
| 725 :return: None | 755 :return: None |
| 726 :rtype: None | 756 :rtype: None |
| 727 """ | 757 """ |
| 728 args = parse_arguments(arg_list) | 758 args = parse_arguments(arg_list) |
| 729 # Parse cluster file | 759 log_messages: list[str] = [] |
| 760 log_message(log_messages, "=== CD-HIT cluster processing started ===") | |
| 761 | |
| 762 log_message(log_messages, "cluster_file: provided") | |
| 763 log_message( | |
| 764 log_messages, | |
| 765 "annotation_file: provided" if args.input_annotation else "annotation_file: none", | |
| 766 ) | |
| 767 | |
| 768 log_message(log_messages, "=== PARAMETERS ===") | |
| 769 skip_keys = { | |
| 770 "output_similarity_txt", | |
| 771 "output_similarity_plot", | |
| 772 "output_count", | |
| 773 "output_excel", | |
| 774 "input_cluster", | |
| 775 "input_annotation", | |
| 776 "log_file", | |
| 777 } | |
| 778 for key, value in vars(args).items(): | |
| 779 if key in skip_keys: | |
| 780 continue | |
| 781 log_message(log_messages, f"{key}: {value}") | |
| 782 log_message(log_messages, "=== PARAMETERS END ===") |
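The parameter echo above walks `vars(args)` and skips the I/O keys. A tiny standalone illustration of that filtering (namespace fabricated):

```python
import argparse

ns = argparse.Namespace(min_cluster_support=2, input_cluster="in.clstr")
skip_keys = {"input_cluster"}
echoed = [f"{k}: {v}" for k, v in vars(ns).items() if k not in skip_keys]
assert echoed == ["min_cluster_support: 2"]
```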
| 783 | |
| 730 clusters = parse_cluster_file( | 784 clusters = parse_cluster_file( |
| 731 args.input_cluster, | 785 args.input_cluster, |
| 732 args.input_annotation, | 786 args.input_annotation, |
| 733 args.print_empty_files | 787 raise_on_error=False, |
| 788 log_messages=log_messages, | |
| 734 ) | 789 ) |
| 735 # Process each cluster | 790 |
| 736 all_eval_data = [0] # For full sample statistics | |
| 737 all_simi_data = [] | |
| 738 cluster_eval_lengths = [] | |
| 739 cluster_simi_lengths = [] | |
| 740 cluster_data_list = [] | 791 cluster_data_list = [] |
| 792 all_simi_data: list[float] = [] | |
| 793 cluster_simi_lengths: list[int] = [] | |
| 741 | 794 |
| 742 for cluster in clusters: | 795 for cluster in clusters: |
| 743 eval_list, simi_list, taxa_dict = process_cluster_data(cluster) | 796 simi_list, taxa_map, annotated_count, unannotated_count = process_cluster_data( |
| 744 cluster_data_list.append((eval_list, simi_list, taxa_dict)) | 797 cluster |
| 745 # Collect data for full sample plots | 798 ) |
| 746 all_eval_data[0] += eval_list[0] | 799 cluster_data = { |
| 747 if len(eval_list) > 1: | 800 "similarities": simi_list, |
| 748 all_eval_data.extend(sorted(eval_list[1:])) | 801 "taxa_map": taxa_map, |
| 749 cluster_eval_lengths.append(len(eval_list[1:])) | 802 "annotated": annotated_count, |
| 803 "unannotated": unannotated_count, | |
| 804 } | |
| 805 cluster_data_list.append(cluster_data) | |
| 750 | 806 |
| 751 if simi_list: | 807 if simi_list: |
| 752 all_simi_data.extend(sorted(simi_list, reverse=True)) | 808 all_simi_data.extend(simi_list) |
| 753 cluster_simi_lengths.append(len(simi_list)) | 809 cluster_simi_lengths.append(len(simi_list)) |
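For orientation, each entry appended to `cluster_data_list` above is a plain dict with the four keys consumed by the writers and the summary. An illustrative instance (values fabricated; `taxa_map` entries may carry more fields than `count`):

```python
cluster_data = {
    "similarities": [98.57, 97.14],        # per-read % identity from the .clstr file
    "taxa_map": {
        "Homo sapiens": {"count": 2},      # the summary reads info["count"] per taxon
        "Unannotated read": {"count": 1},  # skipped when tallying taxa
    },
    "annotated": 2,
    "unannotated": 1,
}
```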
| 754 | 810 |
| 755 # Generate outputs based on what was requested | |
| 756 if args.output_similarity_txt: | 811 if args.output_similarity_txt: |
| 757 write_similarity_output(all_simi_data, args.output_similarity_txt) | 812 write_similarity_output( |
| 758 | 813 cluster_data_list, args.output_similarity_txt, log_messages) |
| 759 if args.output_similarity_plot and all_simi_data: | 814 |
| 760 create_similarity_plot(all_simi_data, cluster_simi_lengths, args, args.output_similarity_plot) | 815 if args.output_similarity_plot: |
| 761 | 816 create_similarity_plot( |
| 762 if args.output_evalue_txt: | 817 all_simi_data, |
| 763 write_evalue_output(all_eval_data, args.output_evalue_txt) | 818 cluster_simi_lengths, |
| 764 | 819 args, |
| 765 if args.output_evalue_plot and len(all_eval_data) > 1: | 820 args.output_similarity_plot, |
| 766 create_evalue_plot(all_eval_data, cluster_eval_lengths, args.output_evalue_plot) | 821 log_messages) |
| 767 | 822 |
| 768 if args.output_count: | 823 if args.output_count: |
| 769 write_count_output(all_eval_data, cluster_data_list, args.output_count) | 824 write_count_output(cluster_data_list, args.output_count, log_messages) |
| 770 | 825 |
| 771 if args.output_taxa_clusters: | 826 if args.output_excel: |
| 772 write_taxa_clusters_output(cluster_data_list, args.output_taxa_clusters) | 827 write_raw = bool(args.output_taxa_clusters) |
| 773 | 828 write_processed = bool(args.output_taxa_processed) |
| 774 if args.output_taxa_processed: | 829 |
| 775 write_taxa_processed_output(cluster_data_list, args, args.output_taxa_processed) | 830 if not write_raw and not write_processed: |
| 776 | 831 if log_messages is not None: |
| 777 print(f"Processing complete. Processed {len(clusters)} clusters.") | 832 log_message(log_messages, "output_excel provided but no taxa sheet flags set; no Excel file written.") |
| 833 else: | |
| 834 write_taxa_excel(cluster_data_list, | |
| 835 args, | |
| 836 args.output_excel, | |
| 837 write_raw=write_raw, | |
| 838 write_processed=write_processed, | |
| 839 log_messages=log_messages) | |
| 840 else: | |
| 841 if args.output_taxa_clusters or args.output_taxa_processed: | |
| 842 if log_messages is not None: | |
| 843 log_message( | |
| 844 log_messages, | |
| 845 "WARNING: Raw/processed taxa output flags set but no --output_excel path provided; skipping Excel output." | |
| 846 ) | |
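The two branches above gate the Excel report on the combination of `--output_excel` and the taxa sheet flags. Restated as a small hypothetical helper (not part of the script) to make the four outcomes explicit:

```python
def excel_action(output_excel, raw_flag, processed_flag):
    """Mirror of the branching above; returns what the script would do."""
    if output_excel:
        if raw_flag or processed_flag:
            return "write workbook via write_taxa_excel"
        return "log: no taxa sheet flags set, no Excel file written"
    if raw_flag or processed_flag:
        return "log WARNING: skipping Excel output"
    return "no Excel output requested"
```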
| 847 | |
| 848 summarize_and_log(cluster_data_list, all_simi_data, log_messages, args) | |
| 849 log_message(log_messages, "=== CD-HIT cluster processing finished ===") | |
| 850 | |
| 851 if args.log_file: | |
| 852 os.makedirs(os.path.dirname(args.log_file) or ".", exist_ok=True) |
| 853 with open(args.log_file, "w") as f: | |
| 854 f.write("\n".join(log_messages)) | |
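Assuming `--log_file` is set, the collected messages are flushed once at the end. Based on the `log_message` calls in this hunk, the written file would look roughly like this (all counts fabricated):

```
=== CD-HIT cluster processing started ===
cluster_file: provided
annotation_file: none
=== PARAMETERS ===
min_cluster_support: 2
=== PARAMETERS END ===
=== SUMMARY ===
Clusters parsed: 12
Total reads: 340
Annotated reads: 300
Unannotated reads: 40
...
=== CD-HIT cluster processing finished ===
```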
| 778 | 855 |
| 779 | 856 |
| 780 if __name__ == "__main__": | 857 if __name__ == "__main__": |
| 781 main() | 858 main() |
