comparison blast_annotations_processor.py @ 2:9ca209477dfd draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:43:36 +0000
parents a3989edf0a4a
children
comparison
1:2acf82433aa4 2:9ca209477dfd
21 - Headers in BLAST and FASTA should correspond. 21 - Headers in BLAST and FASTA should correspond.
22 - Uses matplotlib, pandas, and openpyxl for visualization and reporting. 22 - Uses matplotlib, pandas, and openpyxl for visualization and reporting.
23 """ 23 """
24 24
25 import argparse 25 import argparse
26 from collections import defaultdict
27 import json 26 import json
28 import os 27 import os
28 from collections import OrderedDict
29 from collections import defaultdict
30
29 import matplotlib.pyplot as plt 31 import matplotlib.pyplot as plt
32 import numpy as np
30 import pandas as pd 33 import pandas as pd
31 import numpy as np
32 import openpyxl
33 34
34 # Default taxonomic levels 35 # Default taxonomic levels
35 TAXONOMIC_LEVELS = ["K", "P", "C", "O", "F", "G", "S"] 36 TAXONOMIC_LEVELS = ["K", "P", "C", "O", "F", "G", "S"]
36 37
38
37 def parse_arguments(arg_list=None): 39 def parse_arguments(arg_list=None):
38 """Parse command line arguments for cluster processing.""" 40 """Parse command line arguments for cluster processing."""
39 parser = argparse.ArgumentParser( 41 parser = argparse.ArgumentParser(description='Process BLAST annotation results for Galaxy')
40 description='Process BLAST annotation results for Galaxy' 42
41 ) 43 parser.add_argument('--input-anno', required=True, help='Annotated BLAST output file (tabular format)')
42 # Required inputs 44 parser.add_argument('--input-unanno', required=True, help='Unannotated sequences file (FASTA format)')
43 parser.add_argument('--input-anno', required=True, 45
44 help='Annotated BLAST output file (tabular format)') 46 parser.add_argument('--eval-plot', help='Output path for E-value plot (PNG)')
45 parser.add_argument('--input-unanno', required=True, 47 parser.add_argument('--taxa-output', help='Output path for taxa report (tabular)')
46 help='Unannotated sequences file (FASTA format)') 48 parser.add_argument('--circle-data', help='Output path for circular taxonomy data (txt)')
47 # Optional outputs 49 parser.add_argument('--header-anno', help='Output path for header annotations (tabular/xlsx)')
48 parser.add_argument('--eval-plot', 50 parser.add_argument('--log', help='Output path for log file (txt)', required=True)
49 help='Output path for E-value plot (PNG)') 51 parser.add_argument('--filtered-fasta', required=True,
50 parser.add_argument('--taxa-output', 52 help='Filtered fasta file (fasta format) for downstream analysis')
51 help='Output path for taxa report (tabular)') 53
52 parser.add_argument('--circle-data', 54 parser.add_argument('--uncertain-threshold', type=float, default=0.9, required=True,
53 help='Output path for circular taxonomy data (txt)')
54 parser.add_argument('--header-anno',
55 help='Output path for header annotations (tabular/xlsx)')
56 parser.add_argument('--anno-stats',
57 help='Output path for annotation statistics (txt)')
58 # Parameters
59 parser.add_argument('--uncertain-threshold', type=float, default=0.9,
60 help='Threshold for resolving taxonomic conflicts (default: 0.9)') 55 help='Threshold for resolving taxonomic conflicts (default: 0.9)')
61 parser.add_argument('--eval-threshold', type=float, default=1e-10, 56 parser.add_argument('--eval-threshold', default='1e-10', type=float, required=True,
62 help='E-value threshold for filtering results (default: 1e-10)') 57 help='E-value threshold for filtering results (default: 1e-10)')
63 parser.add_argument('--use-counts', action='store_true', default=True, 58 parser.add_argument('--use-counts', action='store_true', default=False, required=False,
64 help='Use read counts in circular data') 59 help='Use read counts in circular data')
60 parser.add_argument('--ignore-rank', default='unknown', required=False,
61 help='Ignore rank when containing this text')
62 parser.add_argument('--ignore-taxonomy', default='environmental', required=False,
63 help="Don't use taxonomy containing this taxonomy")
64 parser.add_argument('--bitscore-perc-cutoff', type=float, default=8, required=True,
65 help='Bitscore percentage cutoff for considered hits')
66 parser.add_argument('--min-bitscore', type=int, default=0, required=True,
67 help='Minimum bitscore threshold for hits')
68 parser.add_argument('-iot', '--ignore-obiclean-type', type=str, default='singleton', required=False,
69 help='Ignore sequences with this obiclean type')
70 parser.add_argument('-iit', '--ignore-illuminapairend-type', type=str, default='pairend', required=False,
71 help='Ignore sequences with this illumina paired end output type')
72 parser.add_argument('--min-identity', type=int, default=0, required=True,
73 help='Minimum sequence identity to consider a hit')
74 parser.add_argument('--min-coverage', type=int, default=0, required=True,
75 help='Minimum sequence coverage to consider a hit')
76 parser.add_argument('--ignore-seqids', type=str, default='', required=False,
77 help='Ignore sequences with this sequence identifier')
78 parser.add_argument('--min-support', type=int, default=0, required=True,
79 help='A taxon is kept only if it (or its descendants) have at least N reads assigned.')
65 80
66 return parser.parse_args(arg_list) 81 return parser.parse_args(arg_list)
67 82
68 83
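As a usage sketch (not part of the changeset): a minimal call to the revised parse_arguments, supplying every option the new parser marks as required. All file names below are placeholders, and the example assumes the module is importable.

args = parse_arguments([
    '--input-anno', 'blast_hits.tabular',   # placeholder annotated BLAST table
    '--input-unanno', 'reads.fasta',        # placeholder dereplicated FASTA
    '--log', 'run.log',
    '--filtered-fasta', 'filtered.fasta',
    '--uncertain-threshold', '0.9',
    '--eval-threshold', '1e-10',
    '--bitscore-perc-cutoff', '8',
    '--min-bitscore', '0',
    '--min-identity', '0',
    '--min-coverage', '0',
    '--min-support', '0',
])
print(args.eval_threshold)  # 1e-10, converted to float by argparse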
84 def log_message(log_messages, msg):
85 """
86 Helper to both print and collect log messages.
87 """
88 if log_messages is not None:
89 log_messages.append(msg)
90
91
69 def list_to_string(x): 92 def list_to_string(x):
70 """ 93 """
71 Convert a list, pandas Series, or numpy array to a comma-separated string.""" 94 Convert a list, pandas Series, or numpy array to a comma-separated string.
95 """
72 if isinstance(x, (list, pd.Series, np.ndarray)): 96 if isinstance(x, (list, pd.Series, np.ndarray)):
73 return ", ".join(map(str, x)) 97 return ", ".join(map(str, x))
74 return str(x) 98 return str(x)
75 99
76 def make_eval_plot(e_val_sets, output_path): 100
77 """ 101 def make_eval_plot(e_val_sets, output_path, log_messages):
78 Create a bar plot of E-values per read. 102 """
79 103 Generate an E-value distribution plot.
80 :param e_val_sets: List of sets containing E-values per read. 104
105 This function aggregates E-values per read, plots them on a logarithmic axis,
106 and produces a visual summary of how the best hits are distributed across reads.
107 If no E-values are available, no plot is created.
108
109 :param e_val_sets: Set of E-values per read.
81 :type e_val_sets: list[set[str]] 110 :type e_val_sets: list[set[str]]
82 :param output_path: Path to save the plot (PNG). 111 :param output_path: Output PNG file.
83 :type output_path: str 112 :type output_path: str
113 :param log_messages: Log collection list.
114 :type log_messages: list[str]
84 :return: None 115 :return: None
85 :rtype: None 116 :rtype: None
86 """ 117 """
87 if not e_val_sets: 118 if not e_val_sets:
88 print("No E-values to plot") 119 log_message(log_messages, "No E-values to plot")
89 return None 120 return None
90 121
91 # Convert string E-values to floats and sort 122 processed_sets = sorted([sorted(float(e_val) for e_val in e_val_set) for e_val_set in e_val_sets])
92 processed_sets = sorted([sorted(float(e_val) for e_val in e_val_set)
93 for e_val_set in e_val_sets])
94 123
95 bar_positions = [] 124 bar_positions = []
96 bar_heights = [] 125 bar_heights = []
97 126
98 for i, e_set in enumerate(processed_sets): 127 for i, e_set in enumerate(processed_sets):
109 plt.bar(bar_positions, bar_heights, width=0.29, color=['blue', 'red']) 138 plt.bar(bar_positions, bar_heights, width=0.29, color=['blue', 'red'])
110 139
111 plt.yscale('log') 140 plt.yscale('log')
112 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) 141 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7)
113 plt.ylabel("e-values") 142 plt.ylabel("e-values")
114 plt.xticks(ticks=range(len(processed_sets)), 143 plt.xticks(ticks=range(len(processed_sets)), labels=[f'{i + 1}' for i in range(len(processed_sets))])
115 labels=[f'{i + 1}' for i in range(len(processed_sets))])
116 144
117 plt.tight_layout() 145 plt.tight_layout()
118 plt.savefig(output_path, format='png') 146 plt.savefig(output_path, format='png')
119 plt.close() 147 plt.close()
120 return None 148 return None
145 - 'percentage_unique_annotated' (float): Percentage of unique annotated sequences. 173 - 'percentage_unique_annotated' (float): Percentage of unique annotated sequences.
146 - 'unique_annotated' (int): Number of unique annotated sequences. 174 - 'unique_annotated' (int): Number of unique annotated sequences.
147 - 'total_unique' (int): Total number of unique sequences. 175 - 'total_unique' (int): Total number of unique sequences.
148 :rtype: dict[str, float | int] 176 :rtype: dict[str, float | int]
149 """ 177 """
150 # Count total sequences in unannotated file
151 total_sequences = 0 178 total_sequences = 0
152 with open(unanno_file_path, 'r') as f: 179 with open(unanno_file_path, 'r') as f:
153 total_sequences = sum(1 for line in f if line.startswith('>')) 180 total_sequences = sum(1 for line in f if line.startswith('>'))
154 181
155 percentage_annotated = (anno_count / total_sequences * 100) if total_sequences > 0 else 0 182 percentage_annotated = (anno_count / total_sequences * 100) if total_sequences > 0 else 0
156 percentage_unique_annotated = (unique_anno_count / total_unique_count * 100) if total_unique_count > 0 else 0 183 percentage_unique_annotated = (unique_anno_count / total_unique_count * 100) if total_unique_count > 0 else 0
157 184
158 return { 185 return {'percentage_annotated': percentage_annotated, 'annotated_sequences': anno_count,
159 'percentage_annotated': percentage_annotated, 186 'total_sequences': total_sequences, 'percentage_unique_annotated': percentage_unique_annotated,
160 'annotated_sequences': anno_count, 187 'unique_annotated': unique_anno_count, 'total_unique': total_unique_count}
161 'total_sequences': total_sequences, 188
162 'percentage_unique_annotated': percentage_unique_annotated, 189
163 'unique_annotated': unique_anno_count, 190 def resolve_tax_majority(taxon_counts, uncertain_threshold):
164 'total_unique': total_unique_count 191 total_count = sum(taxon_counts.values())
165 } 192 if total_count == 0:
166 193 return "Uncertain taxa", "Uncertain taxa"
167 def _choose_taxon(level_counts: dict, threshold: float): 194 most_common_taxon, count = max(taxon_counts.items(), key=lambda x: x[1])
168 """ 195 # Use most common if above threshold
169 Determine the most representative taxonomic level based on counts and a threshold. 196 if count / total_count >= uncertain_threshold:
170 197 return most_common_taxon, most_common_taxon
171 This function evaluates a dictionary of taxonomic level counts and returns the 198 conflicting_taxa = list(taxon_counts.keys())
172 level name that holds a strict majority. If no level reaches the threshold, or if 199 conflicting_levels = [taxon.split(" / ") for taxon in conflicting_taxa]
173 there is a tie for the highest count, the function returns ``None``. 200
174 201 # Resolve uncertainty per level
175 :param level_counts: A dictionary mapping taxonomic level names (e.g., "species", 202 for level_idx in range(min(len(level) for level in conflicting_levels)):
176 "genus") to their respective counts. 203 current_level_names = {level[level_idx] for level in conflicting_levels}
177 :type level_counts: dict[str, int] 204 if len(current_level_names) > 1:
178 :param threshold: The minimum fraction (between 0 and 1) of the total counts that 205 lowest_common_path = conflicting_levels[0][:level_idx + 1]
179 the top taxonomic level must have to be considered dominant. 206 lowest_common_path[-1] = 'Uncertain taxa'
180 :type threshold: float 207 small_output = " / ".join(lowest_common_path)
181 208 min_conflicting_level = min(len(level) for level in conflicting_levels)
182 :return: The name of the taxonomic level that meets or exceeds the threshold and 209 uncertain_path = conflicting_levels[0][:level_idx + 1]
183 is not tied with another level. Returns ``None`` if no level qualifies or 210 uncertain_path[-1] = 'Uncertain taxa'
184 if there is a tie. 211 for _ in range(level_idx + 1, min_conflicting_level):
185 :rtype: str | None 212 uncertain_path.append('Uncertain taxa')
186 213 return small_output, " / ".join(uncertain_path)
187 :example: 214 return conflicting_taxa[0], conflicting_taxa[0]
188 _choose_taxon({"species": 6, "genus": 3, "family": 1}, 0.5)
189 'species'
190 _choose_taxon({"species": 4, "genus": 4}, 0.6)
191 None
192 _choose_taxon({"species": 2, "genus": 3}, 0.6)
193 'genus'
194 """
195 total = sum(level_counts.values())
196 if total == 0:
197 return None
198 items = sorted(level_counts.items(), key=lambda x: x[1], reverse=True)
199 top_name, top_count = items[0]
200 # tie -> no clear majority
201 if sum(1 for _, c in items if c == top_count) > 1:
202 return None
203 if top_count / total >= threshold:
204 return top_name
205 return None
206
207
208 def _resolve_conflicts_recursive(conflicting: list, level_idx: int, threshold: float, max_levels: int, prefix=None):
209 """
210 Recursively resolve taxonomic classification conflicts across multiple levels.
211
212 This function attempts to resolve conflicts between multiple taxonomic paths by
213 recursively evaluating each taxonomic level. At each level, it determines whether
214 there is a clear majority (based on a specified threshold) for a taxon name. If
215 a majority is found, the function continues to the next level using only the
216 taxa that match that name. If no majority exists, the function constructs
217 "uncertain" outputs to indicate ambiguous classification.
218
219 :param conflicting: List of tuples containing full taxonomic paths and their
220 associated counts, e.g. ``[("Bacteria / Proteobacteria / Gammaproteobacteria", 10),
221 ("Bacteria / Proteobacteria / Alphaproteobacteria", 5)]``.
222 :type conflicting: list[tuple[str, int]]
223 :param level_idx: The current taxonomic level index being evaluated (0-based).
224 :type level_idx: int
225 :param threshold: The minimum fraction (between 0 and 1) required for a strict
226 majority decision at a given taxonomic level.
227 :type threshold: float
228 :param max_levels: The maximum number of taxonomic levels to evaluate before
229 stopping recursion.
230 :type max_levels: int
231 :param prefix: A list of already-resolved taxon names up to the previous level.
232 Defaults to ``None``, in which case an empty list is used.
233 :type prefix: list[str] | None
234
235 :return: A tuple containing:
236 - **short_output** (str): A simplified taxonomic path with "Uncertain taxa"
237 appended if no consensus was reached at the current level.
238 - **full_output** (str): The full taxonomic path, filled with additional
239 "Uncertain taxa" entries up to the minimal conflicting level depth.
240 :rtype: tuple[str, str]
241
242 :example:
243 conflicts = [
244 ... ("Bacteria / Proteobacteria / Gammaproteobacteria", 10),
245 ... ("Bacteria / Proteobacteria / Alphaproteobacteria", 5)
246 ... ]
247 _resolve_conflicts_recursive(conflicts, level_idx=0, threshold=0.6, max_levels=5)
248 ('Bacteria / Proteobacteria / Uncertain taxa',
249 'Bacteria / Proteobacteria / Uncertain taxa / Uncertain taxa')
250 """
251 if prefix is None:
252 prefix = []
253
254 if not conflicting:
255 return "", ""
256
257 # split paths (keeps sync)
258 conflicting_levels = [t.split(" / ") for t, _ in conflicting]
259 min_conflicting_level = min(len(p) for p in conflicting_levels)
260
261 # If only one full path remains -> return it (both short & full)
262 if len(conflicting) == 1:
263 return conflicting[0][0], conflicting[0][0]
264
265 # If we've reached deepest comparable level or max_levels -> pick most supported full path
266 if level_idx >= min_conflicting_level or level_idx >= max_levels:
267 full_counts = defaultdict(int)
268 for t, c in conflicting:
269 full_counts[t] += c
270 chosen_full = max(full_counts.items(), key=lambda x: x[1])[0]
271 return chosen_full, chosen_full
272
273 # Count names at this level
274 level_counts = defaultdict(int)
275 for taxon, count in conflicting:
276 parts = taxon.split(" / ")
277 if level_idx < len(parts):
278 level_counts[parts[level_idx]] += count
279
280 # If everyone agrees at this level, append that name and go deeper
281 if len(level_counts) == 1:
282 name = next(iter(level_counts))
283 return _resolve_conflicts_recursive(conflicting, level_idx + 1, threshold, max_levels, prefix + [name])
284
285 # Check for a strict majority at this level
286 chosen_level_name = _choose_taxon(level_counts, threshold)
287 if chosen_level_name is not None:
288 # Keep only taxa that match the chosen name at this level and go deeper
289 filtered = [(t, c) for (t, c) in conflicting
290 if level_idx < len(t.split(" / ")) and t.split(" / ")[level_idx] == chosen_level_name]
291 return _resolve_conflicts_recursive(filtered, level_idx + 1, threshold, max_levels, prefix + [chosen_level_name])
292
293 # No majority at this level -> construct uncertain outputs.
294 # short: prefix + 'Uncertain taxa' (stop here)
295 short_path = prefix + ['Uncertain taxa']
296 short_output = " / ".join(short_path)
297
298 # full: prefix + 'Uncertain taxa' + fill until min_conflicting_level
299 full_path = short_path + ['Uncertain taxa'] * (min_conflicting_level - (level_idx + 1))
300 full_output = " / ".join(full_path)
301
302 return short_output, full_output
303
304
305 def resolve_taxon_majority(taxon_counts: dict, threshold: float = 0.90, max_levels: int = 7):
306 """
307 Resolve the majority consensus taxonomic classification from a set of annotated paths.
308
309 :param taxon_counts: A dictionary mapping full taxonomic paths to their respective
310 occurrence counts. For example:
311 ``{"Bacteria / Proteobacteria / Gammaproteobacteria": 10,
312 "Bacteria / Proteobacteria / Alphaproteobacteria": 5}``.
313 :type taxon_counts: dict[str, int]
314 :param threshold: Minimum fraction (between 0 and 1) required for a strict majority
315 at each taxonomic level. Defaults to ``0.90``.
316 :type threshold: float, optional
317 :param max_levels: Maximum number of taxonomic levels to evaluate before stopping
318 recursion. Defaults to ``7``.
319 :type max_levels: int, optional
320
321 :return: A tuple containing:
322 - **short_output** (str): The consensus taxonomic path, possibly ending with
323 "Uncertain taxa" if ambiguity remains.
324 - **full_output** (str): The expanded taxonomic path including placeholder
325 "Uncertain taxa" labels up to the minimal conflicting depth.
326 :rtype: tuple[str, str]
327 """
328 conflicting = list(taxon_counts.items()) # list of (path, count) keeps taxa<->count synced
329 return _resolve_conflicts_recursive(conflicting, level_idx=0, threshold=threshold, max_levels=max_levels)
330 215
331 216
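A small worked example against the new resolve_tax_majority (taxon paths and counts are invented): the dominant path below holds only 60% support, so the first rank where the paths disagree collapses to 'Uncertain taxa'.

taxon_counts = {
    "Animalia / Chordata / Actinopterygii": 6,
    "Animalia / Chordata / Amphibia": 4,
}
short, full = resolve_tax_majority(taxon_counts, 0.9)
# short == full == "Animalia / Chordata / Uncertain taxa"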
332 def process_taxa_output(taxa_dicts, output_path, uncertain_threshold): 217 def process_taxa_output(taxa_dicts, output_path, uncertain_threshold):
333 """ 218 """
334 Generate a tabular taxa report from a list of read taxonomies. 219 Generate a hierarchical taxa summary file.
335 220
336 This function processes taxonomic assignments for each read in a dataset 221 This function resolves best taxonomic assignments per read and aggregates
337 and produces a summary report similar in style to Kraken2 output. The 222 counts into taxonomic levels, producing a readable text report listing
338 report includes: 223 percentages, counts and resolved taxonomy.
339 224
340 - The number of reads that were marked as "Uncertain taxa" at each 225 :param taxa_dicts: Best taxa dictionaries per read.
341 taxonomic rank. 226 :type taxa_dicts: list[dict]
342 - For all other taxa, the number of reads assigned and their percentage 227 :param output_path: Path of output report.
343 relative to the sample total.
344 - The taxonomic rank for each assignment, indicated in the report with
345 indentation proportional to the rank depth.
346
347 :param taxa_dicts: List of taxon count dictionaries, one per read, where
348 keys are taxon names and values are counts of how often
349 they were assigned to the read.
350 :type taxa_dicts: list[dict[str, int]]
351 :param output_path: Path to save the tabular taxa report.
352 :type output_path: str 228 :type output_path: str
353 :param uncertain_threshold: Threshold for resolving conflicting taxonomic 229 :param uncertain_threshold: Confidence value for majority voting.
354 assignments (values below the threshold are
355 marked as "Uncertain taxa").
356 :type uncertain_threshold: float 230 :type uncertain_threshold: float
357 :return: None. The function writes the taxa report to ``output_path``. 231 :return: None
358 :rtype: None 232 :rtype: None
359 """ 233 """
360 uncertain_dict = {level: 0 for level in TAXONOMIC_LEVELS} 234 uncertain_dict = {level: 0 for level in TAXONOMIC_LEVELS}
361 aggregated_counts = defaultdict(int) 235 aggregated_counts = defaultdict(int)
362 236
363 for read_taxa in taxa_dicts: 237 for read_taxa in taxa_dicts:
364 resolved_taxon, _ = resolve_taxon_majority(read_taxa, uncertain_threshold) 238 if not read_taxa:
239 continue
240 resolved_taxon, _ = resolve_tax_majority(read_taxa, uncertain_threshold)
365 241
366 # Add counts for resolved taxon 242 # Add counts for resolved taxon
367 levels = resolved_taxon.split(" / ") 243 levels = resolved_taxon.split(" / ")
368 for i in range(1, len(levels) + 1): 244 for i in range(1, len(levels) + 1):
369 aggregated_counts[" / ".join(levels[:i])] += 1 245 aggregated_counts[" / ".join(levels[:i])] += 1
370 246
371 # Calculate total count 247 total_count = sum(value for key, value in aggregated_counts.items() if len(key.split(" / ")) == 1)
372 total_count = sum(value for key, value in aggregated_counts.items()
373 if len(key.split(" / ")) == 1)
374 248
375 report_lines = [] 249 report_lines = []
376 250
377 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]): 251 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]):
378 levels = taxonomy.split(" / ") 252 levels = taxonomy.split(" / ")
379 indent = " " * (len(levels) - 1) * 2 253 indent = " " * (len(levels) - 1) * 2
380 taxon_name = levels[-1] 254 taxon_name = levels[-1]
381 taxon_level_code = TAXONOMIC_LEVELS[len(levels) - 1] if len(levels) <= len(TAXONOMIC_LEVELS) else "U" 255 taxon_level_code = TAXONOMIC_LEVELS[len(levels) - 1] if len(levels) <= len(TAXONOMIC_LEVELS) else "U"
382 percentage = (count / total_count) * 100 if total_count > 0 else 0 256 percentage = (count / total_count) * 100 if total_count > 0 else 0
383 257
384 report_lines.append( 258 report_lines.append(f"{percentage:.2f}\t{count}\t{total_count}\t{taxon_level_code}\t{indent}{taxon_name}")
385 f"{percentage:.2f}\t{count}\t{total_count}\t{taxon_level_code}\t{indent}{taxon_name}"
386 )
387 259
388 if taxon_name == 'Uncertain taxa': 260 if taxon_name == 'Uncertain taxa':
389 uncertain_dict[taxon_level_code] += count 261 uncertain_dict[taxon_level_code] += count
390 262
391 # Write output
392 with open(output_path, 'w') as f: 263 with open(output_path, 'w') as f:
391 392 f.write("Uncertain count per taxonomie level" + str(uncertain_dict) + '\n') 264 f.write("Uncertain count per taxonomic level: " + str(uncertain_dict) + '\n')
394 f.write('percentage_rooted\tnumber_rooted\ttotal_num\ttaxon_level\tindentificatie\n') 265 f.write('percentage_rooted\tnumber_rooted\ttotal_num\ttaxon_level\tidentification\n')
395 f.write("\n".join(report_lines)) 266 f.write("\n".join(report_lines))
396 267
397 268
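For orientation, the body of the taxa report then consists of tab-separated rows such as the following (all numbers invented), with the taxon name indented two extra spaces per rank:

100.00  10  10  K  Animalia
100.00  10  10  P    Chordata
60.00   6   10  C      Actinopterygii
40.00   4   10  C      Uncertain taxa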
398 def process_header_annotations(taxa_dicts, headers, e_values, output_path, uncertain_threshold, 269 def process_header_annotations(taxa_dicts, headers, output_path, uncertain_threshold, source_list, seq_id_list,
399 identity_list, coverage_list, source_list, bitscore_list): 270 log_messages):
400 """ 271 """
401 Generate an Excel report with per-read and aggregated taxonomic annotations. 272 Create an Excel report summarizing individual and merged read annotations.
402 273
403 This function processes taxonomic assignments per read, along with associated 274 This function converts best taxonomy assignments into a per-read table,
404 metrics (e-value, identity, coverage, bitscore, source), and generates an Excel 275 generates an aggregated table grouped by taxon, and saves both tables to
405 file with two sheets: 276 separate sheets in a single Excel file.
406 277
407 1. **Individual_Reads** – One row per sequence, including all metrics and 278 :param taxa_dicts: Taxonomy assignments per read.
408 the resolved taxonomic path. 279 :type taxa_dicts: list[dict]
409 2. **Merged_by_Taxa** – Aggregated results per unique taxonomic path, including 280 :param headers: Original FASTA headers.
410 merged metrics and counts.
411
412 Internally, the function creates a temporary TSV file, processes it into
413 a pandas DataFrame, expands the taxonomic path into separate columns
414 (e.g., kingdom, phylum, class, …), and then writes the results into a
415 multi-sheet Excel file. Column widths are automatically adjusted for readability.
416
417 :param taxa_dicts: List of taxon count dictionaries, one per read, where keys
418 are taxa and values are their counts in the read.
419 :type taxa_dicts: list[dict[str, int]]
420 :param headers: Sequence headers corresponding to each read. Used to extract
421 read identifiers and counts.
422 :type headers: list[str] 281 :type headers: list[str]
423 :param e_values: List of e-values per read. Each element is a list of values 282 :param output_path: Output Excel file name.
424 associated with that read.
425 :type e_values: list[list[float]]
426 :param output_path: Path to the Excel file to be created.
427 :type output_path: str 283 :type output_path: str
428 :param uncertain_threshold: Threshold for resolving conflicting taxonomic 284 :param uncertain_threshold: Confidence level for taxonomy conflict resolution.
429 assignments. Values below the threshold are
430 replaced with "Uncertain taxa".
431 :type uncertain_threshold: float 285 :type uncertain_threshold: float
432 :param identity_list: Percent identity values per read. 286 :param source_list: Source identifiers per read.
433 :type identity_list: list[list[str]]
434 :param coverage_list: Coverage percentage values per read.
435 :type coverage_list: list[list[str]]
436 :param source_list: Source identifiers per read (e.g., database names).
437 :type source_list: list[list[str]] 287 :type source_list: list[list[str]]
438 :param bitscore_list: Bitscore values per read. 288 :param seq_id_list: Sequence identifiers per read.
439 :type bitscore_list: list[list[str]] 289 :type seq_id_list: list[list[str]]
440 :return: None. The function writes results to an Excel file at ``output_path``. 290 :param log_messages: Log collection list.
291 :type log_messages: list[str]
292 :return: None
441 :rtype: None 293 :rtype: None
442 :raises PermissionError: If the temporary TSV or output Excel file cannot be written
443 (e.g., file is open in another program).
444 :raises IndexError: If the input lists (headers, e_values, etc.) are not aligned
445 with ``taxa_dicts``.
446 :raises ValueError: If sequence headers are not formatted as expected for extracting counts.
447 """ 294 """
448 report_lines = [] 295 report_lines = []
449 for i, read_taxa in enumerate(taxa_dicts): 296 for i, read_taxa in enumerate(taxa_dicts):
450 _, resolved_taxon_long = resolve_taxon_majority(read_taxa, uncertain_threshold) 297 if not read_taxa:
451 try:
452 e_val = e_values[i][0] if e_values[i] else "N/A"
453 identity = identity_list[i][0] if identity_list[i] else "N/A"
454 coverage = coverage_list[i][0] if coverage_list[i] else "N/A"
455 source = source_list[i][0] if source_list[i] else "N/A"
456 bitscore = bitscore_list[i][0] if bitscore_list[i] else "N/A"
457 except (IndexError, TypeError) as e:
458 print(f"Mismatch while extracting values: {e}. Check that your annotated and unannotated input files correspond correctly.")
459 continue 298 continue
460 299 _, resolved_taxon_long = resolve_tax_majority(read_taxa, uncertain_threshold)
300 source = source_list[i][0] if source_list[i] else "N/A"
301 seq_id = seq_id_list[i][0] if seq_id_list[i] else "N/A"
461 header = headers[i] if i < len(headers) else f"Header missing" 302 header = headers[i] if i < len(headers) else f"Header missing"
462 303
463 # Extract count 304 # Extract count
464 try: 305 try:
465 header_base, count_str = header.rsplit("(", 1) 306 header_base, count_str = header.rsplit("(", 1)
466 count = int(count_str.rstrip(")")) 307 count = int(count_str.rstrip(")"))
467 except ValueError as e: 308 except ValueError as e:
468 print(f'Failed extracting count: {e}') 309 log_message(log_messages, f'Failed extracting count: {e}')
469 header_base = header 310 header_base = header
470 count = 1 311 count = 1
471 312
472 report_lines.append( 313 report_lines.append(f'{header_base}\t{seq_id}\t{source}\t{count}\t{resolved_taxon_long}')
473 f'{header_base}\t{e_val}\t{identity}\t{coverage}\t{bitscore}\t{count}\t{source}\t{resolved_taxon_long}') 314 # temp path is needed to write to a xlsx from galaxy
474 # Create temporary TSV for processing
475 temp_tsv_path = 'temp.tsv' 315 temp_tsv_path = 'temp.tsv'
476 try: 316 try:
477 with open(temp_tsv_path, 'w') as f: 317 with open(temp_tsv_path, 'w') as f:
478 f.write('header\te_value\tidentity percentage\tcoverage\tbitscore\tcount\tsource\ttaxa\n') 318 f.write('header\tseq_id\tsource\tcount\ttaxa\n')
479 f.write("\n".join(report_lines)) 319 f.write("\n".join(report_lines))
480 except PermissionError as e: 320 except PermissionError as e:
481 print(f"Unable to write to file, error: {e} file might be opened") 321 log_message(log_messages, f"Unable to write to file, error: {e} file might be opened")
482 322
483 # Process the data
484 df = pd.read_csv(temp_tsv_path, sep='\t', encoding="latin1") 323 df = pd.read_csv(temp_tsv_path, sep='\t', encoding="latin1")
485 if not df.empty: 324 if not df.empty:
486 # Split taxa into separate columns
487 taxa_split = df["taxa"].str.split(" / ", expand=True) 325 taxa_split = df["taxa"].str.split(" / ", expand=True)
488 max_levels = taxa_split.shape[1] if taxa_split.shape[1] is not None else 7 326 max_levels = taxa_split.shape[1] if taxa_split.shape[1] is not None else 7
489 level_names = ["kingdom", "phylum", "class", "order", "family", "genus", "species"][:max_levels] 327 level_names = ["kingdom", "phylum", "class", "order", "family", "genus", "species"][:max_levels]
490 taxa_split.columns = level_names 328 taxa_split.columns = level_names
491 329
492 # Add taxa columns to main dataframe
493 df_individual = pd.concat([df, taxa_split], axis=1) 330 df_individual = pd.concat([df, taxa_split], axis=1)
494 df_individual = df_individual.sort_values(['species', 'genus', 'family'], ascending=[True, True, True]) 331 df_individual = df_individual.sort_values(['species', 'genus', 'family'], ascending=[True, True, True])
495 # Create merged dataframe - first add taxa columns, then aggregate
496 df_for_merge = df_individual.copy() 332 df_for_merge = df_individual.copy()
497 333
498 group_cols = ["taxa"] 334 group_cols = ["taxa"]
499 agg_dict = { 335 agg_dict = {"seq_id": lambda x: list_to_string(list(x.unique())), "count": "sum",
500 "e_value": lambda x: f"{x.min()}–{x.max()}", 336 "source": lambda x: list_to_string(list(x.unique())), }
501 "identity percentage": lambda x: list_to_string(list(x.unique())), 337
502 "coverage": lambda x: list_to_string(list(x.unique())),
503 "bitscore": lambda x: list_to_string(list(x.unique())),
504 "count": "sum",
505 "source": "first"
506 }
507
508 # Add aggregation for taxa levels that actually exist
509 for level in level_names: 338 for level in level_names:
510 if level in df_for_merge.columns: 339 if level in df_for_merge.columns:
511 agg_dict[level] = "first" 340 agg_dict[level] = "first"
512 341
513 df_merged = df_for_merge.groupby(group_cols, as_index=False).agg(agg_dict) 342 df_merged = df_for_merge.groupby(group_cols, as_index=False).agg(agg_dict)
514 343
515 sort_columns = [] 344 sort_columns = []
516 sort_ascending = [] 345 sort_ascending = []
517 346
518 # Add family, genus, species if they exist
519 for level in ['species', 'genus', 'family']: 347 for level in ['species', 'genus', 'family']:
520 if level in df_merged.columns: 348 if level in df_merged.columns:
521 sort_columns.append(level) 349 sort_columns.append(level)
522 sort_ascending.append(True) 350 sort_ascending.append(True)
523 351
524 df_merged = df_merged.sort_values(sort_columns, ascending=sort_ascending) 352 df_merged = df_merged.sort_values(sort_columns, ascending=sort_ascending)
525 353
526 # Write both datasets to single Excel file with multiple sheets
527 temp_path = output_path + ".xlsx" 354 temp_path = output_path + ".xlsx"
528 os.makedirs(os.path.dirname(temp_path), exist_ok=True) 355 os.makedirs(os.path.dirname(temp_path), exist_ok=True)
529 with pd.ExcelWriter(temp_path, engine='openpyxl', mode='w') as writer: 356 with pd.ExcelWriter(temp_path, engine='openpyxl', mode='w') as writer:
530 df_individual.to_excel(writer, sheet_name='Individual_Reads', index=False) 357 df_individual.to_excel(writer, sheet_name='Individual_Reads', index=False)
531 df_merged.to_excel(writer, sheet_name='Merged_by_Taxa', index=False) 358 df_merged.to_excel(writer, sheet_name='Merged_by_Taxa', index=False)
532 359
533 # Auto-adjust column widths for both sheets
534 for sheet_name in writer.sheets: 360 for sheet_name in writer.sheets:
535 worksheet = writer.sheets[sheet_name] 361 worksheet = writer.sheets[sheet_name]
536 for column in worksheet.columns: 362 for column in worksheet.columns:
537 max_length = 0 363 max_length = 0
538 column_letter = column[0].column_letter 364 column_letter = column[0].column_letter
539 for cell in column: 365 for cell in column:
540 try: 366 try:
541 if len(str(cell.value)) > max_length: 367 if len(str(cell.value)) > max_length:
542 max_length = len(str(cell.value)) 368 max_length = len(str(cell.value))
543 except: 369 except AttributeError as e:
370 log_message(log_messages, f"Error {e}, {cell} has no value, something probably went wrong")
544 pass 371 pass
545 adjusted_width = min(max_length + 2, 50) # Cap at 50 characters 372 adjusted_width = min(max_length + 2, 50)
546 worksheet.column_dimensions[column_letter].width = adjusted_width 373 worksheet.column_dimensions[column_letter].width = adjusted_width
547 os.replace(temp_path, output_path) 374 os.replace(temp_path, output_path)
548 # Clean up temporary file 375 # Clean up temporary file
549 os.remove(temp_tsv_path) 376 os.remove(temp_tsv_path)
550 else: 377 else:
551 print("Dataframe empty, no annotation results") 378 log_message(log_messages, "Dataframe empty, no annotation results")
552 379
553 380
554 def create_circle_diagram(taxa_dicts, output_path, use_counts, uncertain_threshold): 381 def create_circle_data(taxa_dicts, output_path, use_counts, uncertain_threshold):
555 """ 382 """
556 Generate hierarchical JSON data for a circular taxonomy diagram. 383 Generate circular taxonomy layer data.
557 384
558 This function aggregates taxonomic classification results from multiple reads 385 The function converts resolved taxonomy per read into hierarchical
559 and prepares hierarchical count data suitable for circular visualization. 386 cumulative counts that can be visualized as concentric circles.
560 387
561 :param taxa_dicts: A list of dictionaries, where each dictionary represents a 388 :param taxa_dicts: Per-read taxonomy assignments.
562 mapping from full taxonomic paths to their corresponding counts for a single 389 :type taxa_dicts: list[dict]
563 read. Each key should be a taxonomic path in the format 390 :param output_path: JSON destination file.
564 :type taxa_dicts: list[dict[str, int]]
565 :param output_path: File path to write the generated JSON output containing
566 circle diagram data.
567 :type output_path: str 391 :type output_path: str
568 :param use_counts: Whether to aggregate counts by total read frequency 392 :param use_counts: Whether to count total reads or unique taxa.
569 (``True``) or to count only unique taxa once (``False``). 393 :type use_counts: bool
570 :type use_counts: boolean 394 :param uncertain_threshold: Majority resolution threshold.
571 :param uncertain_threshold: Fraction (between 0 and 1) representing the minimum
572 majority threshold used for resolving conflicting taxonomic assignments.
573 :type uncertain_threshold: float 395 :type uncertain_threshold: float
574 396 :return: None
575 :return: None. The resulting JSON file will be written to ``output_path``. The
576 file contains a list of dictionaries, one per taxonomic level, with:
577 - **labels** (*list[str]*): Names of taxa at that level.
578 - **sizes** (*list[int]*): Corresponding aggregated counts.
579 :rtype: None 397 :rtype: None
580
581 :note:
582 - Ambiguous classifications are labeled as "Uncertain taxa".
583 - The number of hierarchical levels is determined by the length of
584 ``TAXONOMIC_LEVELS``.
585 """ 398 """
586 aggregated_counts = defaultdict(int) 399 aggregated_counts = defaultdict(int)
587 seen_taxa = set() 400 seen_taxa = set()
588 401
589 for read_taxa in taxa_dicts: 402 for read_taxa in taxa_dicts:
590 _, resolved_taxon_long = resolve_taxon_majority(read_taxa, uncertain_threshold) 403 if not read_taxa:
404 continue
405 _, resolved_taxon_long = resolve_tax_majority(read_taxa, uncertain_threshold)
591 406
592 levels = resolved_taxon_long.split(" / ") 407 levels = resolved_taxon_long.split(" / ")
593 408
594 if use_counts: 409 if use_counts:
595 for i in range(1, len(levels) + 1): 410 for i in range(1, len(levels) + 1):
598 if resolved_taxon_long not in seen_taxa: 413 if resolved_taxon_long not in seen_taxa:
599 for i in range(1, len(levels) + 1): 414 for i in range(1, len(levels) + 1):
600 aggregated_counts[" / ".join(levels[:i])] += 1 415 aggregated_counts[" / ".join(levels[:i])] += 1
601 seen_taxa.add(resolved_taxon_long) 416 seen_taxa.add(resolved_taxon_long)
602 417
603 # Prepare circle data
604 circle_data = [] 418 circle_data = []
605 for level in range(len(TAXONOMIC_LEVELS)): 419 for level in range(len(TAXONOMIC_LEVELS)):
606 circle_data.append({"labels": [], "sizes": []}) 420 circle_data.append({"labels": [], "sizes": []})
607 421
608 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]): 422 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]):
612 circle_data[len(levels) - 1]["sizes"].append(count) 426 circle_data[len(levels) - 1]["sizes"].append(count)
613 with open(output_path, "w") as f: 427 with open(output_path, "w") as f:
614 json.dump(circle_data, f, indent=2) 428 json.dump(circle_data, f, indent=2)
615 429
616 430
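A short sketch of how the resulting file could be inspected. The path 'circle_data.json' is a placeholder, and the sketch assumes TAXONOMIC_LEVELS is in scope; each entry in the JSON list holds the labels and sizes for one taxonomic depth.

import json

with open("circle_data.json") as f:          # hypothetical output path
    circle_data = json.load(f)

for rank, ring in zip(TAXONOMIC_LEVELS, circle_data):
    # ring is {"labels": [...], "sizes": [...]} for that depth
    print(rank, len(ring["labels"]), sum(ring["sizes"]))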
617 def process_single_file(anno_file_path, unanno_file_path, args): 431 def read_unannotated_fasta(unanno_file_path, ignore_illuminapairend_type, ignore_obiclean_type, min_support,
618 """ 432 filtered_fasta_path, log_messages):
619 Process a single annotated BLAST file and generate all requested analytical outputs. 433 """
620 434 Read, filter and optionally rewrite an unannotated FASTA file.
621 This function integrates multiple steps for analyzing annotated BLAST output 435
622 files and corresponding unannotated FASTA sequences. It organizes BLAST hits 436 This function extracts dereplicated sequence counts, applies header filters
623 per query, extracts the best-scoring matches, and produces various types of 437 and a minimum support threshold, collects statistics and, if requested,
624 output files (plots, tables, diagrams, and statistics) according to user 438 writes a filtered FASTA file for downstream processing.
625 arguments. 439
626 440 :param unanno_file_path: Path to FASTA file.
627 :param anno_file_path: Path to the annotated BLAST results file. 441 :type unanno_file_path: str
442 :param ignore_illuminapairend_type: Illumina pair-end status to remove.
443 :type ignore_illuminapairend_type: str
444 :param ignore_obiclean_type: Obiclean label to remove.
445 :type ignore_obiclean_type: str
446 :param min_support: Minimum dereplicated read count retained.
447 :type min_support: int
448 :param filtered_fasta_path: Optional path for filtered FASTA output.
449 :type filtered_fasta_path: str | None
450 :param log_messages: Log collection list.
451 :type log_messages: list[str]
452 :return: (headers, total_unique_count, invalid_count)
453 :rtype: tuple[list[str], int, int]
454 """
455 headers = []
456 total_unique_count = 0
457 invalid_count = 0
458
459 total_headers = 0
460 low_support_count = 0
461 header_filter_count = 0
462 written_fasta_count = 0
463
464 if not os.path.exists(unanno_file_path):
465 log_message(log_messages, f"Warning: Unannotated file {unanno_file_path} not found")
466 return headers, total_unique_count, invalid_count
467
468 write_fasta = filtered_fasta_path is not None
469 filtered_fasta = None
470 if write_fasta:
471 try:
472 filtered_fasta = open(filtered_fasta_path, "w")
473 except OSError as e:
474 log_message(log_messages, f"Error: Cannot open {filtered_fasta_path} for writing: {e}")
475 write_fasta = False
476
477 write_flag = False
478
479 with open(unanno_file_path, 'r') as f:
480 for line in f:
481 if line.startswith('>'):
482 total_headers += 1
483 header_line = line.rstrip("\n")
484 header = header_line.split()[0].strip('>')
485 if "count=" in header_line:
486 count = int(header_line.split("count=")[1].split(";")[0])
487 else:
488 count = 0
489
490 passes_header_filter = check_header_string(header_line, ignore_illuminapairend_type,
491 ignore_obiclean_type)
492
493 if passes_header_filter and int(count) >= int(min_support):
494 headers.append(header)
495 total_unique_count += count
496
497 write_flag = True
498 if write_fasta:
499 filtered_fasta.write(line)
500 written_fasta_count += 1
501 else:
502 # Invalid header (either header filter or low support)
503 if not passes_header_filter:
504 header_filter_count += 1
505 elif int(count) < int(min_support):
506 low_support_count += 1
507
508 invalid_count += 1
509 write_flag = False
510 else:
511 if write_flag and write_fasta:
512 filtered_fasta.write(line)
513
514 if write_fasta and filtered_fasta is not None:
515 filtered_fasta.close()
516 log_message(log_messages, f"Filtered FASTA written succesfully"
517 f"({written_fasta_count} sequences)")
518
519 log_message(log_messages, f"FASTA: total headers: {total_headers}")
520 log_message(log_messages, f"FASTA: headers kept after filters and min_support={min_support}: {len(headers)}")
521 log_message(log_messages, f"FASTA: removed due to header filters (illumina/obiclean/etc.): {header_filter_count}")
522 log_message(log_messages, f"FASTA: removed due to low dereplicated count (<{min_support}): {low_support_count}")
523 log_message(log_messages, f"FASTA: total invalid (header filter + low support): {invalid_count}")
524
525 return headers, total_unique_count, invalid_count
526
527
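To illustrate what the header parsing above expects (the attribute layout is an assumption modeled on obitools-style dereplicated FASTA headers, not taken from the changeset):

header_line = ">read_0001 count=12; obiclean_head=True;"   # invented example header
header = header_line.split()[0].strip('>')                 # -> 'read_0001'
count = int(header_line.split("count=")[1].split(";")[0])  # -> 12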
528 def parse_blast_output(anno_file_path, eval_threshold, ignore_taxonomy, ignore_rank, min_coverage, min_identity,
529 min_bitscore, ignore_seqids, log_messages):
530 """
531 Parse a BLAST tabular file and retain high-quality hits.
532
533 This function reads BLAST annotations, applies E-value and quality filters,
534 excludes invalid taxa, groups remaining hits by query identifier, and logs
535 summary statistics.
536
537 :param anno_file_path: Input BLAST result file.
628 :type anno_file_path: str 538 :type anno_file_path: str
629 :param unanno_file_path: Path to a FASTA file containing unannotated sequences. 539 :param eval_threshold: Maximum allowed E-value.
630 Used to determine sequence order and total unique sequence counts. 540 :type eval_threshold: float
541 :param ignore_taxonomy: Taxonomy substring filter.
542 :type ignore_taxonomy: str
543 :param ignore_rank: Rank substring filter.
544 :type ignore_rank: str
545 :param min_coverage: Minimum coverage threshold.
546 :type min_coverage: int
547 :param min_identity: Minimum identity threshold.
548 :type min_identity: int
549 :param min_bitscore: Minimum bitscore threshold.
550 :type min_bitscore: int
551 :param ignore_seqids: Comma-separated seqids to ignore.
552 :type ignore_seqids: str
553 :param log_messages: Log collection list.
554 :type log_messages: list[str]
555 :return: Mapping of q_id → hits
556 :rtype: OrderedDict[str, list[dict]]
557 """
558 if not os.path.exists(anno_file_path):
559 log_message(log_messages, f"Error: Input file {anno_file_path} not found")
560 return OrderedDict()
561
562 blast_groups = OrderedDict()
563
564 total_hits = 0
565 kept_hits = 0
566 filtered_hits = 0
567 invalid_taxon_count = 0
568 ignored_seqid_count = 0
569
570 try:
571 with open(anno_file_path, 'r') as f:
572 for line in f:
573 if line.startswith("#"):
574 continue
575
576 parts = line.strip().split('\t')
577 if len(parts) < 7:
578 log_message(log_messages, f"less than 7 parts in line, skipped faulty line: {line}")
579 continue
580
581 q_id = parts[0]
582 total_hits += 1
583
584 try:
585 e_val = float(parts[6])
586 except ValueError as e:
587 log_message(log_messages, f"Error {e}, while extracting evalue")
588 filtered_hits += 1
589 continue
590
591 taxon = parts[-1].strip()
592 if is_invalid_taxon(taxon, ignore_taxonomy, ignore_rank):
593 invalid_taxon_count += 1
594 filtered_hits += 1
595 continue
596
597 seq_id = parts[2] if len(parts) > 2 else ''
598 ignore_seqid_set = set(parse_list_param(ignore_seqids))
599 if seq_id in ignore_seqid_set:
600 ignored_seqid_count += 1
601 filtered_hits += 1
602 continue
603
604 identity = parts[4] if len(parts) > 4 else ''
605 cov = parts[5] if len(parts) > 5 else ''
606 bitscore = float(parts[7]) if len(parts) > 7 and parts[7] else 0.0
607 source = parts[8] if len(parts) > 8 else ''
608
609 hit = {'e_val': e_val, 'identity': identity, 'cov': cov, 'bitscore': bitscore, 'source': source,
610 'taxon': taxon, 'seq_id': seq_id, }
611
612 if not is_insufficient_hit(hit, min_coverage, min_identity, min_bitscore, eval_threshold):
613 blast_groups.setdefault(q_id, []).append(hit)
614 kept_hits += 1
615 else:
616 filtered_hits += 1
617 except Exception as e:
618 log_message(log_messages, f"Error reading BLAST file {anno_file_path}: {e}")
619 return OrderedDict()
620
621 log_message(log_messages, f"BLAST: total hits read: {total_hits}")
622 log_message(log_messages, f"BLAST: hits kept after quality filters: {kept_hits}")
623 log_message(log_messages, f"BLAST: hits filtered (evalue/coverage/identity/bitscore): {filtered_hits}")
624 log_message(log_messages, f"BLAST: hits removed due to invalid taxon: {invalid_taxon_count}")
625 log_message(log_messages, f"BLAST: hits removed due to ignored seqids: {ignored_seqid_count}")
626
627 return blast_groups
628
629
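A schematic of the column positions parse_blast_output reads from each tabular line; the field values are invented, and columns 1 and 3 are not used by the parser.

line = "read_0001\tNA\tAB123456\tNA\t98.5\t100\t1e-30\t250\tNCBI\tAnimalia / Chordata / Actinopterygii"
parts = line.strip().split('\t')
q_id, seq_id = parts[0], parts[2]              # query id and subject sequence id
identity, cov = parts[4], parts[5]             # percent identity and coverage
e_val, bitscore = float(parts[6]), float(parts[7])
source, taxon = parts[8], parts[-1]            # database source and taxonomy path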
630 def process_single_query(header, hits, bitscore_perc_cutoff):
631 """
632 Process all BLAST hits for a single query/header and return a summary dict.
633 """
634 current_read_info = {'taxa_dict': {}, 'header': header, 'e_values': set(), 'identities': set(), 'coverages': set(),
635 'sources': set(), 'bitscores': set(), 'seq_id': set(), 'best_taxa_dict': {}}
636
637 max_bitscore = max(h['bitscore'] for h in hits)
638 if bitscore_perc_cutoff > 0:
639 top_bitscore = float(max_bitscore) * (1 - (float(bitscore_perc_cutoff) / 100.0))
640 else:
641 top_bitscore = float(max_bitscore)
642 current_read_info['best_taxa_dict'] = defaultdict(int)
643
644 for hit in hits:
645 e_val = hit['e_val']
646 identity = hit['identity']
647 cov = hit['cov']
648 bitscore = hit['bitscore']
649 source = hit['source']
650 taxon = hit['taxon']
651 seq_id = hit['seq_id']
652
653 # Keep track of hits that pass bitscore threshold
654 if bitscore >= top_bitscore:
655 current_read_info['best_taxa_dict'][taxon] += 1
656 current_read_info['e_values'].add(str(e_val))
657 current_read_info['identities'].add(identity)
658 current_read_info['coverages'].add(cov)
659 current_read_info['sources'].add(source)
660 current_read_info['bitscores'].add(bitscore)
661 current_read_info['seq_id'].add(seq_id)
662
663 return current_read_info
664
665
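A worked example of the bitscore window applied in process_single_query (values invented): with the default cutoff of 8, only hits within 8% of the best bitscore contribute to best_taxa_dict.

max_bitscore = 250.0
bitscore_perc_cutoff = 8.0
top_bitscore = max_bitscore * (1 - bitscore_perc_cutoff / 100.0)  # 230.0
# Hits with bitscore >= 230.0 are counted; weaker hits are ignored.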
666 def process_single_file(anno_file_path, unanno_file_path, args, log_messages):
667 """
668 Execute the complete BLAST annotation processing workflow.
669
670 This function performs FASTA parsing, BLAST parsing, taxonomy resolution,
671 per-read and aggregated reporting, summary statistics computation, and
672 generation of optional plot and table outputs.
673
674 All validations, errors and counters are written into the log collector.
675
676 :param anno_file_path: BLAST annotation file.
677 :type anno_file_path: str
678 :param unanno_file_path: Input FASTA file.
631 :type unanno_file_path: str 679 :type unanno_file_path: str
632 :param args: Parsed command-line arguments containing the following attributes: 680 :param args: Parsed arguments namespace.
633 - **eval_threshold** (*float*): Maximum allowed E-value for valid hits.
634 - **eval_plot** (*str | None*): Output path for E-value distribution plot.
635 - **taxa_output** (*str | None*): Output path for taxonomic summary table.
636 - **header_anno** (*str | None*): Output path for annotated header table.
637 - **circle_data** (*str | None*): Output path for circular diagram JSON data.
638 - **anno_stats** (*str | None*): Output path for annotation statistics file.
639 - **use_counts** (*bool*): Whether to use read counts or unique taxa only.
640 - **uncertain_threshold** (*float*): Majority threshold for resolving taxonomic ambiguity.
641 :type args: argparse.Namespace 681 :type args: argparse.Namespace
642 682 :param log_messages: Log collection list.
643 :return: None. This function writes multiple output files to the paths 683 :type log_messages: list[str]
644 specified in ``args`` 684 :return: None
645 :rtype: None 685 :rtype: None
646 686 """
647 :raises FileNotFoundError: If the annotated BLAST or unannotated FASTA file cannot be found. 687 log_message(log_messages, f"Starting processing for FASTA")
648 :raises ValueError: If file content is malformed or missing required columns. 688 log_message(log_messages, "=== PARAMETERS USED ===")
649 689
650 :notes: 690 skip_keys = {
651 - The function first groups BLAST hits by query ID and filters out invalid or 691 "input_anno",
652 low-confidence taxa. 692 "input_unanno",
653 - For each query, the best BLAST hit is determined based on bitscore and E-value. 693 "eval_plot",
654 - The total number of unique annotated sequences is inferred from sequence 694 "taxa_output",
655 headers containing a ``count=`` field. 695 "circle_data",
656 - Progress and warnings are printed to standard output for traceability. 696 "header_anno",
657 """ 697 "log",
658 # Core data structures - one per read 698 "filtered_fasta",
659 read_data = [] # List of dicts, one per read 699 }
700
701 for key, value in vars(args).items():
702 if key in skip_keys:
703 continue
704 log_message(log_messages, f"{key}: {value}")
705
706 log_message(log_messages, "=== END PARAMETERS ===")
707 unanno_headers_ordered, total_unique_count, invalid_count = read_unannotated_fasta(unanno_file_path,
708 args.ignore_illuminapairend_type, args.ignore_obiclean_type, args.min_support, args.filtered_fasta,
709 log_messages)
710
711 unanno_set = set(unanno_headers_ordered)
712
713 if not os.path.exists(anno_file_path):
714 log_message(log_messages, f"Error: Input file {anno_file_path} not found")
715 with open(args.log, 'w') as f:
716 print('this is still not going right')
717 f.write("\n".join(log_messages))
718 return
719
720 log_message(log_messages, f"Reading BLAST annotations")
721 blast_groups = parse_blast_output(anno_file_path, args.eval_threshold, args.ignore_taxonomy, args.ignore_rank,
722 args.min_coverage, args.min_identity, args.min_bitscore, args.ignore_seqids, log_messages)
723
724 extra_blast_qids = [q for q in blast_groups.keys() if q not in unanno_set]
725 if extra_blast_qids:
726 log_message(log_messages,
727 f"Note: {len(extra_blast_qids)} BLAST q_ids not in FASTA (showing up to 10): {extra_blast_qids[:10]}")
728
729 read_data = []
660 headers = [] 730 headers = []
661 seen_reads = set() 731 seen_reads = set()
662 annotated_unique_count = 0 732 annotated_unique_count = 0
663 total_unique_count = 0 733
664 734 total_headers_for_annotation = len(unanno_headers_ordered)
665 # Current read processing 735 reads_with_hits = 0
666 current_q_id = '' 736
667 current_read_info = None 737 for header in unanno_headers_ordered:
668 738 if header not in blast_groups:
669 # Check if file exists and has content 739 continue
670 if not os.path.exists(anno_file_path): 740 hits = blast_groups[header]
671 print(f"Error: Input file {anno_file_path} not found") 741 current_read_info = process_single_query(header, hits, args.bitscore_perc_cutoff)
672 return 742 read_data.append(current_read_info)
673 743 reads_with_hits += 1
674 unanno_headers_ordered = [] 744
675 if os.path.exists(unanno_file_path): 745 if header not in seen_reads:
676 with open(unanno_file_path, 'r') as f: 746 seen_reads.add(header)
677 for line in f: 747 headers.append(header)
678 if line.startswith('>'): 748 try:
679 header = line.split()[0].strip('>') 749 count = int(header.split('(')[1].split(')')[0])
680 unanno_headers_ordered.append(header) 750 except (IndexError, ValueError, AttributeError) as e:
681 print(f"Found {len(unanno_headers_ordered)} headers in unannotated file") 751 log_message(log_messages, f"Error {e}: could not parse count in header: {header}")
682 else: 752 count = 0
683 print(f"Warning: Unannotated file {unanno_file_path} not found") 753 annotated_unique_count += count
684 754
685 # Counter to track which read we're currently expecting 755 reads_without_hits = total_headers_for_annotation - reads_with_hits
686 unanno_set = set(unanno_headers_ordered) 756 log_message(log_messages, f"ANNOTATION: total FASTA headers considered: {total_headers_for_annotation}")
687 read_counter = 0 757 log_message(log_messages, f"ANNOTATION: reads with BLAST hits: {reads_with_hits}")
688 758 log_message(log_messages, f"ANNOTATION: reads without BLAST hits: {reads_without_hits}")
689 # Count total unique sequences from unannotated file 759 log_message(log_messages, f"ANNOTATION: unique annotated count (from header counts): {annotated_unique_count}")
690 if os.path.exists(unanno_file_path): 760 log_message(log_messages, f"ANNOTATION: total unique count (from FASTA): {total_unique_count}")
691 with open(unanno_file_path, 'r') as f: 761
692 for line in f:
693 if "count=" in line:
694 count = int(line.split("count=")[1].split(";")[0])
695 total_unique_count += count
696
697 # Process annotated file
698 from collections import OrderedDict
699 blast_groups = OrderedDict() # q_id -> list of hit dicts
700
701 # Group BLAST hits per q_id
702 try:
703 with open(anno_file_path, 'r') as f:
704 for line in f:
705 if line.startswith("#"):
706 continue
707 parts = line.strip().split('\t')
708 if len(parts) < 7:
709 continue
710
711 q_id = parts[0]
712 try:
713 e_val = float(parts[6])
714 except Exception:
715 continue
716 if e_val > args.eval_threshold:
717 continue
718 valid_taxa = True
719 taxon = parts[-1].strip()
720 if is_invalid_taxon(taxon):
721 print(f"Skipping {q_id}: invalid taxon: {taxon}")
722 valid_taxa = False
723
724 identity = parts[4] if len(parts) > 4 else ''
725 cov = parts[5] if len(parts) > 5 else ''
726 bitscore = float(parts[7] if len(parts) > 7 else '')
727 source = parts[8] if len(parts) > 8 else ''
728 if valid_taxa:
729 hit = {
730 'e_val': e_val,
731 'identity': identity,
732 'cov': cov,
733 'bitscore': bitscore,
734 'source': source,
735 'taxon': taxon
736 }
737 blast_groups.setdefault(q_id, []).append(hit)
738 except Exception as e:
739 print(f"Error reading BLAST file {anno_file_path}: {e}")
740 return
741
742 # BLAST-reads not in fasta
743 extra_blast_qids = [q for q in blast_groups.keys() if q not in unanno_set]
744 if extra_blast_qids:
745 print(f"Note: {len(extra_blast_qids)} BLAST q_ids not in FASTA: {extra_blast_qids[:10]}...")
746
747 # Process fasta order
748 for header in unanno_headers_ordered:
749 if header not in blast_groups:
750 print(f"Skipping {header}: no BLAST hits")
751 continue
752 current_read_info = {
753 'taxa_dict': {},
754 'header': header,
755 'e_values': set(),
756 'identities': set(),
757 'coverages': set(),
758 'sources': set(),
759 'bitscores': set(),
760 'best_eval': float(),
761 'best_identity': '',
762 'best_coverage': '',
763 'best_source': '',
764 'best_bitscore': -1,
765 'best_taxa_dict' : {}
766 }
767
768 # process all hits for this header
769 for hit in blast_groups[header]:
770 e_val = hit['e_val']
771 identity = hit['identity']
772 cov = hit['cov']
773 bitscore = hit['bitscore']
774 source = hit['source']
775 taxon = hit['taxon']
776
777 current_read_info['e_values'].add(str(e_val))
778 current_read_info['identities'].add(identity)
779 current_read_info['coverages'].add(cov)
780 current_read_info['sources'].add(source)
781 current_read_info['bitscores'].add(bitscore)
782
783 # Keep track of best hit
784 if (bitscore > current_read_info['best_bitscore'] or
785 (bitscore == current_read_info['best_bitscore'] and e_val < current_read_info['best_eval'])):
786 # New best set → reset
787 current_read_info['best_bitscore'] = bitscore
788 current_read_info['best_eval'] = e_val
789 current_read_info['best_identity'] = identity
790 current_read_info['best_coverage'] = cov
791 current_read_info['best_source'] = source
792 current_read_info['best_taxa_dict'] = {taxon: 1}
793
794 # If hit is of same quality: add to existing dictionary
795 elif bitscore == current_read_info['best_bitscore'] and e_val == current_read_info['best_eval']:
796 td = current_read_info['best_taxa_dict']
797 td[taxon] = td.get(taxon, 0) + 1
798 # Might be unnecessary, best_taxa_dict seems to be the same as taxa_dict
799 # in output. Too scared to remove.
800 if len(current_read_info['e_values']) == 1:
801 taxa_dict = current_read_info['taxa_dict']
802 taxa_dict[taxon] = taxa_dict.get(taxon, 0) + 1
803 read_data.append(current_read_info)
804
805 if header not in seen_reads:
806 seen_reads.add(header)
807 headers.append(header)
808 try:
809 count = int(header.split('(')[1].split(')')[0])
810 except IndexError or ValueError or AttributeError:
811 count = 0
812 annotated_unique_count += count
813
814 # Extract data for different functions from unified structure
815 taxa_dicts = [read['taxa_dict'] for read in read_data] 762 taxa_dicts = [read['taxa_dict'] for read in read_data]
816 td = [read['best_taxa_dict'] for read in read_data] 763 td = [read['best_taxa_dict'] for read in read_data]
817 # Generate outputs based on arguments 764
818 if args.eval_plot: 765 if args.eval_plot:
819 e_val_sets = [read['e_values'] for read in read_data] 766 e_val_sets = [read['e_values'] for read in read_data]
820 make_eval_plot(e_val_sets, args.eval_plot) 767 make_eval_plot(e_val_sets, args.eval_plot, log_messages)
768 log_message(log_messages, "E-value plot written successfully")
821 769
822 if args.taxa_output: 770 if args.taxa_output:
823 process_taxa_output(td, args.taxa_output, args.uncertain_threshold) 771 process_taxa_output(td, args.taxa_output, args.uncertain_threshold)
772 log_message(log_messages, "Taxa summary written successfully")
824 773
825 if args.header_anno: 774 if args.header_anno:
826 # Extract best values efficiently - single list comprehension per metric 775 source_for_headers = [[read['sources']] for read in read_data]
827 e_values_for_headers = [[read['best_eval']] for read in read_data] 776 seq_id_for_headers = [[read['seq_id']] for read in read_data]
828 identity_for_headers = [[read['best_identity']] for read in read_data] 777 process_header_annotations(td, headers, args.header_anno, args.uncertain_threshold, source_for_headers,
829 coverage_for_headers = [[read['best_coverage']] for read in read_data] 778 seq_id_for_headers, log_messages)
830 source_for_headers = [[read['best_source']] for read in read_data] 779 log_message(log_messages, "Header annotations written successfully")
831 bitscore_for_headers= [[read['best_bitscore']] for read in read_data]
832 process_header_annotations(td, headers, e_values_for_headers,
833 args.header_anno, args.uncertain_threshold,
834 identity_for_headers, coverage_for_headers, source_for_headers, bitscore_for_headers)
835 780
836 if args.circle_data: 781 if args.circle_data:
837 create_circle_diagram(td, args.circle_data, args.use_counts, 782 create_circle_data(td, args.circle_data, args.use_counts, args.uncertain_threshold)
838 args.uncertain_threshold) 783 log_message(log_messages, "Circle diagram JSON written successfully")
839 784
840 if args.anno_stats: 785 stats = calculate_annotation_stats(len(taxa_dicts), unanno_file_path, annotated_unique_count,
841 stats = calculate_annotation_stats(len(taxa_dicts), unanno_file_path, 786 int(total_unique_count))
842 annotated_unique_count, total_unique_count) 787
843 with open(args.anno_stats, 'w') as f: 788 log_message(log_messages, "=== ANNOTATION STATISTICS ===")
844 f.write('metric\tvalue\n') 789 for key, value in stats.items():
845 for key, value in stats.items(): 790 log_message(log_messages, f"{key}: {value}")
846 f.write(f'{key}\t{value}\n') 791
847 792 with open(args.log, 'w') as f:
848 def is_invalid_taxon(taxon: str) -> bool: 793 f.write("\n".join(log_messages))
794
795
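`log_message` is called throughout this hunk but defined outside the compared region; a minimal sketch of what such a helper could look like, assuming it simply appends to the in-memory list that is later joined and written to the --log file above (the stdout echo is an assumption, not taken from the source):

def log_message(log_messages, message, echo=True):
    # Sketch only: the real helper is defined elsewhere in this file.
    # Collect the message for the --log output and optionally echo it to stdout.
    log_messages.append(message)
    if echo:
        print(message)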
796 def check_header_string(header_line, invalid_header_string, invalid_line_string):
797 """
798 Check the header line against the configured filter strings; return False if any of them is found (the read is filtered out), True otherwise.
799 """
800 invalid_string_list = parse_list_param(invalid_header_string) + parse_list_param(invalid_line_string)
801 for string in invalid_string_list:
802 if string.lower() == 'singleton':
803 string = "obiclean_status={'XXX': 's'}"
804 elif string.lower() == 'variant':
805 string = "obiclean_status={'XXX': 'i'}"
806 elif string.lower() == 'head':
807 string = "obiclean_status={'XXX': 'h'}"
808 elif string.lower() == 'pairend' or string.lower() == 'paired end':
809 string = "PairEnd"
810 elif string.lower() == 'consensus' or string.lower() == 'cons':
811 string = "CONS"
812 if string in header_line:
813 return False
814 return True
815
816
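A quick trace of check_header_string: a friendly keyword such as 'singleton' is rewritten to the literal obiclean marker before the substring test, so a header carrying that marker is rejected. The header below is invented for illustration:

# Hypothetical header; only the obiclean_status fragment matters here.
header = ">read_001(42); obiclean_status={'XXX': 's'}; count=42"
keep = check_header_string(header, 'singleton', '')
# keep == False: 'singleton' maps to "obiclean_status={'XXX': 's'}", which occurs in the
# header, so the read is filtered out. Headers without any configured marker return True.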
817 def is_insufficient_hit(hit, min_cov, min_id, min_bit, min_eval):
818 """
819 Check whether a hit fails any of the quality thresholds; returns True when the hit should be discarded.
820 """
821 if float(hit['cov']) < min_cov:
822 return True
823 elif float(hit['identity']) < min_id:
824 return True
825 elif float(hit['bitscore']) < min_bit:
826 return True
827 elif float(hit['e_val']) > min_eval:  # e-values are fractional; min_eval is the maximum allowed e-value
828 return True
829 else:
830 return False
831
832
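To make the threshold semantics concrete (the values below are examples, not the tool's defaults): min_eval is an upper bound on the e-value, while the other three parameters are lower bounds:

# Illustrative hit dict matching the fields checked above; values are made up.
hit = {'cov': '97.5', 'identity': '99.2', 'bitscore': '310', 'e_val': 3e-40}
is_insufficient_hit(hit, min_cov=80.0, min_id=90.0, min_bit=50.0, min_eval=1e-10)
# -> False: every metric passes, so the hit is kept.
# A hit with an e-value above min_eval (or any metric below its minimum) returns True.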
833 def is_invalid_taxon(taxon: str, invalid_taxa, invalid_ranks) -> bool:
849 """ 834 """
850 Determine whether a given taxonomic path should be considered invalid and excluded from analysis. 835 Determine whether a given taxonomic path should be considered invalid and excluded from analysis.
851 836 """
852 This function identifies and filters out taxonomic strings that contain unreliable, 837 invalid_string_list = parse_list_param(invalid_ranks) + parse_list_param(invalid_taxa)
853 incomplete, or placeholder taxonomic information. It is used to clean BLAST or
854 classification outputs before aggregation and visualization.
855
856 :param taxon: Full taxonomic path as a string
857 :type taxon: str
858 :return: ``True`` if the taxon is considered invalid according to the criteria below;
859 otherwise ``False``.
860 :rtype: bool
861
862 :criteria:
863 A taxon is considered invalid if:
864 - It contains the substring ``"invalid taxid"`` or ``"GenBank invalid taxon"``.
865 - It includes the term ``"environmental sample"`` (non-specific environmental sequence).
866 - It has any hierarchical level labeled as ``"unknown"`` followed by a more
867 specific level (e.g., *unknown genus / Homo sapiens*).
868
869 :note:
870 This function performs a **case-insensitive** check and assumes the
871 input taxonomic path uses ``" / "`` as a level delimiter.
872 """
873 taxon_lower = taxon.lower() 838 taxon_lower = taxon.lower()
874 839 for string in invalid_string_list:
875 if "invalid taxid" in taxon_lower or "genbank invalid taxon" in taxon_lower: 840 if string in taxon_lower:
876 return True
877
878 if "environmental sample" in taxon_lower:
879 return True
880
881 # check for "unknown" followed by something deeper
882 parts = taxon.split(" / ")
883 for i, part in enumerate(parts):
884 if "unknown" in part.lower() and i < len(parts) - 1:
885 # there is something more specific after an unknown level
886 return True 841 return True
887 return False 842 return False
843
844
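The rewritten is_invalid_taxon is purely substring-based: the taxonomic path is lowercased and compared against the user-supplied invalid_taxa / invalid_ranks lists, which should therefore be given in lowercase. A worked example with made-up parameter values:

# Hypothetical taxonomic path and filter lists.
taxon = "Eukaryota / Chordata / Mammalia / unknown family / unknown genus / environmental sample"
is_invalid_taxon(taxon, invalid_taxa='environmental sample,uncultured', invalid_ranks='unknown family')
# -> True: 'environmental sample' (and 'unknown family') occur in the lowercased path.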
845 def parse_list_param(param):
846 """
847 Safely convert a comma-separated Galaxy text parameter into a list.
848 """
849 if not param:
850 return []
851 return [x.strip() for x in param.strip(', \n').split(',') if x.strip()]
888 852
889 853
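For reference, the parameter parser above drops surrounding whitespace and empty items:

parse_list_param(' singleton, head ,, variant \n')   # -> ['singleton', 'head', 'variant']
parse_list_param(None)                                # -> []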
890 def main(arg_list=None): 854 def main(arg_list=None):
891 """ 855 """
892 Entry point for Galaxy-compatible BLAST annotation processing. 856 Entry point for Galaxy-compatible BLAST annotation processing.
899 :rtype: None 863 :rtype: None
900 864
901 Notes: 865 Notes:
902 Calls `process_single_file` with parsed arguments. 866 Calls `process_single_file` with parsed arguments.
903 """ 867 """
868 log_messages = []
904 args = parse_arguments(arg_list) 869 args = parse_arguments(arg_list)
905 for output_file in [args.eval_plot, args.header_anno, args.taxa_output, args.circle_data, args.anno_stats]: 870 process_single_file(args.input_anno, args.input_unanno, args, log_messages)
906 if output_file:
907 os.makedirs(os.path.dirname(output_file), exist_ok=True)
908 process_single_file(args.input_anno, args.input_unanno, args)
909 print("Processing completed successfully") 871 print("Processing completed successfully")
910 872
911 873
912 if __name__ == "__main__": 874 if __name__ == "__main__":
913 main() 875 main()
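When exercising this entry point outside Galaxy, the CLI tokens can be passed via arg_list instead of relying on sys.argv. A schematic sketch, assuming the long-option spellings mirror the args attributes used above (e.g. --input-anno for args.input_anno); the authoritative set of required options is defined in parse_arguments:

# Schematic only: paths are hypothetical and further required options are omitted.
test_args = [
    '--input-anno', 'blast_hits.tab',
    '--input-unanno', 'reads.fasta',
    '--log', 'run.log',
    # ...remaining required options as declared in parse_arguments...
]
main(arg_list=test_args)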