comparison blast_annotations_processor.py @ 2:9ca209477dfd draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:43:36 +0000
parents a3989edf0a4a
children
comparison
1:2acf82433aa4 2:9ca209477dfd
21 - Headers in BLAST and FASTA should correspond. 21 - Headers in BLAST and FASTA should correspond.
22 - Uses matplotlib, pandas, and openpyxl for visualization and reporting. 22 - Uses matplotlib, pandas, and openpyxl for visualization and reporting.
23 """ 23 """
24 24
25 import argparse 25 import argparse
26 from collections import defaultdict
27 import json 26 import json
28 import os 27 import os
28 from collections import OrderedDict
29 from collections import defaultdict
30
29 import matplotlib.pyplot as plt 31 import matplotlib.pyplot as plt
32 import numpy as np
30 import pandas as pd 33 import pandas as pd
31 import numpy as np
32 import openpyxl
33 34
34 # Default taxonomic levels 35 # Default taxonomic levels
35 TAXONOMIC_LEVELS = ["K", "P", "C", "O", "F", "G", "S"] 36 TAXONOMIC_LEVELS = ["K", "P", "C", "O", "F", "G", "S"]
36 37
38
37 def parse_arguments(arg_list=None): 39 def parse_arguments(arg_list=None):
38 """Parse command line arguments for cluster processing.""" 40 """Parse command line arguments for cluster processing."""
39 parser = argparse.ArgumentParser( 41 parser = argparse.ArgumentParser(description='Process BLAST annotation results for Galaxy')
40 description='Process BLAST annotation results for Galaxy' 42
41 ) 43 parser.add_argument('--input-anno', required=True, help='Annotated BLAST output file (tabular format)')
42 # Required inputs 44 parser.add_argument('--input-unanno', required=True, help='Unannotated sequences file (FASTA format)')
43 parser.add_argument('--input-anno', required=True, 45
44 help='Annotated BLAST output file (tabular format)') 46 parser.add_argument('--eval-plot', help='Output path for E-value plot (PNG)')
45 parser.add_argument('--input-unanno', required=True, 47 parser.add_argument('--taxa-output', help='Output path for taxa report (tabular)')
46 help='Unannotated sequences file (FASTA format)') 48 parser.add_argument('--circle-data', help='Output path for circular taxonomy data (txt)')
47 # Optional outputs 49 parser.add_argument('--header-anno', help='Output path for header annotations (tabular/xlsx)')
48 parser.add_argument('--eval-plot', 50 parser.add_argument('--log', help='Output path for log file (txt)', required=True)
49 help='Output path for E-value plot (PNG)') 51 parser.add_argument('--filtered-fasta', required=True,
50 parser.add_argument('--taxa-output', 52 help='Filtered fasta file (fasta format) for downstream analysis')
51 help='Output path for taxa report (tabular)') 53
52 parser.add_argument('--circle-data', 54 parser.add_argument('--uncertain-threshold', type=float, default=0.9, required=True,
53 help='Output path for circular taxonomy data (txt)')
54 parser.add_argument('--header-anno',
55 help='Output path for header annotations (tabular/xlsx)')
56 parser.add_argument('--anno-stats',
57 help='Output path for annotation statistics (txt)')
58 # Parameters
59 parser.add_argument('--uncertain-threshold', type=float, default=0.9,
60 help='Threshold for resolving taxonomic conflicts (default: 0.9)') 55 help='Threshold for resolving taxonomic conflicts (default: 0.9)')
61 parser.add_argument('--eval-threshold', type=float, default=1e-10, 56 parser.add_argument('--eval-threshold', default='1e-10', type=float, required=True,
62 help='E-value threshold for filtering results (default: 1e-10)') 57 help='E-value threshold for filtering results (default: 1e-10)')
63 parser.add_argument('--use-counts', action='store_true', default=True, 58 parser.add_argument('--use-counts', action='store_true', default=False, required=False,
64 help='Use read counts in circular data') 59 help='Use read counts in circular data')
60 parser.add_argument('--ignore-rank', default='unknown', required=False,
61 help='Ignore rank when containing this text')
62 parser.add_argument('--ignore-taxonomy', default='environmental', required=False,
63 help="Don't use taxonomy containing this taxonomy")
64 parser.add_argument('--bitscore-perc-cutoff', type=float, default=8, required=True,
65 help='Bitscore percentage cutoff for considered hits')
66 parser.add_argument('--min-bitscore', type=int, default=0, required=True,
67 help='Minimum bitscore threshold for hits')
68 parser.add_argument('-iot', '--ignore-obiclean-type', type=str, default='singleton', required=False,
69 help='Ignore sequences with this obiclean type')
70 parser.add_argument('-iit', '--ignore-illuminapairend-type', type=str, default='pairend', required=False,
71 help='Ignore sequences with this illumina paired end output type')
72 parser.add_argument('--min-identity', type=int, default=0, required=True,
73 help='Minimum sequence identity to consider a hit')
74 parser.add_argument('--min-coverage', type=int, default=0, required=True,
75 help='Minimum sequence coverage to consider a hit')
76 parser.add_argument('--ignore-seqids', type=str, default='', required=False,
77 help='Ignore sequences with this sequence identifier')
78 parser.add_argument('--min-support', type=int, default=0, required=True,
79 help='A taxon is kept only if it (or its descendants) have at least N reads assigned.')
65 80
66 return parser.parse_args(arg_list) 81 return parser.parse_args(arg_list)
67 82
68 83
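As a usage sketch (not part of the changeset): a minimal call to the revised parse_arguments, supplying every option the new parser marks as required. All file names below are placeholders, and the example assumes the module is importable.

args = parse_arguments([
    '--input-anno', 'blast_hits.tabular',   # placeholder annotated BLAST table
    '--input-unanno', 'reads.fasta',        # placeholder dereplicated FASTA
    '--log', 'run.log',
    '--filtered-fasta', 'filtered.fasta',
    '--uncertain-threshold', '0.9',
    '--eval-threshold', '1e-10',
    '--bitscore-perc-cutoff', '8',
    '--min-bitscore', '0',
    '--min-identity', '0',
    '--min-coverage', '0',
    '--min-support', '0',
])
print(args.eval_threshold)  # 1e-10, converted to float by argparse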
84 def log_message(log_messages, msg):
85 """
86 Helper to both print and collect log messages.
87 """
88 if log_messages is not None:
89 log_messages.append(msg)
90
91
69 def list_to_string(x): 92 def list_to_string(x):
70 """ 93 """
71 Convert a list, pandas Series, or numpy array to a comma-separated string.""" 94 Convert a list, pandas Series, or numpy array to a comma-separated string.
95 """
72 if isinstance(x, (list, pd.Series, np.ndarray)): 96 if isinstance(x, (list, pd.Series, np.ndarray)):
73 return ", ".join(map(str, x)) 97 return ", ".join(map(str, x))
74 return str(x) 98 return str(x)
75 99
76 def make_eval_plot(e_val_sets, output_path): 100
77 """ 101 def make_eval_plot(e_val_sets, output_path, log_messages):
78 Create a bar plot of E-values per read. 102 """
79 103 Generate an E-value distribution plot.
80 :param e_val_sets: List of sets containing E-values per read. 104
105 This function aggregates E-values per read, plots them on a logarithmic axis,
106 and produces a visual summary of how the best hits are distributed across reads.
107 If no E-values are available, no plot is created.
108
109 :param e_val_sets: Set of E-values per read.
81 :type e_val_sets: list[set[str]] 110 :type e_val_sets: list[set[str]]
82 :param output_path: Path to save the plot (PNG). 111 :param output_path: Output PNG file.
83 :type output_path: str 112 :type output_path: str
113 :param log_messages: Log collection list.
114 :type log_messages: list[str]
84 :return: None 115 :return: None
85 :rtype: None 116 :rtype: None
86 """ 117 """
87 if not e_val_sets: 118 if not e_val_sets:
88 print("No E-values to plot") 119 log_message(log_messages, "No E-values to plot")
89 return None 120 return None
90 121
91 # Convert string E-values to floats and sort 122 processed_sets = sorted([sorted(float(e_val) for e_val in e_val_set) for e_val_set in e_val_sets])
92 processed_sets = sorted([sorted(float(e_val) for e_val in e_val_set)
93 for e_val_set in e_val_sets])
94 123
95 bar_positions = [] 124 bar_positions = []
96 bar_heights = [] 125 bar_heights = []
97 126
98 for i, e_set in enumerate(processed_sets): 127 for i, e_set in enumerate(processed_sets):
109 plt.bar(bar_positions, bar_heights, width=0.29, color=['blue', 'red']) 138 plt.bar(bar_positions, bar_heights, width=0.29, color=['blue', 'red'])
110 139
111 plt.yscale('log') 140 plt.yscale('log')
112 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) 141 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7)
113 plt.ylabel("e-values") 142 plt.ylabel("e-values")
114 plt.xticks(ticks=range(len(processed_sets)), 143 plt.xticks(ticks=range(len(processed_sets)), labels=[f'{i + 1}' for i in range(len(processed_sets))])
115 labels=[f'{i + 1}' for i in range(len(processed_sets))])
116 144
117 plt.tight_layout() 145 plt.tight_layout()
118 plt.savefig(output_path, format='png') 146 plt.savefig(output_path, format='png')
119 plt.close() 147 plt.close()
120 return None 148 return None
145 - 'percentage_unique_annotated' (float): Percentage of unique annotated sequences. 173 - 'percentage_unique_annotated' (float): Percentage of unique annotated sequences.
146 - 'unique_annotated' (int): Number of unique annotated sequences. 174 - 'unique_annotated' (int): Number of unique annotated sequences.
147 - 'total_unique' (int): Total number of unique sequences. 175 - 'total_unique' (int): Total number of unique sequences.
148 :rtype: dict[str, float | int] 176 :rtype: dict[str, float | int]
149 """ 177 """
150 # Count total sequences in unannotated file
151 total_sequences = 0 178 total_sequences = 0
152 with open(unanno_file_path, 'r') as f: 179 with open(unanno_file_path, 'r') as f:
153 total_sequences = sum(1 for line in f if line.startswith('>')) 180 total_sequences = sum(1 for line in f if line.startswith('>'))
154 181
155 percentage_annotated = (anno_count / total_sequences * 100) if total_sequences > 0 else 0 182 percentage_annotated = (anno_count / total_sequences * 100) if total_sequences > 0 else 0
156 percentage_unique_annotated = (unique_anno_count / total_unique_count * 100) if total_unique_count > 0 else 0 183 percentage_unique_annotated = (unique_anno_count / total_unique_count * 100) if total_unique_count > 0 else 0
157 184
158 return { 185 return {'percentage_annotated': percentage_annotated, 'annotated_sequences': anno_count,
159 'percentage_annotated': percentage_annotated, 186 'total_sequences': total_sequences, 'percentage_unique_annotated': percentage_unique_annotated,
160 'annotated_sequences': anno_count, 187 'unique_annotated': unique_anno_count, 'total_unique': total_unique_count}
161 'total_sequences': total_sequences, 188
162 'percentage_unique_annotated': percentage_unique_annotated, 189
163 'unique_annotated': unique_anno_count, 190 def resolve_tax_majority(taxon_counts, uncertain_threshold):
164 'total_unique': total_unique_count 191 total_count = sum(taxon_counts.values())
165 } 192 if total_count == 0:
166 193 return "Uncertain taxa", "Uncertain taxa"
167 def _choose_taxon(level_counts: dict, threshold: float): 194 most_common_taxon, count = max(taxon_counts.items(), key=lambda x: x[1])
168 """ 195 # Use most common if above threshold
169 Determine the most representative taxonomic level based on counts and a threshold. 196 if count / total_count >= uncertain_threshold:
170 197 return most_common_taxon, most_common_taxon
171 This function evaluates a dictionary of taxonomic level counts and returns the 198 conflicting_taxa = list(taxon_counts.keys())
172 level name that holds a strict majority. If no level reaches the threshold, or if 199 conflicting_levels = [taxon.split(" / ") for taxon in conflicting_taxa]
173 there is a tie for the highest count, the function returns ``None``. 200
174 201 # Resolve uncertainty per level
175 :param level_counts: A dictionary mapping taxonomic level names (e.g., "species", 202 for level_idx in range(min(len(level) for level in conflicting_levels)):
176 "genus") to their respective counts. 203 current_level_names = {level[level_idx] for level in conflicting_levels}
177 :type level_counts: dict[str, int] 204 if len(current_level_names) > 1:
178 :param threshold: The minimum fraction (between 0 and 1) of the total counts that 205 lowest_common_path = conflicting_levels[0][:level_idx + 1]
179 the top taxonomic level must have to be considered dominant. 206 lowest_common_path[-1] = 'Uncertain taxa'
180 :type threshold: float 207 small_output = " / ".join(lowest_common_path)
181 208 min_conflicting_level = min(len(level) for level in conflicting_levels)
182 :return: The name of the taxonomic level that meets or exceeds the threshold and 209 uncertain_path = conflicting_levels[0][:level_idx + 1]
183 is not tied with another level. Returns ``None`` if no level qualifies or 210 uncertain_path[-1] = 'Uncertain taxa'
184 if there is a tie. 211 for _ in range(level_idx + 1, min_conflicting_level):
185 :rtype: str | None 212 uncertain_path.append('Uncertain taxa')
186 213 return small_output, " / ".join(uncertain_path)
187 :example: 214 return conflicting_taxa[0], conflicting_taxa[0]
188 _choose_taxon({"species": 6, "genus": 3, "family": 1}, 0.5)
189 'species'
190 _choose_taxon({"species": 4, "genus": 4}, 0.6)
191 None
192 _choose_taxon({"species": 2, "genus": 3}, 0.6)
193 'genus'
194 """
195 total = sum(level_counts.values())
196 if total == 0:
197 return None
198 items = sorted(level_counts.items(), key=lambda x: x[1], reverse=True)
199 top_name, top_count = items[0]
200 # tie -> no clear majority
201 if sum(1 for _, c in items if c == top_count) > 1:
202 return None
203 if top_count / total >= threshold:
204 return top_name
205 return None
206
207
208 def _resolve_conflicts_recursive(conflicting: list, level_idx: int, threshold: float, max_levels: int, prefix=None):
209 """
210 Recursively resolve taxonomic classification conflicts across multiple levels.
211
212 This function attempts to resolve conflicts between multiple taxonomic paths by
213 recursively evaluating each taxonomic level. At each level, it determines whether
214 there is a clear majority (based on a specified threshold) for a taxon name. If
215 a majority is found, the function continues to the next level using only the
216 taxa that match that name. If no majority exists, the function constructs
217 "uncertain" outputs to indicate ambiguous classification.
218
219 :param conflicting: List of tuples containing full taxonomic paths and their
220 associated counts, e.g. ``[("Bacteria / Proteobacteria / Gammaproteobacteria", 10),
221 ("Bacteria / Proteobacteria / Alphaproteobacteria", 5)]``.
222 :type conflicting: list[tuple[str, int]]
223 :param level_idx: The current taxonomic level index being evaluated (0-based).
224 :type level_idx: int
225 :param threshold: The minimum fraction (between 0 and 1) required for a strict
226 majority decision at a given taxonomic level.
227 :type threshold: float
228 :param max_levels: The maximum number of taxonomic levels to evaluate before
229 stopping recursion.
230 :type max_levels: int
231 :param prefix: A list of already-resolved taxon names up to the previous level.
232 Defaults to ``None``, in which case an empty list is used.
233 :type prefix: list[str] | None
234
235 :return: A tuple containing:
236 - **short_output** (str): A simplified taxonomic path with "Uncertain taxa"
237 appended if no consensus was reached at the current level.
238 - **full_output** (str): The full taxonomic path, filled with additional
239 "Uncertain taxa" entries up to the minimal conflicting level depth.
240 :rtype: tuple[str, str]
241
242 :example:
243 conflicts = [
244 ... ("Bacteria / Proteobacteria / Gammaproteobacteria", 10),
245 ... ("Bacteria / Proteobacteria / Alphaproteobacteria", 5)
246 ... ]
247 _resolve_conflicts_recursive(conflicts, level_idx=0, threshold=0.6, max_levels=5)
248 ('Bacteria / Proteobacteria / Uncertain taxa',
249 'Bacteria / Proteobacteria / Uncertain taxa / Uncertain taxa')
250 """
251 if prefix is None:
252 prefix = []
253
254 if not conflicting:
255 return "", ""
256
257 # split paths (keeps sync)
258 conflicting_levels = [t.split(" / ") for t, _ in conflicting]
259 min_conflicting_level = min(len(p) for p in conflicting_levels)
260
261 # If only one full path remains -> return it (both short & full)
262 if len(conflicting) == 1:
263 return conflicting[0][0], conflicting[0][0]
264
265 # If we've reached deepest comparable level or max_levels -> pick most supported full path
266 if level_idx >= min_conflicting_level or level_idx >= max_levels:
267 full_counts = defaultdict(int)
268 for t, c in conflicting:
269 full_counts[t] += c
270 chosen_full = max(full_counts.items(), key=lambda x: x[1])[0]
271 return chosen_full, chosen_full
272
273 # Count names at this level
274 level_counts = defaultdict(int)
275 for taxon, count in conflicting:
276 parts = taxon.split(" / ")
277 if level_idx < len(parts):
278 level_counts[parts[level_idx]] += count
279
280 # If everyone agrees at this level, append that name and go deeper
281 if len(level_counts) == 1:
282 name = next(iter(level_counts))
283 return _resolve_conflicts_recursive(conflicting, level_idx + 1, threshold, max_levels, prefix + [name])
284
285 # Check for a strict majority at this level
286 chosen_level_name = _choose_taxon(level_counts, threshold)
287 if chosen_level_name is not None:
288 # Keep only taxa that match the chosen name at this level and go deeper
289 filtered = [(t, c) for (t, c) in conflicting
290 if level_idx < len(t.split(" / ")) and t.split(" / ")[level_idx] == chosen_level_name]
291 return _resolve_conflicts_recursive(filtered, level_idx + 1, threshold, max_levels, prefix + [chosen_level_name])
292
293 # No majority at this level -> construct uncertain outputs.
294 # short: prefix + 'Uncertain taxa' (stop here)
295 short_path = prefix + ['Uncertain taxa']
296 short_output = " / ".join(short_path)
297
298 # full: prefix + 'Uncertain taxa' + fill until min_conflicting_level
299 full_path = short_path + ['Uncertain taxa'] * (min_conflicting_level - (level_idx + 1))
300 full_output = " / ".join(full_path)
301
302 return short_output, full_output
303
304
305 def resolve_taxon_majority(taxon_counts: dict, threshold: float = 0.90, max_levels: int = 7):
306 """
307 Resolve the majority consensus taxonomic classification from a set of annotated paths.
308
309 :param taxon_counts: A dictionary mapping full taxonomic paths to their respective
310 occurrence counts. For example:
311 ``{"Bacteria / Proteobacteria / Gammaproteobacteria": 10,
312 "Bacteria / Proteobacteria / Alphaproteobacteria": 5}``.
313 :type taxon_counts: dict[str, int]
314 :param threshold: Minimum fraction (between 0 and 1) required for a strict majority
315 at each taxonomic level. Defaults to ``0.90``.
316 :type threshold: float, optional
317 :param max_levels: Maximum number of taxonomic levels to evaluate before stopping
318 recursion. Defaults to ``7``.
319 :type max_levels: int, optional
320
321 :return: A tuple containing:
322 - **short_output** (str): The consensus taxonomic path, possibly ending with
323 "Uncertain taxa" if ambiguity remains.
324 - **full_output** (str): The expanded taxonomic path including placeholder
325 "Uncertain taxa" labels up to the minimal conflicting depth.
326 :rtype: tuple[str, str]
327 """
328 conflicting = list(taxon_counts.items()) # list of (path, count) keeps taxa<->count synced
329 return _resolve_conflicts_recursive(conflicting, level_idx=0, threshold=threshold, max_levels=max_levels)
330 215
331 216
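A small worked example against the new resolve_tax_majority (taxon paths and counts are invented): the dominant path below holds only 60% support, so the first rank where the paths disagree collapses to 'Uncertain taxa'.

taxon_counts = {
    "Animalia / Chordata / Actinopterygii": 6,
    "Animalia / Chordata / Amphibia": 4,
}
short, full = resolve_tax_majority(taxon_counts, 0.9)
# short == full == "Animalia / Chordata / Uncertain taxa"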
332 def process_taxa_output(taxa_dicts, output_path, uncertain_threshold): 217 def process_taxa_output(taxa_dicts, output_path, uncertain_threshold):
333 """ 218 """
334 Generate a tabular taxa report from a list of read taxonomies. 219 Generate a hierarchical taxa summary file.
335 220
336 This function processes taxonomic assignments for each read in a dataset 221 This function resolves best taxonomic assignments per read and aggregates
337 and produces a summary report similar in style to Kraken2 output. The 222 counts into taxonomic levels, producing a readable text report listing
338 report includes: 223 percentages, counts and resolved taxonomy.
339 224
340 - The number of reads that were marked as "Uncertain taxa" at each 225 :param taxa_dicts: Best taxa dictionaries per read.
341 taxonomic rank. 226 :type taxa_dicts: list[dict]
342 - For all other taxa, the number of reads assigned and their percentage 227 :param output_path: Path of output report.
343 relative to the sample total.
344 - The taxonomic rank for each assignment, indicated in the report with
345 indentation proportional to the rank depth.
346
347 :param taxa_dicts: List of taxon count dictionaries, one per read, where
348 keys are taxon names and values are counts of how often
349 they were assigned to the read.
350 :type taxa_dicts: list[dict[str, int]]
351 :param output_path: Path to save the tabular taxa report.
352 :type output_path: str 228 :type output_path: str
353 :param uncertain_threshold: Threshold for resolving conflicting taxonomic 229 :param uncertain_threshold: Confidence value for majority voting.
354 assignments (values below the threshold are
355 marked as "Uncertain taxa").
356 :type uncertain_threshold: float 230 :type uncertain_threshold: float
357 :return: None. The function writes the taxa report to ``output_path``. 231 :return: None
358 :rtype: None 232 :rtype: None
359 """ 233 """
360 uncertain_dict = {level: 0 for level in TAXONOMIC_LEVELS} 234 uncertain_dict = {level: 0 for level in TAXONOMIC_LEVELS}
361 aggregated_counts = defaultdict(int) 235 aggregated_counts = defaultdict(int)
362 236
363 for read_taxa in taxa_dicts: 237 for read_taxa in taxa_dicts:
364 resolved_taxon, _ = resolve_taxon_majority(read_taxa, uncertain_threshold) 238 if not read_taxa:
239 continue
240 resolved_taxon, _ = resolve_tax_majority(read_taxa, uncertain_threshold)
365 241
366 # Add counts for resolved taxon 242 # Add counts for resolved taxon
367 levels = resolved_taxon.split(" / ") 243 levels = resolved_taxon.split(" / ")
368 for i in range(1, len(levels) + 1): 244 for i in range(1, len(levels) + 1):
369 aggregated_counts[" / ".join(levels[:i])] += 1 245 aggregated_counts[" / ".join(levels[:i])] += 1
370 246
371 # Calculate total count 247 total_count = sum(value for key, value in aggregated_counts.items() if len(key.split(" / ")) == 1)
372 total_count = sum(value for key, value in aggregated_counts.items()
373 if len(key.split(" / ")) == 1)
374 248
375 report_lines = [] 249 report_lines = []
376 250
377 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]): 251 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]):
378 levels = taxonomy.split(" / ") 252 levels = taxonomy.split(" / ")
379 indent = " " * (len(levels) - 1) * 2 253 indent = " " * (len(levels) - 1) * 2
380 taxon_name = levels[-1] 254 taxon_name = levels[-1]
381 taxon_level_code = TAXONOMIC_LEVELS[len(levels) - 1] if len(levels) <= len(TAXONOMIC_LEVELS) else "U" 255 taxon_level_code = TAXONOMIC_LEVELS[len(levels) - 1] if len(levels) <= len(TAXONOMIC_LEVELS) else "U"
382 percentage = (count / total_count) * 100 if total_count > 0 else 0 256 percentage = (count / total_count) * 100 if total_count > 0 else 0
383 257
384 report_lines.append( 258 report_lines.append(f"{percentage:.2f}\t{count}\t{total_count}\t{taxon_level_code}\t{indent}{taxon_name}")
385 f"{percentage:.2f}\t{count}\t{total_count}\t{taxon_level_code}\t{indent}{taxon_name}"
386 )
387 259
388 if taxon_name == 'Uncertain taxa': 260 if taxon_name == 'Uncertain taxa':
389 uncertain_dict[taxon_level_code] += count 261 uncertain_dict[taxon_level_code] += count
390 262
391 # Write output
392 with open(output_path, 'w') as f: 263 with open(output_path, 'w') as f:
391 392 f.write("Uncertain count per taxonomie level" + str(uncertain_dict) + '\n') 264 f.write("Uncertain count per taxonomic level: " + str(uncertain_dict) + '\n')
394 f.write('percentage_rooted\tnumber_rooted\ttotal_num\ttaxon_level\tindentificatie\n') 265 f.write('percentage_rooted\tnumber_rooted\ttotal_num\ttaxon_level\tidentification\n')
395 f.write("\n".join(report_lines)) 266 f.write("\n".join(report_lines))
396 267
397 268
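For orientation, the body of the taxa report then consists of tab-separated rows such as the following (all numbers invented), with the taxon name indented two extra spaces per rank:

100.00  10  10  K  Animalia
100.00  10  10  P    Chordata
60.00   6   10  C      Actinopterygii
40.00   4   10  C      Uncertain taxa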
398 def process_header_annotations(taxa_dicts, headers, e_values, output_path, uncertain_threshold, 269 def process_header_annotations(taxa_dicts, headers, output_path, uncertain_threshold, source_list, seq_id_list,
399 identity_list, coverage_list, source_list, bitscore_list): 270 log_messages):
400 """ 271 """
401 Generate an Excel report with per-read and aggregated taxonomic annotations. 272 Create an Excel report summarizing individual and merged read annotations.
402 273
403 This function processes taxonomic assignments per read, along with associated 274 This function converts best taxonomy assignments into a per-read table,
404 metrics (e-value, identity, coverage, bitscore, source), and generates an Excel 275 generates an aggregated table grouped by taxon, and saves both tables to
405 file with two sheets: 276 separate sheets in a single Excel file.
406 277
407 1. **Individual_Reads** – One row per sequence, including all metrics and 278 :param taxa_dicts: Taxonomy assignments per read.
408 the resolved taxonomic path. 279 :type taxa_dicts: list[dict]
409 2. **Merged_by_Taxa** – Aggregated results per unique taxonomic path, including 280 :param headers: Original FASTA headers.
410 merged metrics and counts.
411
412 Internally, the function creates a temporary TSV file, processes it into
413 a pandas DataFrame, expands the taxonomic path into separate columns
414 (e.g., kingdom, phylum, class, …), and then writes the results into a
415 multi-sheet Excel file. Column widths are automatically adjusted for readability.
416
417 :param taxa_dicts: List of taxon count dictionaries, one per read, where keys
418 are taxa and values are their counts in the read.
419 :type taxa_dicts: list[dict[str, int]]
420 :param headers: Sequence headers corresponding to each read. Used to extract
421 read identifiers and counts.
422 :type headers: list[str] 281 :type headers: list[str]
423 :param e_values: List of e-values per read. Each element is a list of values 282 :param output_path: Output Excel file name.
424 associated with that read.
425 :type e_values: list[list[float]]
426 :param output_path: Path to the Excel file to be created.
427 :type output_path: str 283 :type output_path: str
428 :param uncertain_threshold: Threshold for resolving conflicting taxonomic 284 :param uncertain_threshold: Confidence level for taxonomy conflict resolution.
429 assignments. Values below the threshold are
430 replaced with "Uncertain taxa".
431 :type uncertain_threshold: float 285 :type uncertain_threshold: float
432 :param identity_list: Percent identity values per read. 286 :param source_list: Source identifiers per read.
433 :type identity_list: list[list[str]]
434 :param coverage_list: Coverage percentage values per read.
435 :type coverage_list: list[list[str]]
436 :param source_list: Source identifiers per read (e.g., database names).
437 :type source_list: list[list[str]] 287 :type source_list: list[list[str]]
438 :param bitscore_list: Bitscore values per read. 288 :param seq_id_list: Sequence identifiers per read.
439 :type bitscore_list: list[list[str]] 289 :type seq_id_list: list[list[str]]
440 :return: None. The function writes results to an Excel file at ``output_path``. 290 :param log_messages: Log collection list.
291 :type log_messages: list[str]
292 :return: None
441 :rtype: None 293 :rtype: None
442 :raises PermissionError: If the temporary TSV or output Excel file cannot be written
443 (e.g., file is open in another program).
444 :raises IndexError: If the input lists (headers, e_values, etc.) are not aligned
445 with ``taxa_dicts``.
446 :raises ValueError: If sequence headers are not formatted as expected for extracting counts.
447 """ 294 """
448 report_lines = [] 295 report_lines = []
449 for i, read_taxa in enumerate(taxa_dicts): 296 for i, read_taxa in enumerate(taxa_dicts):
450 _, resolved_taxon_long = resolve_taxon_majority(read_taxa, uncertain_threshold) 297 if not read_taxa:
451 try:
452 e_val = e_values[i][0] if e_values[i] else "N/A"
453 identity = identity_list[i][0] if identity_list[i] else "N/A"
454 coverage = coverage_list[i][0] if coverage_list[i] else "N/A"
455 source = source_list[i][0] if source_list[i] else "N/A"
456 bitscore = bitscore_list[i][0] if bitscore_list[i] else "N/A"
457 except (IndexError, TypeError) as e:
458 print(f"Mismatch while extracting values: {e}. Check that your annotated and unannotated input files correspond correctly.")
459 continue 298 continue
460 299 _, resolved_taxon_long = resolve_tax_majority(read_taxa, uncertain_threshold)
300 source = source_list[i][0] if source_list[i] else "N/A"
301 seq_id = seq_id_list[i][0] if seq_id_list[i] else "N/A"
461 header = headers[i] if i < len(headers) else f"Header missing" 302 header = headers[i] if i < len(headers) else f"Header missing"
462 303
463 # Extract count 304 # Extract count
464 try: 305 try:
465 header_base, count_str = header.rsplit("(", 1) 306 header_base, count_str = header.rsplit("(", 1)
466 count = int(count_str.rstrip(")")) 307 count = int(count_str.rstrip(")"))
467 except ValueError as e: 308 except ValueError as e:
468 print(f'Failed extracting count: {e}') 309 log_message(log_messages, f'Failed extracting count: {e}')
469 header_base = header 310 header_base = header
470 count = 1 311 count = 1
471 312
472 report_lines.append( 313 report_lines.append(f'{header_base}\t{seq_id}\t{source}\t{count}\t{resolved_taxon_long}')
473 f'{header_base}\t{e_val}\t{identity}\t{coverage}\t{bitscore}\t{count}\t{source}\t{resolved_taxon_long}') 314 # temp path is needed to write to a xlsx from galaxy
474 # Create temporary TSV for processing
475 temp_tsv_path = 'temp.tsv' 315 temp_tsv_path = 'temp.tsv'
476 try: 316 try:
477 with open(temp_tsv_path, 'w') as f: 317 with open(temp_tsv_path, 'w') as f:
478 f.write('header\te_value\tidentity percentage\tcoverage\tbitscore\tcount\tsource\ttaxa\n') 318 f.write('header\tseq_id\tsource\tcount\ttaxa\n')
479 f.write("\n".join(report_lines)) 319 f.write("\n".join(report_lines))
480 except PermissionError as e: 320 except PermissionError as e:
481 print(f"Unable to write to file, error: {e} file might be opened") 321 log_message(log_messages, f"Unable to write to file, error: {e} file might be opened")
482 322
483 # Process the data
484 df = pd.read_csv(temp_tsv_path, sep='\t', encoding="latin1") 323 df = pd.read_csv(temp_tsv_path, sep='\t', encoding="latin1")
485 if not df.empty: 324 if not df.empty:
486 # Split taxa into separate columns
487 taxa_split = df["taxa"].str.split(" / ", expand=True) 325 taxa_split = df["taxa"].str.split(" / ", expand=True)
488 max_levels = taxa_split.shape[1] if taxa_split.shape[1] is not None else 7 326 max_levels = taxa_split.shape[1] if taxa_split.shape[1] is not None else 7
489 level_names = ["kingdom", "phylum", "class", "order", "family", "genus", "species"][:max_levels] 327 level_names = ["kingdom", "phylum", "class", "order", "family", "genus", "species"][:max_levels]
490 taxa_split.columns = level_names 328 taxa_split.columns = level_names
491 329
492 # Add taxa columns to main dataframe
493 df_individual = pd.concat([df, taxa_split], axis=1) 330 df_individual = pd.concat([df, taxa_split], axis=1)
494 df_individual = df_individual.sort_values(['species', 'genus', 'family'], ascending=[True, True, True]) 331 df_individual = df_individual.sort_values(['species', 'genus', 'family'], ascending=[True, True, True])
495 # Create merged dataframe - first add taxa columns, then aggregate
496 df_for_merge = df_individual.copy() 332 df_for_merge = df_individual.copy()
497 333
498 group_cols = ["taxa"] 334 group_cols = ["taxa"]
499 agg_dict = { 335 agg_dict = {"seq_id": lambda x: list_to_string(list(x.unique())), "count": "sum",
500 "e_value": lambda x: f"{x.min()}–{x.max()}", 336 "source": lambda x: list_to_string(list(x.unique())), }
501 "identity percentage": lambda x: list_to_string(list(x.unique())), 337
502 "coverage": lambda x: list_to_string(list(x.unique())),
503 "bitscore": lambda x: list_to_string(list(x.unique())),
504 "count": "sum",
505 "source": "first"
506 }
507
508 # Add aggregation for taxa levels that actually exist
509 for level in level_names: 338 for level in level_names:
510 if level in df_for_merge.columns: 339 if level in df_for_merge.columns:
511 agg_dict[level] = "first" 340 agg_dict[level] = "first"
512 341
513 df_merged = df_for_merge.groupby(group_cols, as_index=False).agg(agg_dict) 342 df_merged = df_for_merge.groupby(group_cols, as_index=False).agg(agg_dict)
514 343
515 sort_columns = [] 344 sort_columns = []
516 sort_ascending = [] 345 sort_ascending = []
517 346
518 # Add family, genus, species if they exist
519 for level in ['species', 'genus', 'family']: 347 for level in ['species', 'genus', 'family']:
520 if level in df_merged.columns: 348 if level in df_merged.columns:
521 sort_columns.append(level) 349 sort_columns.append(level)
522 sort_ascending.append(True) 350 sort_ascending.append(True)
523 351
524 df_merged = df_merged.sort_values(sort_columns, ascending=sort_ascending) 352 df_merged = df_merged.sort_values(sort_columns, ascending=sort_ascending)
525 353
526 # Write both datasets to single Excel file with multiple sheets
527 temp_path = output_path + ".xlsx" 354 temp_path = output_path + ".xlsx"
528 os.makedirs(os.path.dirname(temp_path), exist_ok=True) 355 os.makedirs(os.path.dirname(temp_path), exist_ok=True)
529 with pd.ExcelWriter(temp_path, engine='openpyxl', mode='w') as writer: 356 with pd.ExcelWriter(temp_path, engine='openpyxl', mode='w') as writer:
530 df_individual.to_excel(writer, sheet_name='Individual_Reads', index=False) 357 df_individual.to_excel(writer, sheet_name='Individual_Reads', index=False)
531 df_merged.to_excel(writer, sheet_name='Merged_by_Taxa', index=False) 358 df_merged.to_excel(writer, sheet_name='Merged_by_Taxa', index=False)
532 359
533 # Auto-adjust column widths for both sheets
534 for sheet_name in writer.sheets: 360 for sheet_name in writer.sheets:
535 worksheet = writer.sheets[sheet_name] 361 worksheet = writer.sheets[sheet_name]
536 for column in worksheet.columns: 362 for column in worksheet.columns:
537 max_length = 0 363 max_length = 0
538 column_letter = column[0].column_letter 364 column_letter = column[0].column_letter
539 for cell in column: 365 for cell in column:
540 try: 366 try:
541 if len(str(cell.value)) > max_length: 367 if len(str(cell.value)) > max_length:
542 max_length = len(str(cell.value)) 368 max_length = len(str(cell.value))
543 except: 369 except AttributeError as e:
370 log_message(log_messages, f"Error {e}, {cell} has no value, something probably went wrong")
544 pass 371 pass
545 adjusted_width = min(max_length + 2, 50) # Cap at 50 characters 372 adjusted_width = min(max_length + 2, 50)
546 worksheet.column_dimensions[column_letter].width = adjusted_width 373 worksheet.column_dimensions[column_letter].width = adjusted_width
547 os.replace(temp_path, output_path) 374 os.replace(temp_path, output_path)
548 # Clean up temporary file 375 # Clean up temporary file
549 os.remove(temp_tsv_path) 376 os.remove(temp_tsv_path)
550 else: 377 else:
551 print("Dataframe empty, no annotation results") 378 log_message(log_messages, "Dataframe empty, no annotation results")
552 379
553 380
554 def create_circle_diagram(taxa_dicts, output_path, use_counts, uncertain_threshold): 381 def create_circle_data(taxa_dicts, output_path, use_counts, uncertain_threshold):
555 """ 382 """
556 Generate hierarchical JSON data for a circular taxonomy diagram. 383 Generate circular taxonomy layer data.
557 384
558 This function aggregates taxonomic classification results from multiple reads 385 The function converts resolved taxonomy per read into hierarchical
559 and prepares hierarchical count data suitable for circular visualization. 386 cumulative counts that can be visualized as concentric circles.
560 387
561 :param taxa_dicts: A list of dictionaries, where each dictionary represents a 388 :param taxa_dicts: Per-read taxonomy assignments.
562 mapping from full taxonomic paths to their corresponding counts for a single 389 :type taxa_dicts: list[dict]
563 read. Each key should be a taxonomic path in the format 390 :param output_path: JSON destination file.
564 :type taxa_dicts: list[dict[str, int]]
565 :param output_path: File path to write the generated JSON output containing
566 circle diagram data.
567 :type output_path: str 391 :type output_path: str
568 :param use_counts: Whether to aggregate counts by total read frequency 392 :param use_counts: Whether to count total reads or unique taxa.
569 (``True``) or to count only unique taxa once (``False``). 393 :type use_counts: bool
570 :type use_counts: boolean 394 :param uncertain_threshold: Majority resolution threshold.
571 :param uncertain_threshold: Fraction (between 0 and 1) representing the minimum
572 majority threshold used for resolving conflicting taxonomic assignments.
573 :type uncertain_threshold: float 395 :type uncertain_threshold: float
574 396 :return: None
575 :return: None. The resulting JSON file will be written to ``output_path``. The
576 file contains a list of dictionaries, one per taxonomic level, with:
577 - **labels** (*list[str]*): Names of taxa at that level.
578 - **sizes** (*list[int]*): Corresponding aggregated counts.
579 :rtype: None 397 :rtype: None
580
581 :note:
582 - Ambiguous classifications are labeled as "Uncertain taxa".
583 - The number of hierarchical levels is determined by the length of
584 ``TAXONOMIC_LEVELS``.
585 """ 398 """
586 aggregated_counts = defaultdict(int) 399 aggregated_counts = defaultdict(int)
587 seen_taxa = set() 400 seen_taxa = set()
588 401
589 for read_taxa in taxa_dicts: 402 for read_taxa in taxa_dicts:
590 _, resolved_taxon_long = resolve_taxon_majority(read_taxa, uncertain_threshold) 403 if not read_taxa:
404 continue
405 _, resolved_taxon_long = resolve_tax_majority(read_taxa, uncertain_threshold)
591 406
592 levels = resolved_taxon_long.split(" / ") 407 levels = resolved_taxon_long.split(" / ")
593 408
594 if use_counts: 409 if use_counts:
595 for i in range(1, len(levels) + 1): 410 for i in range(1, len(levels) + 1):
598 if resolved_taxon_long not in seen_taxa: 413 if resolved_taxon_long not in seen_taxa:
599 for i in range(1, len(levels) + 1): 414 for i in range(1, len(levels) + 1):
600 aggregated_counts[" / ".join(levels[:i])] += 1 415 aggregated_counts[" / ".join(levels[:i])] += 1
601 seen_taxa.add(resolved_taxon_long) 416 seen_taxa.add(resolved_taxon_long)
602 417
603 # Prepare circle data
604 circle_data = [] 418 circle_data = []
605 for level in range(len(TAXONOMIC_LEVELS)): 419 for level in range(len(TAXONOMIC_LEVELS)):
606 circle_data.append({"labels": [], "sizes": []}) 420 circle_data.append({"labels": [], "sizes": []})
607 421
608 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]): 422 for taxonomy, count in sorted(aggregated_counts.items(), key=lambda x: x[0]):
612 circle_data[len(levels) - 1]["sizes"].append(count) 426 circle_data[len(levels) - 1]["sizes"].append(count)
613 with open(output_path, "w") as f: 427 with open(output_path, "w") as f:
614 json.dump(circle_data, f, indent=2) 428 json.dump(circle_data, f, indent=2)
615 429
616 430
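A short sketch of how the resulting file could be inspected. The path 'circle_data.json' is a placeholder, and the sketch assumes TAXONOMIC_LEVELS is in scope; each entry in the JSON list holds the labels and sizes for one taxonomic depth.

import json

with open("circle_data.json") as f:          # hypothetical output path
    circle_data = json.load(f)

for rank, ring in zip(TAXONOMIC_LEVELS, circle_data):
    # ring is {"labels": [...], "sizes": [...]} for that depth
    print(rank, len(ring["labels"]), sum(ring["sizes"]))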
617 def process_single_file(anno_file_path, unanno_file_path, args): 431 def read_unannotated_fasta(unanno_file_path, ignore_illuminapairend_type, ignore_obiclean_type, min_support,
618 """ 432 filtered_fasta_path, log_messages):
619 Process a single annotated BLAST file and generate all requested analytical outputs. 433 """
620 434 Read, filter and optionally rewrite an unannotated FASTA file.
621 This function integrates multiple steps for analyzing annotated BLAST output 435
622 files and corresponding unannotated FASTA sequences. It organizes BLAST hits 436 This function extracts dereplicated sequence counts, applies header filters
623 per query, extracts the best-scoring matches, and produces various types of 437 and a minimum support threshold, collects statistics and, if requested,
624 output files (plots, tables, diagrams, and statistics) according to user 438 writes a filtered FASTA file for downstream processing.
625 arguments. 439
626 440 :param unanno_file_path: Path to FASTA file.
627 :param anno_file_path: Path to the annotated BLAST results file. 441 :type unanno_file_path: str
442 :param ignore_illuminapairend_type: Illumina pair-end status to remove.
443 :type ignore_illuminapairend_type: str
444 :param ignore_obiclean_type: Obiclean label to remove.
445 :type ignore_obiclean_type: str
446 :param min_support: Minimum dereplicated read count retained.
447 :type min_support: int
448 :param filtered_fasta_path: Optional path for filtered FASTA output.
449 :type filtered_fasta_path: str | None
450 :param log_messages: Log collection list.
451 :type log_messages: list[str]
452 :return: (headers, total_unique_count, invalid_count)
453 :rtype: tuple[list[str], int, int]
454 """
455 headers = []
456 total_unique_count = 0
457 invalid_count = 0
458
459 total_headers = 0
460 low_support_count = 0
461 header_filter_count = 0
462 written_fasta_count = 0
463
464 if not os.path.exists(unanno_file_path):
465 log_message(log_messages, f"Warning: Unannotated file {unanno_file_path} not found")
466 return headers, total_unique_count, invalid_count
467
468 write_fasta = filtered_fasta_path is not None
469 filtered_fasta = None
470 if write_fasta:
471 try:
472 filtered_fasta = open(filtered_fasta_path, "w")
473 except OSError as e:
474 log_message(log_messages, f"Error: Cannot open {filtered_fasta_path} for writing: {e}")
475 write_fasta = False
476
477 write_flag = False
478
479 with open(unanno_file_path, 'r') as f:
480 for line in f:
481 if line.startswith('>'):
482 total_headers += 1
483 header_line = line.rstrip("\n")
484 header = header_line.split()[0].strip('>')
485 if "count=" in header_line:
486 count = int(header_line.split("count=")[1].split(";")[0])
487 else:
488 count = 0
489
490 passes_header_filter = check_header_string(header_line, ignore_illuminapairend_type,
491 ignore_obiclean_type)
492
493 if passes_header_filter and int(count) >= int(min_support):
494 headers.append(header)
495 total_unique_count += count
496
497 write_flag = True
498 if write_fasta:
499 filtered_fasta.write(line)
500 written_fasta_count += 1
501 else:
502 # Invalid header (either header filter or low support)
503 if not passes_header_filter:
504 header_filter_count += 1
505 elif int(count) < int(min_support):
506 low_support_count += 1
507
508 invalid_count += 1
509 write_flag = False
510 else:
511 if write_flag and write_fasta:
512 filtered_fasta.write(line)
513
514 if write_fasta and filtered_fasta is not None:
515 filtered_fasta.close()
516 log_message(log_messages, f"Filtered FASTA written succesfully"
517 f"({written_fasta_count} sequences)")
518
519 log_message(log_messages, f"FASTA: total headers: {total_headers}")
520 log_message(log_messages, f"FASTA: headers kept after filters and min_support={min_support}: {len(headers)}")
521 log_message(log_messages, f"FASTA: removed due to header filters (illumina/obiclean/etc.): {header_filter_count}")
522 log_message(log_messages, f"FASTA: removed due to low dereplicated count (<{min_support}): {low_support_count}")
523 log_message(log_messages, f"FASTA: total invalid (header filter + low support): {invalid_count}")
524
525 return headers, total_unique_count, invalid_count
526
527
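To illustrate what the header parsing above expects (the attribute layout is an assumption modeled on obitools-style dereplicated FASTA headers, not taken from the changeset):

header_line = ">read_0001 count=12; obiclean_head=True;"   # invented example header
header = header_line.split()[0].strip('>')                 # -> 'read_0001'
count = int(header_line.split("count=")[1].split(";")[0])  # -> 12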
528 def parse_blast_output(anno_file_path, eval_threshold, ignore_taxonomy, ignore_rank, min_coverage, min_identity,
529 min_bitscore, ignore_seqids, log_messages):
530 """
531 Parse a BLAST tabular file and retain high-quality hits.
532
533 This function reads BLAST annotations, applies E-value and quality filters,
534 excludes invalid taxa, groups remaining hits by query identifier, and logs
535 summary statistics.
536
537 :param anno_file_path: Input BLAST result file.
628 :type anno_file_path: str 538 :type anno_file_path: str
629 :param unanno_file_path: Path to a FASTA file containing unannotated sequences. 539 :param eval_threshold: Maximum allowed E-value.
630 Used to determine sequence order and total unique sequence counts. 540 :type eval_threshold: float
541 :param ignore_taxonomy: Taxonomy substring filter.
542 :type ignore_taxonomy: str
543 :param ignore_rank: Rank substring filter.
544 :type ignore_rank: str
545 :param min_coverage: Minimum coverage threshold.
546 :type min_coverage: int
547 :param min_identity: Minimum identity threshold.
548 :type min_identity: int
549 :param min_bitscore: Minimum bitscore threshold.
550 :type min_bitscore: int
551 :param ignore_seqids: Comma-separated seqids to ignore.
552 :type ignore_seqids: str
553 :param log_messages: Log collection list.
554 :type log_messages: list[str]
555 :return: Mapping of q_id → hits
556 :rtype: OrderedDict[str, list[dict]]
557 """
558 if not os.path.exists(anno_file_path):
559 log_message(log_messages, f"Error: Input file {anno_file_path} not found")
560 return OrderedDict()
561
562 blast_groups = OrderedDict()
563
564 total_hits = 0
565 kept_hits = 0
566 filtered_hits = 0
567 invalid_taxon_count = 0
568 ignored_seqid_count = 0
569
570 try:
571 with open(anno_file_path, 'r') as f:
572 for line in f:
573 if line.startswith("#"):
574 continue
575
576 parts = line.strip().split('\t')
577 if len(parts) < 7:
578 log_message(log_messages, f"less than 7 parts in line, skipped faulty line: {line}")
579 continue
580
581 q_id = parts[0]
582 total_hits += 1
583
584 try:
585 e_val = float(parts[6])
586 except ValueError as e:
587 log_message(log_messages, f"Error {e}, while extracting evalue")
588 filtered_hits += 1
589 continue
590
591 taxon = parts[-1].strip()
592 if is_invalid_taxon(taxon, ignore_taxonomy, ignore_rank):
593 invalid_taxon_count += 1
594 filtered_hits += 1
595 continue
596
597 seq_id = parts[2] if len(parts) > 2 else ''
598 ignore_seqid_set = set(parse_list_param(ignore_seqids))
599 if seq_id in ignore_seqid_set:
600 ignored_seqid_count += 1
601 filtered_hits += 1
602 continue
603
604 identity = parts[4] if len(parts) > 4 else ''
605 cov = parts[5] if len(parts) > 5 else ''
606 bitscore = float(parts[7]) if len(parts) > 7 and parts[7] else 0.0
607 source = parts[8] if len(parts) > 8 else ''
608
609 hit = {'e_val': e_val, 'identity': identity, 'cov': cov, 'bitscore': bitscore, 'source': source,
610 'taxon': taxon, 'seq_id': seq_id, }
611
612 if not is_insufficient_hit(hit, min_coverage, min_identity, min_bitscore, eval_threshold):
613 blast_groups.setdefault(q_id, []).append(hit)
614 kept_hits += 1
615 else:
616 filtered_hits += 1
617 except Exception as e:
618 log_message(log_messages, f"Error reading BLAST file {anno_file_path}: {e}")
619 return OrderedDict()
620
621 log_message(log_messages, f"BLAST: total hits read: {total_hits}")
622 log_message(log_messages, f"BLAST: hits kept after quality filters: {kept_hits}")
623 log_message(log_messages, f"BLAST: hits filtered (evalue/coverage/identity/bitscore): {filtered_hits}")
624 log_message(log_messages, f"BLAST: hits removed due to invalid taxon: {invalid_taxon_count}")
625 log_message(log_messages, f"BLAST: hits removed due to ignored seqids: {ignored_seqid_count}")
626
627 return blast_groups
628
629
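A schematic of the column positions parse_blast_output reads from each tabular line; the field values are invented, and columns 1 and 3 are not used by the parser.

line = "read_0001\tNA\tAB123456\tNA\t98.5\t100\t1e-30\t250\tNCBI\tAnimalia / Chordata / Actinopterygii"
parts = line.strip().split('\t')
q_id, seq_id = parts[0], parts[2]              # query id and subject sequence id
identity, cov = parts[4], parts[5]             # percent identity and coverage
e_val, bitscore = float(parts[6]), float(parts[7])
source, taxon = parts[8], parts[-1]            # database source and taxonomy path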
630 def process_single_query(header, hits, bitscore_perc_cutoff):
631 """
632 Process all BLAST hits for a single query/header and return a summary dict.
633 """
634 current_read_info = {'taxa_dict': {}, 'header': header, 'e_values': set(), 'identities': set(), 'coverages': set(),
635 'sources': set(), 'bitscores': set(), 'seq_id': set(), 'best_taxa_dict': {}}
636
637 max_bitscore = max(h['bitscore'] for h in hits)
638 if bitscore_perc_cutoff > 0:
639 top_bitscore = float(max_bitscore) * (1 - (float(bitscore_perc_cutoff) / 100.0))
640 else:
641 top_bitscore = float(max_bitscore)
642 current_read_info['best_taxa_dict'] = defaultdict(int)
643
644 for hit in hits:
645 e_val = hit['e_val']
646 identity = hit['identity']
647 cov = hit['cov']
648 bitscore = hit['bitscore']
649 source = hit['source']
650 taxon = hit['taxon']
651 seq_id = hit['seq_id']
652
653 # Keep track of hits that pass bitscore threshold
654 if bitscore >= top_bitscore:
655 current_read_info['best_taxa_dict'][taxon] += 1
656 current_read_info['e_values'].add(str(e_val))
657 current_read_info['identities'].add(identity)
658 current_read_info['coverages'].add(cov)
659 current_read_info['sources'].add(source)
660 current_read_info['bitscores'].add(bitscore)
661 current_read_info['seq_id'].add(seq_id)
662
663 return current_read_info
664
665
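A worked example of the bitscore window applied in process_single_query (values invented): with the default cutoff of 8, only hits within 8% of the best bitscore contribute to best_taxa_dict.

max_bitscore = 250.0
bitscore_perc_cutoff = 8.0
top_bitscore = max_bitscore * (1 - bitscore_perc_cutoff / 100.0)  # 230.0
# Hits with bitscore >= 230.0 are counted; weaker hits are ignored.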
666 def process_single_file(anno_file_path, unanno_file_path, args, log_messages):
667 """
668 Execute the complete BLAST annotation processing workflow.
669
670 This function performs FASTA parsing, BLAST parsing, taxonomy resolution,
671 per-read and aggregated reporting, summary statistics computation, and
672 generation of optional plot and table outputs.
673
674 All validations, errors and counters are written into the log collector.
675
676 :param anno_file_path: BLAST annotation file.
677 :type anno_file_path: str
678 :param unanno_file_path: Input FASTA file.
631 :type unanno_file_path: str 679 :type unanno_file_path: str
632 :param args: Parsed command-line arguments containing the following attributes: 680 :param args: Parsed arguments namespace.
633 - **eval_threshold** (*float*): Maximum allowed E-value for valid hits.
634 - **eval_plot** (*str | None*): Output path for E-value distribution plot.
635 - **taxa_output** (*str | None*): Output path for taxonomic summary table.
636 - **header_anno** (*str | None*): Output path for annotated header table.
637 - **circle_data** (*str | None*): Output path for circular diagram JSON data.
638 - **anno_stats** (*str | None*): Output path for annotation statistics file.
639 - **use_counts** (*bool*): Whether to use read counts or unique taxa only.
640 - **uncertain_threshold** (*float*): Majority threshold for resolving taxonomic ambiguity.
641 :type args: argparse.Namespace 681 :type args: argparse.Namespace
642 682 :param log_messages: Log collection list.
643 :return: None. This function writes multiple output files to the paths 683 :type log_messages: list[str]
644 specified in ``args`` 684 :return: None
645 :rtype: None 685 :rtype: None
646 686 """
647 :raises FileNotFoundError: If the annotated BLAST or unannotated FASTA file cannot be found. 687 log_message(log_messages, f"Starting processing for FASTA")
648 :raises ValueError: If file content is malformed or missing required columns. 688 log_message(log_messages, "=== PARAMETERS USED ===")
649 689
650 :notes: 690 skip_keys = {
651 - The function first groups BLAST hits by query ID and filters out invalid or 691 "input_anno",
652 low-confidence taxa. 692 "input_unanno",
653 - For each query, the best BLAST hit is determined based on bitscore and E-value. 693 "eval_plot",
654 - The total number of unique annotated sequences is inferred from sequence 694 "taxa_output",
655 headers containing a ``count=`` field. 695 "circle_data",
656 - Progress and warnings are printed to standard output for traceability. 696 "header_anno",
657 """ 697 "log",
658 # Core data structures - one per read 698 "filtered_fasta",
659 read_data = [] # List of dicts, one per read 699 }
700
701 for key, value in vars(args).items():
702 if key in skip_keys:
703 continue
704 log_message(log_messages, f"{key}: {value}")
705
706 log_message(log_messages, "=== END PARAMETERS ===")
707 unanno_headers_ordered, total_unique_count, invalid_count = read_unannotated_fasta(unanno_file_path,
708 args.ignore_illuminapairend_type, args.ignore_obiclean_type, args.min_support, args.filtered_fasta,
709 log_messages)
710
711 unanno_set = set(unanno_headers_ordered)
712
713 if not os.path.exists(anno_file_path):
714 log_message(log_messages, f"Error: Input file {anno_file_path} not found")
715 with open(args.log, 'w') as f:
716 print('this is still not going right')
717 f.write("\n".join(log_messages))
718 return
719
720 log_message(log_messages, f"Reading BLAST annotations")
721 blast_groups = parse_blast_output(anno_file_path, args.eval_threshold, args.ignore_taxonomy, args.ignore_rank,
722 args.min_coverage, args.min_identity, args.min_bitscore, args.ignore_seqids, log_messages)
723
724 extra_blast_qids = [q for q in blast_groups.keys() if q not in unanno_set]
725 if extra_blast_qids:
726 log_message(log_messages,
727 f"Note: {len(extra_blast_qids)} BLAST q_ids not in FASTA (showing up to 10): {extra_blast_qids[:10]}")
728
729 read_data = []
660 headers = [] 730 headers = []
661 seen_reads = set() 731 seen_reads = set()
662 annotated_unique_count = 0 732 annotated_unique_count = 0
663 total_unique_count = 0 733
664 734 total_headers_for_annotation = len(unanno_headers_ordered)
665 # Current read processing 735 reads_with_hits = 0
666 current_q_id = '' 736
667 current_read_info = None 737 for header in unanno_headers_ordered:
668 738 if header not in blast_groups:
669 # Check if file exists and has content 739 continue
670 if not os.path.exists(anno_file_path): 740 hits = blast_groups[header]
671 print(f"Error: Input file {anno_file_path} not found") 741 current_read_info = process_single_query(header, hits, args.bitscore_perc_cutoff)
672 return 742 read_data.append(current_read_info)
673 743 reads_with_hits += 1
674 unanno_headers_ordered = [] 744
675 if os.path.exists(unanno_file_path): 745 if header not in seen_reads:
676 with open(unanno_file_path, 'r') as f: 746 seen_reads.add(header)
677 for line in f: 747 headers.append(header)
678 if line.startswith('>'): 748 try:
679 header = line.split()[0].strip('>') 749 count = int(header.split('(')[1].split(')')[0])
680 unanno_headers_ordered.append(header) 750 except (IndexError, ValueError, AttributeError) as e:
681 print(f"Found {len(unanno_headers_ordered)} headers in unannotated file") 751 log_message(log_messages, f"Error {e}: could not parse count in header: {header}")
682 else: 752 count = 0
683 print(f"Warning: Unannotated file {unanno_file_path} not found") 753 annotated_unique_count += count
684 754
685 # Counter to track which read we're currently expecting 755 reads_without_hits = total_headers_for_annotation - reads_with_hits
686 unanno_set = set(unanno_headers_ordered) 756 log_message(log_messages, f"ANNOTATION: total FASTA headers considered: {total_headers_for_annotation}")
687 read_counter = 0 757 log_message(log_messages, f"ANNOTATION: reads with BLAST hits: {reads_with_hits}")
688 758 log_message(log_messages, f"ANNOTATION: reads without BLAST hits: {reads_without_hits}")
689 # Count total unique sequences from unannotated file 759 log_message(log_messages, f"ANNOTATION: unique annotated count (from header counts): {annotated_unique_count}")
690 if os.path.exists(unanno_file_path): 760 log_message(log_messages, f"ANNOTATION: total unique count (from FASTA): {total_unique_count}")
691 with open(unanno_file_path, 'r') as f: 761
692 for line in f:
693 if "count=" in line:
694 count = int(line.split("count=")[1].split(";")[0])
695 total_unique_count += count
696
697 # Process annotated file
698 from collections import OrderedDict
699 blast_groups = OrderedDict() # q_id -> list of hit dicts
700
701 # Group BLAST hits per q_id
702 try:
703 with open(anno_file_path, 'r') as f:
704 for line in f:
705 if line.startswith("#"):
706 continue
707 parts = line.strip().split('\t')
708 if len(parts) < 7:
709 continue
710
711 q_id = parts[0]
712 try:
713 e_val = float(parts[6])
714 except Exception:
715 continue
716 if e_val > args.eval_threshold:
717 continue
718 valid_taxa = True
719 taxon = parts[-1].strip()
720 if is_invalid_taxon(taxon):
721 print(f"Skipping {q_id}: invalid taxon: {taxon}")
722 valid_taxa = False
723
724 identity = parts[4] if len(parts) > 4 else ''
725 cov = parts[5] if len(parts) > 5 else ''
726 bitscore = float(parts[7] if len(parts) > 7 else '')
727 source = parts[8] if len(parts) > 8 else ''
728 if valid_taxa:
729 hit = {
730 'e_val': e_val,
731 'identity': identity,
732 'cov': cov,
733 'bitscore': bitscore,
734 'source': source,
735 'taxon': taxon
736 }
737 blast_groups.setdefault(q_id, []).append(hit)
738 except Exception as e:
739 print(f"Error reading BLAST file {anno_file_path}: {e}")
740 return
741
742 # BLAST-reads not in fasta
743 extra_blast_qids = [q for q in blast_groups.keys() if q not in unanno_set]
744 if extra_blast_qids:
745 print(f"Note: {len(extra_blast_qids)} BLAST q_ids not in FASTA: {extra_blast_qids[:10]}...")
746
747 # Process fasta order
748 for header in unanno_headers_ordered:
749 if header not in blast_groups:
750 print(f"Skipping {header}: no BLAST hits")
751 continue
752 current_read_info = {
753 'taxa_dict': {},
754 'header': header,
755 'e_values': set(),
756 'identities': set(),
757 'coverages': set(),
758 'sources': set(),
759 'bitscores': set(),
760 'best_eval': float(),
761 'best_identity': '',
762 'best_coverage': '',
763 'best_source': '',
764 'best_bitscore': -1,
765 'best_taxa_dict' : {}
766 }
767
768 # process all hits for this header
769 for hit in blast_groups[header]:
770 e_val = hit['e_val']
771 identity = hit['identity']
772 cov = hit['cov']
773 bitscore = hit['bitscore']
774 source = hit['source']
775 taxon = hit['taxon']
776
777 current_read_info['e_values'].add(str(e_val))
778 current_read_info['identities'].add(identity)
779 current_read_info['coverages'].add(cov)
780 current_read_info['sources'].add(source)
781 current_read_info['bitscores'].add(bitscore)
782
783 # Keep track of best hit
784 if (bitscore > current_read_info['best_bitscore'] or
785 (bitscore == current_read_info['best_bitscore'] and e_val < current_read_info['best_eval'])):
786 # New best set → reset
787 current_read_info['best_bitscore'] = bitscore
788 current_read_info['best_eval'] = e_val
789 current_read_info['best_identity'] = identity
790 current_read_info['best_coverage'] = cov
791 current_read_info['best_source'] = source
792 current_read_info['best_taxa_dict'] = {taxon: 1}
793
794 # If hit is of same quality: add to existing dictionary
795 elif bitscore == current_read_info['best_bitscore'] and e_val == current_read_info['best_eval']:
796 td = current_read_info['best_taxa_dict']
797 td[taxon] = td.get(taxon, 0) + 1
798 # Might be unnecessary, best_taxa_dict seems to be the same as taxa_dict
799 # in output. Too scared to remove.
800 if len(current_read_info['e_values']) == 1:
801 taxa_dict = current_read_info['taxa_dict']
802 taxa_dict[taxon] = taxa_dict.get(taxon, 0) + 1
803 read_data.append(current_read_info)
804
805 if header not in seen_reads:
806 seen_reads.add(header)
807 headers.append(header)
808 try:
809 count = int(header.split('(')[1].split(')')[0])
810 except IndexError or ValueError or AttributeError:
811 count = 0
812 annotated_unique_count += count
813
814 # Extract data for different functions from unified structure
815 taxa_dicts = [read['taxa_dict'] for read in read_data] 762 taxa_dicts = [read['taxa_dict'] for read in read_data]
816 td = [read['best_taxa_dict'] for read in read_data] 763 td = [read['best_taxa_dict'] for read in read_data]
817 # Generate outputs based on arguments 764
818 if args.eval_plot: 765 if args.eval_plot:
819 e_val_sets = [read['e_values'] for read in read_data] 766 e_val_sets = [read['e_values'] for read in read_data]
820 make_eval_plot(e_val_sets, args.eval_plot) 767 make_eval_plot(e_val_sets, args.eval_plot, log_messages)
768 log_message(log_messages, "E-value plot written successfully")
821 769
822 if args.taxa_output: 770 if args.taxa_output:
823 process_taxa_output(td, args.taxa_output, args.uncertain_threshold) 771 process_taxa_output(td, args.taxa_output, args.uncertain_threshold)
772 log_message(log_messages, "Taxa summary written successfully")
824 773
825 if args.header_anno: 774 if args.header_anno:
826 # Extract best values efficiently - single list comprehension per metric 775 source_for_headers = [[read['sources']] for read in read_data]
827 e_values_for_headers = [[read['best_eval']] for read in read_data] 776 seq_id_for_headers = [[read['seq_id']] for read in read_data]
828 identity_for_headers = [[read['best_identity']] for read in read_data] 777 process_header_annotations(td, headers, args.header_anno, args.uncertain_threshold, source_for_headers,
829 coverage_for_headers = [[read['best_coverage']] for read in read_data] 778 seq_id_for_headers, log_messages)
830 source_for_headers = [[read['best_source']] for read in read_data] 779 log_message(log_messages, "Header annotations written successfully")
831 bitscore_for_headers= [[read['best_bitscore']] for read in read_data]
832 process_header_annotations(td, headers, e_values_for_headers,
833 args.header_anno, args.uncertain_threshold,
834 identity_for_headers, coverage_for_headers, source_for_headers, bitscore_for_headers)
835 780
836 if args.circle_data: 781 if args.circle_data:
837 create_circle_diagram(td, args.circle_data, args.use_counts, 782 create_circle_data(td, args.circle_data, args.use_counts, args.uncertain_threshold)
838 args.uncertain_threshold) 783 log_message(log_messages, "Circle diagram JSON written successfully")
839 784
840 if args.anno_stats: 785 stats = calculate_annotation_stats(len(taxa_dicts), unanno_file_path, annotated_unique_count,
841 stats = calculate_annotation_stats(len(taxa_dicts), unanno_file_path, 786 int(total_unique_count))
842 annotated_unique_count, total_unique_count) 787
843 with open(args.anno_stats, 'w') as f: 788 log_message(log_messages, "=== ANNOTATION STATISTICS ===")
844 f.write('metric\tvalue\n') 789 for key, value in stats.items():
845 for key, value in stats.items(): 790 log_message(log_messages, f"{key}: {value}")
846 f.write(f'{key}\t{value}\n') 791
847 792 with open(args.log, 'w') as f:
848 def is_invalid_taxon(taxon: str) -> bool: 793 f.write("\n".join(log_messages))
794
795
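`log_message` is called throughout this hunk but defined outside the compared region; a minimal sketch of what such a helper could look like, assuming it simply appends to the in-memory list that is later joined and written to the --log file above (the stdout echo is an assumption, not taken from the source):

def log_message(log_messages, message, echo=True):
    # Sketch only: the real helper is defined elsewhere in this file.
    # Collect the message for the --log output and optionally echo it to stdout.
    log_messages.append(message)
    if echo:
        print(message)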
796 def check_header_string(header_line, invalid_header_string, invalid_line_string):
797 """
798 Check the header line against the configured filter strings; return False if any of them is found (the read is filtered out), True otherwise.
799 """
800 invalid_string_list = parse_list_param(invalid_header_string) + parse_list_param(invalid_line_string)
801 for string in invalid_string_list:
802 if string.lower() == 'singleton':
803 string = "obiclean_status={'XXX': 's'}"
804 elif string.lower() == 'variant':
805 string = "obiclean_status={'XXX': 'i'}"
806 elif string.lower() == 'head':
807 string = "obiclean_status={'XXX': 'h'}"
808 elif string.lower() == 'pairend' or string.lower() == 'paired end':
809 string = "PairEnd"
810 elif string.lower() == 'consensus' or string.lower() == 'cons':
811 string = "CONS"
812 if string in header_line:
813 return False
814 return True
815
816
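A quick trace of check_header_string: a friendly keyword such as 'singleton' is rewritten to the literal obiclean marker before the substring test, so a header carrying that marker is rejected. The header below is invented for illustration:

# Hypothetical header; only the obiclean_status fragment matters here.
header = ">read_001(42); obiclean_status={'XXX': 's'}; count=42"
keep = check_header_string(header, 'singleton', '')
# keep == False: 'singleton' maps to "obiclean_status={'XXX': 's'}", which occurs in the
# header, so the read is filtered out. Headers without any configured marker return True.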
817 def is_insufficient_hit(hit, min_cov, min_id, min_bit, min_eval):
818 """
819 Check whether a hit fails any of the quality thresholds; returns True when the hit should be discarded.
820 """
821 if float(hit['cov']) < min_cov:
822 return True
823 elif float(hit['identity']) < min_id:
824 return True
825 elif float(hit['bitscore']) < min_bit:
826 return True
827 elif float(hit['e_val']) > min_eval:  # e-values are fractional; min_eval is the maximum allowed e-value
828 return True
829 else:
830 return False
831
832
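To make the threshold semantics concrete (the values below are examples, not the tool's defaults): min_eval is an upper bound on the e-value, while the other three parameters are lower bounds:

# Illustrative hit dict matching the fields checked above; values are made up.
hit = {'cov': '97.5', 'identity': '99.2', 'bitscore': '310', 'e_val': 3e-40}
is_insufficient_hit(hit, min_cov=80.0, min_id=90.0, min_bit=50.0, min_eval=1e-10)
# -> False: every metric passes, so the hit is kept.
# A hit with an e-value above min_eval (or any metric below its minimum) returns True.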
833 def is_invalid_taxon(taxon: str, invalid_taxa, invalid_ranks) -> bool:
849 """ 834 """
850 Determine whether a given taxonomic path should be considered invalid and excluded from analysis. 835 Determine whether a given taxonomic path should be considered invalid and excluded from analysis.
851 836 """
852 This function identifies and filters out taxonomic strings that contain unreliable, 837 invalid_string_list = parse_list_param(invalid_ranks) + parse_list_param(invalid_taxa)
853 incomplete, or placeholder taxonomic information. It is used to clean BLAST or
854 classification outputs before aggregation and visualization.
855
856 :param taxon: Full taxonomic path as a string
857 :type taxon: str
858 :return: ``True`` if the taxon is considered invalid according to the criteria below;
859 otherwise ``False``.
860 :rtype: bool
861
862 :criteria:
863 A taxon is considered invalid if:
864 - It contains the substring ``"invalid taxid"`` or ``"GenBank invalid taxon"``.
865 - It includes the term ``"environmental sample"`` (non-specific environmental sequence).
866 - It has any hierarchical level labeled as ``"unknown"`` followed by a more
867 specific level (e.g., *unknown genus / Homo sapiens*).
868
869 :note:
870 This function performs a **case-insensitive** check and assumes the
871 input taxonomic path uses ``" / "`` as a level delimiter.
872 """
873 taxon_lower = taxon.lower() 838 taxon_lower = taxon.lower()
874 839 for string in invalid_string_list:
875 if "invalid taxid" in taxon_lower or "genbank invalid taxon" in taxon_lower: 840 if string in taxon_lower:
876 return True
877
878 if "environmental sample" in taxon_lower:
879 return True
880
881 # check for "unknown" followed by something deeper
882 parts = taxon.split(" / ")
883 for i, part in enumerate(parts):
884 if "unknown" in part.lower() and i < len(parts) - 1:
885 # there is something more specific after an unknown level
886 return True 841 return True
887 return False 842 return False
843
844
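The rewritten is_invalid_taxon is purely substring-based: the taxonomic path is lowercased and compared against the user-supplied invalid_taxa / invalid_ranks lists, which should therefore be given in lowercase. A worked example with made-up parameter values:

# Hypothetical taxonomic path and filter lists.
taxon = "Eukaryota / Chordata / Mammalia / unknown family / unknown genus / environmental sample"
is_invalid_taxon(taxon, invalid_taxa='environmental sample,uncultured', invalid_ranks='unknown family')
# -> True: 'environmental sample' (and 'unknown family') occur in the lowercased path.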
845 def parse_list_param(param):
846 """
847 Safely convert a comma-separated Galaxy text parameter into a list.
848 """
849 if not param:
850 return []
851 return [x.strip() for x in param.strip(', \n').split(',') if x.strip()]
888 852
889 853
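For reference, the parameter parser above drops surrounding whitespace and empty items:

parse_list_param(' singleton, head ,, variant \n')   # -> ['singleton', 'head', 'variant']
parse_list_param(None)                                # -> []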
890 def main(arg_list=None): 854 def main(arg_list=None):
891 """ 855 """
892 Entry point for Galaxy-compatible BLAST annotation processing. 856 Entry point for Galaxy-compatible BLAST annotation processing.
899 :rtype: None 863 :rtype: None
900 864
901 Notes: 865 Notes:
902 Calls `process_single_file` with parsed arguments. 866 Calls `process_single_file` with parsed arguments.
903 """ 867 """
868 log_messages = []
904 args = parse_arguments(arg_list) 869 args = parse_arguments(arg_list)
905 for output_file in [args.eval_plot, args.header_anno, args.taxa_output, args.circle_data, args.anno_stats]: 870 process_single_file(args.input_anno, args.input_unanno, args, log_messages)
906 if output_file:
907 os.makedirs(os.path.dirname(output_file), exist_ok=True)
908 process_single_file(args.input_anno, args.input_unanno, args)
909 print("Processing completed successfully") 871 print("Processing completed successfully")
910 872
911 873
912 if __name__ == "__main__": 874 if __name__ == "__main__":
913 main() 875 main()
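When exercising this entry point outside Galaxy, the CLI tokens can be passed via arg_list instead of relying on sys.argv. A schematic sketch, assuming the long-option spellings mirror the args attributes used above (e.g. --input-anno for args.input_anno); the authoritative set of required options is defined in parse_arguments:

# Schematic only: paths are hypothetical and further required options are omitted.
test_args = [
    '--input-anno', 'blast_hits.tab',
    '--input-unanno', 'reads.fasta',
    '--log', 'run.log',
    # ...remaining required options as declared in parse_arguments...
]
main(arg_list=test_args)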