diff cdhit_analysis.py @ 0:00d56396b32a draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
| field | value |
|---|---|
| author | onnodg |
| date | Tue, 14 Oct 2025 09:09:46 +0000 |
| parents | |
| children | ff68835adb2b |
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cdhit_analysis.py	Tue Oct 14 09:09:46 2025 +0000
@@ -0,0 +1,787 @@
#!/usr/bin/env python3

import argparse
import os
import re
from collections import Counter, defaultdict
from math import sqrt

import pandas as pd
import matplotlib

matplotlib.use('Agg')  # Non-interactive backend for Galaxy
import matplotlib.pyplot as plt

"""
This script processes cluster output files from cd-hit-est for use in Galaxy.
It extracts cluster information, associates taxa and e-values from annotation
files, performs statistical calculations, and generates text and plot outputs
summarizing similarity and taxonomic distributions.

Main steps:
1. Parse the cd-hit-est cluster file and the (optional) annotation file.
2. Process each cluster to extract similarity, taxa, and e-value information.
3. Aggregate results across clusters.
4. Generate the requested outputs: text summaries, plots, and Excel reports.

Note: uses a non-interactive matplotlib backend (Agg) for compatibility with Galaxy.
"""


def parse_arguments(args_list=None):
    """Parse command-line arguments for the script."""
    parser = argparse.ArgumentParser(
        description='Create taxa analysis from cd-hit cluster files')
    parser.add_argument('--input_cluster', type=str, required=True,
                        help='Input cluster file (.clstr)')
    parser.add_argument('--input_annotation', type=str, required=False,
                        help='Input annotation file (Excel workbook with an Individual_Reads sheet)')

    # Galaxy output files
    parser.add_argument('--output_similarity_txt', type=str,
                        help='Similarity text output file')
    parser.add_argument('--output_similarity_plot', type=str,
                        help='Similarity plot output file')
    parser.add_argument('--output_evalue_txt', type=str,
                        help='E-value text output file')
    parser.add_argument('--output_evalue_plot', type=str,
                        help='E-value plot output file')
    parser.add_argument('--output_count', type=str,
                        help='Count summary output file')
    parser.add_argument('--output_taxa_clusters', type=str,
                        help='Taxa per cluster output file')
    parser.add_argument('--output_taxa_processed', type=str,
                        help='Processed taxa output file')

    # Plot parameters
    parser.add_argument('--simi_plot_y_min', type=float, default=95.0,
                        help='Minimum value of the y-axis in the similarity plot')
    parser.add_argument('--simi_plot_y_max', type=float, default=100.0,
                        help='Maximum value of the y-axis in the similarity plot')

    # Uncertain taxa configuration
    parser.add_argument('--uncertain_taxa_use_ratio', type=float, default=0.5,
                        help='Ratio at which uncertain taxa count toward the correct taxa')
    parser.add_argument('--min_to_split', type=float, default=0.45,
                        help='Minimum ratio required for a taxonomic split')
    parser.add_argument('--min_count_to_split', type=int, default=10,
                        help='Minimum count for a taxonomic split')

    # Processing options
    parser.add_argument('--show_unannotated_clusters', action='store_true', default=False,
                        help='Show unannotated clusters in output')
    parser.add_argument('--make_taxa_in_cluster_split', action='store_true', default=False,
                        help='Split clusters with multiple taxa')
    parser.add_argument('--print_empty_files', action='store_true', default=False,
                        help='Print messages about empty annotation files')

    return parser.parse_args(args_list)
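# Illustrative invocation (not part of this commit; the file names are
# hypothetical). Outside Galaxy, the script can be run directly:
#
#   python cdhit_analysis.py \
#       --input_cluster sample.clstr \
#       --input_annotation sample_annotations.xlsx \
#       --output_similarity_txt similarity.tsv \
#       --output_count counts.tsv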
# Color map for plots: RGBA tuples for bar coloring
COLORMAP = [
    (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0),
    (1.0, 0.4980392156862745, 0.054901960784313725, 1.0),
    (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0),
    (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0),
    (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0),
    (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0),
    (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0),
    (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0),
    (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0),
    (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0)
]


def parse_cluster_file(cluster_file, annotation_file=None, print_empty=False, raise_on_error=False):
    """
    Parse the cd-hit-est cluster file (.clstr) and, optionally, an Excel annotation file.

    Extracts cluster information (header, read count, similarity) and associates
    taxonomic information and E-values from the annotation file based on the read header.

    :param cluster_file: Path to cd-hit cluster file (.clstr).
    :type cluster_file: str
    :param annotation_file: Path to Excel annotation file with taxa and e-values.
    :type annotation_file: str, optional
    :param print_empty: Print a message if the annotation file is not found or empty.
    :type print_empty: bool
    :param raise_on_error: Raise parsing errors instead of printing warnings.
    :type raise_on_error: bool
    :return: List of clusters, where each cluster is a dict mapping read header to a dict of read info.
    :rtype: list[dict[str, dict]]
    :raises ValueError: If similarity cannot be parsed from a cluster line.
    :raises UnboundLocalError: If an error occurs during annotation processing.
    """
    clusters = []
    current_cluster = {}

    # Load annotations if provided
    annotations = {}
    if annotation_file and os.path.exists(annotation_file):
        # Read the Excel file
        df = pd.read_excel(annotation_file, sheet_name='Individual_Reads', engine='openpyxl')

        # Iterate over the rows
        for _, row in df.iterrows():
            header = row['header']   # column name as it appears in the Excel sheet
            evalue = row['e_value']  # or whichever column name you want to use
            taxa = row['taxa']       # depends on how the taxa are stored

            annotations[header] = {'evalue': evalue, 'taxa': taxa}
    elif annotation_file and print_empty:
        print(f"Annotation file {annotation_file} not found or empty")

    with open(cluster_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>Cluster'):
                # Start of a new cluster; save the previous one if it exists
                if current_cluster:
                    clusters.append(current_cluster)
                    current_cluster = {}
            else:
                # Parse a sequence line
                parts = line.split()
                if len(parts) >= 2:
                    # Extract header and count
                    header_part = parts[2].strip('>.')
                    header_parts = header_part.split('(')
                    if len(header_parts) > 1:
                        last_part = header_parts[-1].strip(')')
                        header = header_parts[0]
                        if last_part:
                            count = int(last_part)
                        else:
                            print('no count')
                            count = 0
                            header = ''
                    # Extract similarity
                    similarity_part = parts[-1].strip()
                    if '*' in similarity_part:
                        similarity = 100.0  # Representative sequence
                    else:
                        matches = re.findall(r'[\d.]+', similarity_part)
                        if matches:
                            similarity = float(matches[-1])
                        else:
                            raise ValueError(f"Could not parse similarity from: '{similarity_part}'")
                    # Get annotation info
                    try:
                        if header in annotations:
                            taxa = annotations[header]['taxa']
                            evalue = annotations[header]['evalue']
                        else:
                            taxa = 'Unannotated read'
                            evalue = 'Unannotated read'

                        current_cluster[header] = {
                            'count': count,
                            'similarity': similarity,
                            'evalue': evalue,
                            'taxa': taxa
                        }
                    except UnboundLocalError as e:
                        # 'header' and 'count' stay unbound when the read header
                        # lacks the expected '(count)' suffix
                        if raise_on_error:
                            raise UnboundLocalError(str(e))
                        print(f"Error: {e}, No annotations found")

    # Add the last cluster
    if current_cluster:
        clusters.append(current_cluster)

    return clusters
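# For reference, a minimal sketch of the .clstr layout the parser above expects
# (read names hypothetical). The representative sequence ends with '*', member
# lines end with the alignment identity, and '(N)' carries the read count:
#
#   >Cluster 0
#   0	150nt, >read_1(5)... *
#   1	148nt, >read_2(2)... at +/98.50%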
def process_cluster_data(cluster):
    """
    Process a single cluster to extract E-value, similarity, and taxa data for all reads.

    Aggregates information from all reads in the cluster, storing read counts,
    E-values, similarities, and taxa in lists and a dictionary.

    :param cluster: Cluster data mapping read headers to read info.
    :type cluster: dict
    :return: A tuple containing: (list of E-values, list of similarity values, dict of taxa -> counts).
        The first element of the E-value list ([0]) stores the count of unannotated reads.
    :rtype: tuple[list[float | int], list[float], dict[str, int]]
    """
    eval_list = [0.0]  # First element holds the unannotated count
    simi_list = []
    taxa_dict = {'Unannotated read': 0}

    for header, info in cluster.items():
        count = info['count']
        similarity = info['similarity']
        taxa = info['taxa']
        evalue = info['evalue']

        if evalue == 'Unannotated read':
            eval_list[0] += count
        else:
            try:
                eval_val = float(evalue)
                for _ in range(count):
                    eval_list.append(eval_val)
            except ValueError:
                eval_list[0] += count

        # Count the read's taxa once; unannotated reads fall under the
        # 'Unannotated read' key initialized above
        if taxa not in taxa_dict:
            taxa_dict[taxa] = 0
        taxa_dict[taxa] += count

        # Add similarity values
        for _ in range(count):
            simi_list.append(similarity)

    return eval_list, simi_list, taxa_dict


def calculate_cluster_taxa(taxa_dict, args):
    """
    Calculate the most likely taxa for a cluster based on read counts.

    Applies the 'uncertain taxa use ratio' to unannotated reads and uses a
    recursive approach to potentially split a cluster into sub-clusters when
    taxonomic dominance is not strong enough (based on ``min_to_split``
    and ``min_count_to_split``).

    :param taxa_dict: Mapping of taxa (full string) -> read counts.
    :type taxa_dict: dict[str, int]
    :param args: Parsed script arguments, including parameters for taxa calculation.
    :type args: argparse.Namespace
    :return: A list of refined taxa assignments (dictionaries), where each dictionary
        represents a potentially split sub-cluster.
    :rtype: list[dict[str, float | int]]
    """
    # Replace 'Unannotated read' with the uncertain-taxa placeholder format
    processed_dict = {}
    for taxa, count in taxa_dict.items():
        if taxa == 'Unannotated read':
            uncertain_taxa = ' / '.join(['Uncertain taxa'] * 7)
            processed_dict[uncertain_taxa] = count
        else:
            processed_dict[taxa] = count

    return _recursive_taxa_calculation(processed_dict, args, 0)
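# Worked example (illustrative) for the aggregation above: a cluster with one
# annotated read {'count': 3, 'evalue': 1e-50, 'similarity': 100.0, ...} and
# one unannotated read {'count': 2, 'similarity': 98.5, ...} yields
# eval_list == [2.0, 1e-50, 1e-50, 1e-50], a simi_list of five entries
# ([100.0] * 3 + [98.5] * 2), and
# taxa_dict == {'Unannotated read': 2, '<taxa string>': 3}.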
def _recursive_taxa_calculation(taxa_dict, args, level):
    """
    Recursive helper to calculate and potentially split taxa at each taxonomic level.

    :param taxa_dict: Taxa counts at the current level (or sub-group).
    :type taxa_dict: dict[str, float | int]
    :param args: Parsed script arguments.
    :type args: argparse.Namespace
    :param level: Index of the current taxonomic level (0=kingdom, max 6=species).
    :type level: int
    :return: List of refined taxa dictionaries.
    :rtype: list[dict[str, float | int]]
    """
    if level >= 7:  # Max 7 taxonomic levels
        return [taxa_dict]

    level_dict = defaultdict(float)

    # Group counts by the taxon at the current level
    for taxa, count in taxa_dict.items():
        taxa_parts = taxa.split(' / ')
        if level < len(taxa_parts):
            level_taxon = taxa_parts[level]
            if level_taxon == 'Uncertain taxa':
                level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio
            else:
                level_dict[level_taxon] += count

    if len(level_dict) <= 1:
        # Only one taxon at this level; continue to the next level
        return _recursive_taxa_calculation(taxa_dict, args, level + 1)

    # Sort by abundance
    sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True)

    result = []
    dominant_taxon = sorted_taxa[0][0]

    # Check whether each secondary taxon should be split off
    for i in range(1, len(sorted_taxa)):
        secondary_taxon = sorted_taxa[i][0]
        total_count = sorted_taxa[0][1] + sorted_taxa[i][1]
        ratio = sorted_taxa[i][1] / total_count

        if ratio >= args.min_to_split or total_count <= args.min_count_to_split:
            # Split off this taxon into its own sub-cluster
            split_dict = {taxa: count for taxa, count in taxa_dict.items()
                          if taxa.split(' / ')[level] == secondary_taxon}
            result.extend(_recursive_taxa_calculation(split_dict, args, level + 1))
        # Note: secondary taxa that do not meet the split criteria are dropped
        # here; they end up neither in a split group nor in the dominant group.

    # Process the dominant group
    dominant_dict = {taxa: count for taxa, count in taxa_dict.items()
                     if taxa.split(' / ')[level] == dominant_taxon}
    result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1))

    return result
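# Worked example (illustrative) of the split criteria above: with the defaults
# min_to_split=0.45 and min_count_to_split=10, a level with 60 reads of taxon A
# and 40 of taxon B gives ratio = 40 / (60 + 40) = 0.40 < 0.45 with a total of
# 100 > 10, so B is not split off (and, as noted above, its reads are not
# carried into the dominant group either). With 50 vs 45 reads the ratio is
# 45 / 95 ≈ 0.47 >= 0.45, and B continues as its own sub-cluster.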
def write_similarity_output(all_simi_data, output_file):
    """
    Write the similarity text output, including the mean and standard deviation,
    and a count per similarity percentage.

    :param all_simi_data: List of all similarity percentages from all reads.
    :type all_simi_data: list[float]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    if not all_simi_data:
        return

    avg = sum(all_simi_data) / len(all_simi_data)
    variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data)
    st_dev = sqrt(variance)

    simi_counter = Counter(all_simi_data)
    simi_sorted = sorted(simi_counter.items(), key=lambda x: -x[0])

    with open(output_file, 'w') as f:
        f.write(f"# Average similarity: {avg:.2f}\n")
        f.write(f"# Standard deviation: {st_dev:.2f}\n")
        f.write("similarity\tcount\n")
        for similarity, count in simi_sorted:
            f.write(f"{similarity}\t{count}\n")


def write_evalue_output(all_eval_data, output_file):
    """
    Write the E-value text output, including the count of unannotated reads
    and a count per E-value.

    :param all_eval_data: List of E-values from all reads. The first element ([0]) is the count of unannotated reads.
    :type all_eval_data: list[float | int]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    unanno_count = all_eval_data[0]
    eval_counter = Counter(all_eval_data[1:])

    with open(output_file, 'w') as f:
        f.write("evalue\tcount\n")
        if unanno_count > 0:
            f.write(f"unannotated\t{unanno_count}\n")

        eval_sorted = sorted(eval_counter.items(),
                             key=lambda x: (-x[1], float(x[0]) if isinstance(x[0], (int, float)) else float('inf')))
        for value, count in eval_sorted:
            f.write(f"{value}\t{count}\n")


def write_count_output(all_eval_data, cluster_data_list, output_file):
    """
    Write a summary of annotated and unannotated read counts per cluster
    and for the total sample.

    :param all_eval_data: List of E-values from all reads for the total count summary.
    :type all_eval_data: list[float | int]
    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param output_file: Path to the output file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    with open(output_file, 'w') as f:
        f.write("cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n")

        for cluster_num, (eval_list, _, _) in enumerate(cluster_data_list):
            unannotated = eval_list[0]
            annotated = len(eval_list[1:])
            total = unannotated + annotated

            if total > 0:
                perc_annotated = (annotated / total) * 100
                perc_unannotated = (unannotated / total) * 100
            else:
                perc_annotated = perc_unannotated = 0

            f.write(
                f"{cluster_num}\t{unannotated}\t{annotated}\t{total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n")

        # Add the full-sample summary
        total_unannotated = all_eval_data[0]
        total_annotated = len(all_eval_data[1:])
        grand_total = total_unannotated + total_annotated

        if grand_total > 0:
            perc_annotated = (total_annotated / grand_total) * 100
            perc_unannotated = (total_unannotated / grand_total) * 100
        else:
            perc_annotated = perc_unannotated = 0

        f.write(
            f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n")
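# Example of the text outputs written above (values hypothetical):
#
#   # Average similarity: 99.12          evalue	count
#   # Standard deviation: 0.85           unannotated	7
#   similarity	count                    1e-50	12
#   100.0	42
#   98.5	3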
def write_taxa_clusters_output(cluster_data_list, output_file):
    """
    Write raw taxa information per cluster to an Excel file.

    Each row contains the cluster number, read count, the full taxa string,
    and the separate taxonomic levels (kingdom through species).

    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param output_file: Path to the output Excel file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    # Create the main dataframe
    data = []
    for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
        for taxa, count in taxa_dict.items():
            if count > 0:
                # Split taxa into taxonomic levels, padding to 7 levels
                taxa_levels = taxa.split(' / ') if taxa else []
                taxa_levels += ['Unannotated read'] * (7 - len(taxa_levels))
                try:
                    data.append({
                        'cluster': cluster_num,
                        'count': count,
                        'taxa_full': taxa,
                        'kingdom': taxa_levels[0],
                        'phylum': taxa_levels[1],
                        'class': taxa_levels[2],
                        'order': taxa_levels[3],
                        'family': taxa_levels[4],
                        'genus': taxa_levels[5],
                        'species': taxa_levels[6]
                    })
                except IndexError as e:
                    # Skip entries with incomplete taxonomic data
                    print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}', error: {e}")
                    continue

    df = pd.DataFrame(data)
    # Write to a temporary .xlsx path, then move it into place
    temp_path = output_file + ".xlsx"
    out_dir = os.path.dirname(temp_path)
    if out_dir:  # os.makedirs('') raises when the path has no directory part
        os.makedirs(out_dir, exist_ok=True)
    with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Raw_Taxa_Clusters', index=False)
    os.replace(temp_path, output_file)
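# Design note: the workbook is written to a temporary '<output>.xlsx' path and
# then moved over the final name with os.replace(), so a partially written
# file never appears at the Galaxy output path. Each 'Raw_Taxa_Clusters' row
# carries the cluster number, read count, the full taxa string, and one column
# per rank; e.g. (hypothetical) a 5-read entry for
# 'Animalia / Chordata / Aves / Passeriformes / Paridae / Parus / Parus major'
# fills kingdom='Animalia' through species='Parus major'.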
def write_taxa_processed_output(cluster_data_list, args, output_file):
    """
    Write processed (potentially split) taxa information to an Excel file.

    The file contains the sub-clusters resulting from the taxonomic dominance
    analysis and a separate sheet documenting the parameters used.

    :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster.
    :type cluster_data_list: list[tuple]
    :param args: Parsed script arguments, used for taxa calculation and settings documentation.
    :type args: argparse.Namespace
    :param output_file: Path to the output Excel file.
    :type output_file: str
    :return: None
    :rtype: None
    """
    # Create the main dataframe
    data = []
    for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list):
        processed_taxa = calculate_cluster_taxa(taxa_dict, args)

        for taxa_group in processed_taxa:
            for taxa, count in taxa_group.items():
                if 'Uncertain taxa / Uncertain taxa / Uncertain taxa' in taxa:
                    if args.show_unannotated_clusters:
                        data.append({
                            'cluster': cluster_num,
                            'count': count,
                            'taxa_full': 'Unannotated read',
                            'kingdom': 'Unannotated',
                            'phylum': 'Unannotated',
                            'class': 'Unannotated',
                            'order': 'Unannotated',
                            'family': 'Unannotated',
                            'genus': 'Unannotated',
                            'species': 'Unannotated'
                        })
                else:
                    # Split taxa into taxonomic levels
                    taxa_levels = taxa.split(' / ') if taxa else []

                    try:
                        data.append({
                            'cluster': cluster_num,
                            'count': count,
                            'taxa_full': taxa,
                            'kingdom': taxa_levels[0],
                            'phylum': taxa_levels[1],
                            'class': taxa_levels[2],
                            'order': taxa_levels[3],
                            'family': taxa_levels[4],
                            'genus': taxa_levels[5],
                            'species': taxa_levels[6]
                        })
                    except IndexError:
                        # Skip entries with incomplete taxonomic data
                        print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}'")
                        continue

    df = pd.DataFrame(data)

    # Create the settings dataframe
    settings_data = [
        ['uncertain_taxa_use_ratio', args.uncertain_taxa_use_ratio],
        ['min_to_split', args.min_to_split],
        ['min_count_to_split', args.min_count_to_split]
    ]
    settings_df = pd.DataFrame(settings_data, columns=['Parameter', 'Value'])

    # Write to Excel with multiple sheets, then move the file into place
    temp_path = output_file + ".xlsx"
    out_dir = os.path.dirname(temp_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with pd.ExcelWriter(temp_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Processed_Taxa_Clusters', index=False)
        settings_df.to_excel(writer, sheet_name='Settings', index=False)

        # Auto-adjust column widths for better readability
        for sheet_name in writer.sheets:
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except Exception:
                        pass
                adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
                worksheet.column_dimensions[column_letter].width = adjusted_width
    os.replace(temp_path, output_file)
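# The 'Settings' sheet written above documents the run parameters; with the
# argparse defaults it would contain (illustrative):
#
#   Parameter                 Value
#   uncertain_taxa_use_ratio  0.5
#   min_to_split              0.45
#   min_count_to_split        10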
+ """ + data = [ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0] + if not data: + return None, None + + counts, bins, _ = plt.hist(data, bins=50, range=(0, 1)) + plt.close() + return counts, bins + +def create_evalue_plot_test(evalue_list, unannotated_list, output_file): + """ + Create and save an E-value distribution plot, returning the computed histogram data. + + This function visualizes the frequency distribution of E-values from BLAST or + annotation results. It saves the plot to the specified output file and returns + the histogram data (counts and bins) for testing with pytests. + + :param evalue_list: List of numeric E-values to plot + :type evalue_list: list[float | int] + :param unannotated_list: Optional list of E-values for unannotated sequences. + :type unannotated_list: list + :param output_file: Path where the histogram image will be saved. + :type output_file: str + + :return: Tuple containing: + - **counts** (*numpy.ndarray*): Frequency counts per histogram bin. + - **bins** (*numpy.ndarray*): Histogram bin edges. + Returns ``(None, None)`` if no valid data was available for plotting. + :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None] + """ + counts, bins = prepare_evalue_histogram(evalue_list, unannotated_list) + if counts is None: + return None, None + + plt.hist([ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0], + bins=50, range=(0, 1)) + plt.xlabel("E-value") + plt.ylabel("Frequency") + plt.title("E-value Distribution") + plt.savefig(output_file) + plt.close() + return counts, bins + + +def main(arg_list=None): + """ + Main entry point of the script. + + Parses arguments, processes cd-hit cluster data, aggregates results, + and generates requested outputs (text summaries, plots, and Excel reports). + + :param arg_list: List of arguments for testing purposes. 
def main(arg_list=None):
    """
    Main entry point of the script.

    Parses arguments, processes cd-hit cluster data, aggregates results,
    and generates the requested outputs (text summaries, plots, and Excel reports).

    :param arg_list: List of arguments for testing purposes.
    :type arg_list: list, optional
    :return: None
    :rtype: None
    """
    args = parse_arguments(arg_list)

    # Parse the cluster file
    clusters = parse_cluster_file(
        args.input_cluster,
        args.input_annotation,
        args.print_empty_files
    )

    # Process each cluster
    all_eval_data = [0]  # For full-sample statistics
    all_simi_data = []
    cluster_eval_lengths = []
    cluster_simi_lengths = []
    cluster_data_list = []

    for cluster in clusters:
        eval_list, simi_list, taxa_dict = process_cluster_data(cluster)
        cluster_data_list.append((eval_list, simi_list, taxa_dict))

        # Collect data for the full-sample plots
        all_eval_data[0] += eval_list[0]
        if len(eval_list) > 1:
            all_eval_data.extend(sorted(eval_list[1:]))
            cluster_eval_lengths.append(len(eval_list[1:]))

        if simi_list:
            all_simi_data.extend(sorted(simi_list, reverse=True))
            cluster_simi_lengths.append(len(simi_list))

    # Generate outputs based on what was requested
    if args.output_similarity_txt:
        write_similarity_output(all_simi_data, args.output_similarity_txt)

    if args.output_similarity_plot and all_simi_data:
        create_similarity_plot(all_simi_data, cluster_simi_lengths, args, args.output_similarity_plot)

    if args.output_evalue_txt:
        write_evalue_output(all_eval_data, args.output_evalue_txt)

    if args.output_evalue_plot and len(all_eval_data) > 1:
        create_evalue_plot(all_eval_data, cluster_eval_lengths, args.output_evalue_plot)

    if args.output_count:
        write_count_output(all_eval_data, cluster_data_list, args.output_count)

    if args.output_taxa_clusters:
        write_taxa_clusters_output(cluster_data_list, args.output_taxa_clusters)

    if args.output_taxa_processed:
        write_taxa_processed_output(cluster_data_list, args, args.output_taxa_processed)

    print(f"Processing complete. Processed {len(clusters)} clusters.")


if __name__ == "__main__":
    main()
\ No newline at end of file
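# Illustrative smoke test (not part of this commit): build a tiny .clstr file
# and run the pipeline on it without an annotation file, using the arg_list
# hook documented on main():
#
#   import os, tempfile
#   clstr = (">Cluster 0\n"
#            "0\t150nt, >read_1(5)... *\n"
#            "1\t148nt, >read_2(2)... at +/98.50%\n")
#   with tempfile.TemporaryDirectory() as d:
#       path = os.path.join(d, "test.clstr")
#       with open(path, "w") as fh:
#           fh.write(clstr)
#       main(["--input_cluster", path,
#             "--output_count", os.path.join(d, "counts.tsv")])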
