view cdhit_analysis.xml @ 4:e64af72e1b8f draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:44:40 +0000
parents c6981ea453ae
children
line wrap: on
line source

<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="2.0.0">
    <!-- Short summary of what the tool does. -->
    <description>Analyze CD-HIT clustering results with taxonomic annotation</description>

    <!-- Packages, each pinned to an exact version, that the tool declares as dependencies. -->
    <requirements>
        <requirement version="3.12.3" type="package">python</requirement>
        <requirement version="3.10.6" type="package">matplotlib</requirement>
        <requirement version="2.3.2" type="package">pandas</requirement>
        <requirement version="3.1.5" type="package">openpyxl</requirement>
    </requirements>

    <!--
        Command template: runs cdhit_analysis.sh from the tool directory through bash.
        The two input datasets, the log file path and all numeric parameters
        (plot_params and taxa_params) are always passed on the command line.
        The similarity, count and Excel output paths are only passed when the matching
        boolean in output_options is set. The two extra taxa flags controlled by
        show_all and show_calculated are only emitted inside the taxa_output branch,
        so they have no effect when taxa_output is off.
        Job errors are detected from the process exit code (detect_errors="exit_code").
    -->
    <command detect_errors="exit_code"><![CDATA[
bash '$__tool_directory__/cdhit_analysis.sh'
        --input_cluster '$input_cluster'
        --input_annotation '$input_annotation'
        #if $output_options.similarity_output:
            --output_similarity_txt '$similarity_txt'
            --output_similarity_plot '$similarity_plot'
        #end if

        #if $output_options.count_output:
            --output_count '$cluster_count'
        #end if

        #if $output_options.taxa_output:
            --output_excel '$taxa_excel'
            #if $output_options.show_all:
                --output_taxa_clusters
            #end if
            #if $output_options.show_calculated:
                --output_taxa_processed
            #end if
        #end if
        --log_file '$log_file'

        --simi_plot_y_min '$plot_params.simi_plot_y_min'
        --simi_plot_y_max '$plot_params.simi_plot_y_max'
        --min_cluster_support '$taxa_params.min_cluster_support'
        --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio'
        --min_to_split '$taxa_params.min_to_split'
        --min_count_to_split '$taxa_params.min_count_to_split'

    ]]></command>

    <inputs>
        <!-- Required input datasets: the CD-HIT cluster file and the Excel annotation file. -->
        <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file"
               help="Output cluster file from cd-hit-est" />
        <param name="input_annotation" type="data" format="xlsx"
               label="Excel Annotations file"
               help="Excel workfile with annotations per header" />

        <!-- Switches selecting which outputs are produced. Each of the first three is
             mirrored by a conditional block in the command and by an output filter. -->
        <section name="output_options" title="Output Options" expanded="true">
            <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false"
                   checked="false" label="Create cluster similarity output"
                   help="Generate similarity analysis and plots" />
            <param name="count_output" type="boolean" truevalue="true" falsevalue="false"
                   checked="false" label="Create cluster count output"
                   help="Generate read count summaries" />
            <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false"
                   checked="false" label="Create taxa annotations output"
                   help="Generate taxonomic analysis" />
            <!-- The two options below are only read by the command when taxa_output is enabled. -->
            <param name="show_all" type="boolean" truevalue="true" falsevalue="false"
                   checked="false" label="Show all annotations per cluster"
                   help="Output all annotations found per cluster in the excel file (only used when the taxa annotations output is enabled)" />
            <param name="show_calculated" type="boolean" truevalue="true" falsevalue="false"
                   checked="false" label="Show calculated annotations per cluster"
                   help="Output calculated annotations per cluster in the excel file (only used when the taxa annotations output is enabled)" />
        </section>

        <!-- Y-axis range of the similarity plot; both values are limited to 0-100. -->
        <section name="plot_params" title="Plot Parameters" expanded="false">
            <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100"
                   label="Similarity plot Y-axis minimum"
                   help="Minimum value for similarity plot Y-axis" />
            <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100"
                   label="Similarity plot Y-axis maximum"
                   help="Maximum value for similarity plot Y-axis" />
        </section>

        <!-- Thresholds for the taxonomic analysis. The two ratio parameters are fractions,
             not percentages: their allowed ranges are 0-1 and 0-0.5. -->
        <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false">
            <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1"
                   label="Uncertain taxa ratio"
                   help="Ratio at which uncertain taxa count toward the correct taxa" />
            <param name="min_to_split" type="float" value="0.45" min="0" max="0.5"
                   label="Minimum percentage to split"
                   help="Minimum share the second most abundant taxon has to reach for a taxonomic split, given as a fraction between 0 and 0.5 (0.45 = 45%)" />
            <param name="min_count_to_split" type="integer" value="10" min="1"
                   label="Minimum count to split"
                   help="Minimum count for taxonomic split within clusters" />
            <param name="min_cluster_support" type="integer" value="1" min="1"
                   label="Minimum cluster size"
                   help="Clusters are ignored if they are smaller than this number" />
        </section>
    </inputs>

    <!-- Each optional dataset is kept only when its output_options switch is set;
         the log file carries no filter and is therefore always produced. -->
    <outputs>
        <data format="txt" name="similarity_txt" label="Similarity data">
            <filter>output_options['similarity_output']</filter>
        </data>
        <data format="png" name="similarity_plot" label="Similarity plot">
            <filter>output_options['similarity_output']</filter>
        </data>
        <data format="txt" name="cluster_count" label="Count summary">
            <filter>output_options['count_output']</filter>
        </data>
        <data format="xlsx" name="taxa_excel" label="Taxon output per cluster">
            <filter>output_options['taxa_output']</filter>
        </data>
        <data format="txt" name="log_file" label="Log file" />
    </outputs>

    <tests>
        <!-- Every optional output enabled, all numeric parameters at their default values;
             inputs are prev_anno.txt and prev4.xlsx. Five outputs are expected. -->
        <test expect_num_outputs="5">
            <param name="input_cluster" value="prev_anno.txt" />
            <param name="input_annotation" value="prev4.xlsx" />
            <section name="output_options">
                <param name="similarity_output" value="true" />
                <param name="count_output" value="true" />
                <param name="taxa_output" value="true" />
                <param name="show_all" value="true" />
                <param name="show_calculated" value="true" />
            </section>
            <section name="plot_params">
                <param name="simi_plot_y_min" value="95" />
                <param name="simi_plot_y_max" value="100" />
            </section>
            <section name="taxa_params">
                <param name="uncertain_taxa_use_ratio" value="0.5" />
                <param name="min_to_split" value="0.45" />
                <param name="min_count_to_split" value="10" />
                <param name="min_cluster_support" value="1" />
            </section>
            <output name="similarity_txt" file="test1_similarity.txt" />
            <output name="similarity_plot" file="test1_similarity.png" compare="sim_size" />
            <output name="cluster_count" file="test1_summary.txt" />
            <output name="taxa_excel" file="test1_un_report.xlsx" decompress="true" />
            <output name="log_file" file="test1_logs.txt" />
        </test>
        <!-- Every optional output enabled with default numeric parameters, run on the
             test2 input files. Five outputs are expected. -->
        <test expect_num_outputs="5">
            <param name="input_cluster" value="test2_clusters.txt" />
            <param name="input_annotation" value="test2_annotations.xlsx" />
            <section name="output_options">
                <param name="similarity_output" value="true" />
                <param name="count_output" value="true" />
                <param name="taxa_output" value="true" />
                <param name="show_all" value="true" />
                <param name="show_calculated" value="true" />
            </section>
            <section name="plot_params">
                <param name="simi_plot_y_min" value="95" />
                <param name="simi_plot_y_max" value="100" />
            </section>
            <section name="taxa_params">
                <param name="uncertain_taxa_use_ratio" value="0.5" />
                <param name="min_to_split" value="0.45" />
                <param name="min_count_to_split" value="10" />
                <param name="min_cluster_support" value="1" />
            </section>
            <output name="similarity_txt" file="test2_similarity.txt" />
            <output name="similarity_plot" file="test2_similarity.png" compare="sim_size" />
            <output name="cluster_count" file="test2_summary.txt" />
            <output name="taxa_excel" file="test2_un_report.xlsx" decompress="true" />
            <output name="log_file" file="test2_logs.txt" />
        </test>
        <!-- Similarity output switched off and non-default taxa thresholds, so only three
             outputs are expected: count summary, taxa workbook and log file. -->
        <test expect_num_outputs="3">
            <param name="input_cluster" value="test2_clusters.txt" />
            <param name="input_annotation" value="test2_annotations.xlsx" />
            <section name="output_options">
                <!-- Stated explicitly; this equals the parameter's unchecked default. -->
                <param name="similarity_output" value="false" />
                <param name="count_output" value="true" />
                <param name="taxa_output" value="true" />
                <param name="show_all" value="true" />
                <param name="show_calculated" value="false" />
            </section>
            <section name="plot_params">
                <param name="simi_plot_y_min" value="95" />
                <param name="simi_plot_y_max" value="100" />
            </section>
            <section name="taxa_params">
                <param name="uncertain_taxa_use_ratio" value="0.2" />
                <param name="min_to_split" value="0.1" />
                <param name="min_count_to_split" value="3" />
                <param name="min_cluster_support" value="4" />
            </section>
            <output name="cluster_count" file="test3_summary.txt" />
            <output name="taxa_excel" file="test3_un_report.xlsx" decompress="true" />
            <output name="log_file" file="test3_logs.txt" />
        </test>
    </tests>

    <help><![CDATA[
**CD-HIT Cluster Analysis**

This tool analyzes CD-HIT clustering output together with an annotation Excel file,
producing similarity statistics, count summaries, and taxonomic assignments.

**Input Files:**

1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences.

2. **Annotation file (.xlsx)**: Required. Excel file containing the annotations per sequence header.

**Output Options:**

- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster

  - **Raw_Taxa_Clusters** — all annotations per cluster
  - **Processed_Taxa_Clusters** — annotations per cluster after weighted LCA

**Parameters:**

- **Plot Parameters**: Control the Y-axis range (minimum and maximum) of the similarity plot
- **Taxonomic Analysis Parameters**: Control when clusters are valid and when clusters are split

**Output Files:**

- **Similarity data**: Tab-separated file with similarity statistics
- **Similarity plot**: PNG image showing similarity distribution across clusters
- **Count summary**: Tab-separated file with read counts per cluster
- **Taxon output per cluster**: Excel file showing all taxa found in each cluster
- **Log file**: Contains cluster statistics and error logs

**Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".

-------------

.. class:: infomark

**Credits**

Based on a script by Nick Kortleven; translated, modified and wrapped by Onno de Gorter.
Developed for the New light on old remedies project, PhD research by Anja Fischer.

Link to the project website:

* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html

    ]]></help>
    <!-- Attribution metadata: the hosting organization and the two listed contributors. -->
    <creator>
        <organization url="https://www.naturalis.nl/en/science"
                      name="Naturalis Biodiversity Center" />
        <person familyName="de Gorter" givenName="Onno"
                url="https://github.com/Onnodg" />
        <person familyName="Kortleven" givenName="Nick"
                url="https://github.com/tombkingsts" />
    </creator>
</tool>