Mercurial > repos > onnodg > cdhit_analysis

diff cdhit_analysis.xml @ 4:e64af72e1b8f draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author: onnodg
date: Mon, 15 Dec 2025 16:44:40 +0000
parents: c6981ea453ae
--- a/cdhit_analysis.xml	Fri Oct 24 09:38:24 2025 +0000
+++ b/cdhit_analysis.xml	Mon Dec 15 16:44:40 2025 +0000
@@ -1,262 +1,242 @@
-<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.2">
-    <description>Analyze CD-HIT clustering results with taxonomic annotation</description>
-
-    <requirements>
-        <requirement type="package" version="3.12.3">python</requirement>
-        <requirement type="package" version="3.10.6">matplotlib</requirement>
-        <requirement type="package" version="2.3.2">pandas</requirement>
-        <requirement type="package" version="3.1.5">openpyxl</requirement>
-    </requirements>
-
-    <command detect_errors="exit_code"><![CDATA[
-bash '$__tool_directory__/cdhit_analysis.sh'
-        --input_cluster '$input_cluster'
-        --input_annotation '$input_annotation'
-
-        #if $output_options.similarity_output:
-            --output_similarity_txt '$similarity_txt'
-            --output_similarity_plot '$similarity_plot'
-        #end if
-
-        #if $output_options.evalue_output:
-            --output_evalue_txt '$evalue_txt'
-            --output_evalue_plot '$evalue_plot'
-        #end if
-
-        #if $output_options.count_output:
-            --output_count '$cluster_count'
-        #end if
-
-        #if $output_options.taxa_output:
-            --output_taxa_clusters '$cluster_taxa'
-            --output_taxa_processed '$processed_taxa'
-        #end if
-
-        --simi_plot_y_min '$plot_params.simi_plot_y_min'
-        --simi_plot_y_max '$plot_params.simi_plot_y_max'
-
-        --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio'
-        --min_to_split '$taxa_params.min_to_split'
-        --min_count_to_split '$taxa_params.min_count_to_split'
-
-        #if $processing_options.show_unannotated_clusters:
-            --show_unannotated_clusters
-        #end if
-
-        #if $processing_options.make_taxa_in_cluster_split:
-            --make_taxa_in_cluster_split
-        #end if
-
-        #if $processing_options.print_empty_files:
-            --print_empty_files
-        #end if
-    ]]></command>
-
-    <inputs>
-        <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file"
-               help="Output cluster file from cd-hit-est" />
-        <param name="input_annotation" type="data" format="xlsx"
-               label="Excel Annotations file"
-               help="Excel workfile with annotations per header" />
-
-        <section name="output_options" title="Output Options" expanded="true">
-            <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create cluster similarity output"
-                   help="Generate similarity analysis and plots" />
-            <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create cluster E-value output"
-                   help="Generate E-value analysis and plots" />
-            <param name="count_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create cluster count output"
-                   help="Generate read count summaries" />
-            <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create taxa annotations output"
-                   help="Generate taxonomic analysis" />
-        </section>
-
-        <section name="plot_params" title="Plot Parameters" expanded="false">
-            <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100"
-                   label="Similarity plot Y-axis minimum"
-                   help="Minimum value for similarity plot Y-axis" />
-            <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100"
-                   label="Similarity plot Y-axis maximum"
-                   help="Maximum value for similarity plot Y-axis" />
-        </section>
-
-        <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false">
-            <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1"
-                   label="Uncertain taxa ratio"
-                   help="Ratio at which uncertain taxa count toward the correct taxa" />
-            <param name="min_to_split" type="float" value="0.45" min="0" max="1"
-                   label="Minimum percentage to split"
-                   help="Minimum percentage for taxonomic split" />
-            <param name="min_count_to_split" type="integer" value="10" min="1"
-                   label="Minimum count to split"
-                   help="Minimum count for taxonomic split" />
-        </section>
-
-        <section name="processing_options" title="Processing Options" expanded="false">
-            <param name="show_unannotated_clusters" type="boolean" truevalue="true" falsevalue="false"
-                   checked="false" label="Show unannotated clusters"
-                   help="Include unannotated clusters in output" />
-            <param name="make_taxa_in_cluster_split" type="boolean" truevalue="true" falsevalue="false"
-                   checked="false" label="Split clusters with multiple taxa"
-                   help="Split clusters containing multiple taxa instead of marking as uncertain" />
-            <param name="print_empty_files" type="boolean" truevalue="true" falsevalue="false"
-                   checked="false" label="Print empty file messages"
-                   help="Print messages about empty annotation files" />
-        </section>
-    </inputs>
-
-    <outputs>
-        <data name="similarity_txt" format="txt" label="Similarity data" >
-            <filter>output_options['similarity_output']</filter>
-        </data>
-
-        <data name="similarity_plot" format="png" label="Similarity plot" >
-            <filter>output_options['similarity_output']</filter>
-        </data>
-
-        <data name="evalue_txt" format="txt" label="E-value data" >
-            <filter>output_options['evalue_output']</filter>
-        </data>
-
-        <data name="evalue_plot" format="png" label="E-value plot" >
-            <filter>output_options['evalue_output']</filter>
-        </data>
-
-        <data name="cluster_count" format="txt" label="Count summary" >
-            <filter>output_options['count_output']</filter>
-        </data>
-
-        <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster" >
-            <filter>output_options['taxa_output']</filter>
-        </data>
-
-        <data name="processed_taxa" format="xlsx" label="Processed taxa" >
-            <filter>output_options['taxa_output']</filter>
-        </data>
-    </outputs>
-
-    <tests>
-        <test expect_num_outputs="7">
-            <param name="input_cluster" value="29-test.clstr.txt" />
-            <param name="input_annotation" value="header_anno_29_test.xlsx" />
-            <section name="output_options">
-                <param name="similarity_output" value="true" />
-                <param name="evalue_output" value="true" />
-                <param name="count_output" value="true" />
-                <param name="taxa_output" value="true" />
-            </section>
-            <output name="similarity_txt" file="sim_out.txt" />
-            <output name="similarity_plot" file="sim_out.png" compare="sim_size"/>
-            <output name="evalue_txt" file="evalue_out.txt" />
-            <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/>
-            <output name="cluster_count" file="count_out.txt" />
-            <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/>
-            <output name="processed_taxa" file="processed.xlsx" decompress="true"/>
-        </test>
-        <test expect_num_outputs="7">
-            <param name="input_cluster" value="input2_test.clstr.txt" />
-            <param name="input_annotation" value="header_anno_excel.xlsx" />
-            <section name="output_options">
-                <param name="similarity_output" value="true" />
-                <param name="evalue_output" value="true" />
-                <param name="count_output" value="true" />
-                <param name="taxa_output" value="true" />
-            </section>
-            <output name="similarity_txt" file="test2_sim_out.txt" />
-            <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/>
-            <output name="evalue_txt" file="test2_evalue_out.txt" />
-            <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/>
-            <output name="cluster_count" file="test_2count_out.txt" />
-            <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/>
-            <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/>
-        </test>
-        <test expect_num_outputs="5">
-            <param name="input_cluster" value="input2_test.clstr.txt" />
-            <param name="input_annotation" value="header_anno_excel.xlsx" />
-            <section name="output_options">
-                <param name="similarity_output" value="true" />
-                <param name="count_output" value="true" />
-                <param name="taxa_output" value="true" />
-                <param name="evalue_output" value="false" />
-            </section>
-            <section name="processing_options">
-                <param name="show_unannotated_clusters" value="true"/>
-                <param name="make_taxa_in_cluster_split" value="true"/>
-                <param name="print_empty_files" value="true"/>
-            </section>
-            <section name="taxa_params">
-                <param name="uncertain_taxa_use_ratio" value="0.6"/>
-                <param name="min_to_split" value="0.6"/>
-                <param name="min_count_to_split" value="6"/>
-            </section>
-            <section name="plot_params">
-                <param name="simi_plot_y_min" value="0.4" />
-                <param name="simi_plot_y_max"  value="0.4"  />
-            </section>
-            <output name="similarity_txt" file="test2_sim_extra_out.txt" />
-            <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/>
-            <output name="cluster_count" file="test_2count_extra_out.txt" />
-            <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/>
-            <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/>
-        </test>
-    </tests>
-
-    <help><![CDATA[
-**CD-HIT Cluster Analysis**
-
-This tool analyzes CD-HIT clustering results and provides various outputs including taxonomic analysis, similarity analysis, E-value analysis, and read count summaries.
-
-**Input Files:**
-
-1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences.
-
-2. **Annotation file (.xlsx)**: Tab-separated file containing sequence annotations with columns:
-
-**Output Options:**
-
-- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
-- **Cluster e-value output**: Creates E-value analysis with plots and text files showing E-value distributions
-- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
-- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster
-
-**Parameters:**
-
-- **Plot Parameters**: Control the size of similarity plots (X and Y-axis limits)
-- **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split
-- **Processing Options**: Control display of unannotated clusters and verbose output
-
-**Output Files:**
-
-- **Similarity data**: Tab-separated file with similarity statistics
-- **Similarity plot**: PNG image showing similarity distribution across clusters
-- **E-value data**: Tab-separated file with E-value statistics
-- **E-value plot**: PNG image showing E-value distribution
-- **Count summary**: Tab-separated file with read counts per cluster
-- **Raw taxa per cluster**: Excel file showing all taxa found in each cluster
-- **Processed taxa**: Excel file with clusters where a taxon was assigned
-
-**Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".
-
--------------
-
-.. class:: infomark
-
-**Credits**
-
-Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter,
-Developed for the New light on old remedies project, a PhD research by Anja Fischer.
-
-Link to the project website:
-
-* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html
-
-    ]]></help>
-    <creator>
-        <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" />
-        <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/>
-        <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" />
-    </creator>
+<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="2.0.0">
+    <description>Analyze CD-HIT clustering results with taxonomic annotation</description>
+
+    <requirements>
+        <requirement type="package" version="3.12.3">python</requirement>
+        <requirement type="package" version="3.10.6">matplotlib</requirement>
+        <requirement type="package" version="2.3.2">pandas</requirement>
+        <requirement type="package" version="3.1.5">openpyxl</requirement>
+    </requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+bash '$__tool_directory__/cdhit_analysis.sh'
+        --input_cluster '$input_cluster'
+        --input_annotation '$input_annotation'
+        #if $output_options.similarity_output:
+            --output_similarity_txt '$similarity_txt'
+            --output_similarity_plot '$similarity_plot'
+        #end if
+
+        #if $output_options.count_output:
+            --output_count '$cluster_count'
+        #end if
+
+        #if $output_options.taxa_output:
+            --output_excel '$taxa_excel'
+            #if $output_options.show_all:
+                --output_taxa_clusters
+            #end if
+            #if $output_options.show_calculated:
+                --output_taxa_processed
+            #end if
+        #end if
+        --log_file '$log_file'
+
+        --simi_plot_y_min '$plot_params.simi_plot_y_min'
+        --simi_plot_y_max '$plot_params.simi_plot_y_max'
+        --min_cluster_support '$taxa_params.min_cluster_support'
+        --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio'
+        --min_to_split '$taxa_params.min_to_split'
+        --min_count_to_split '$taxa_params.min_count_to_split'
+
+    ]]></command>
+
+    <inputs>
+        <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file"
+               help="Output cluster file from cd-hit-est" />
+        <param name="input_annotation" type="data" format="xlsx"
+               label="Excel Annotations file"
+               help="Excel workfile with annotations per header" />
+
+        <section name="output_options" title="Output Options" expanded="true">
+            <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false"
+                   checked="false" label="Create cluster similarity output"
+                   help="Generate similarity analysis and plots" />
+            <param name="count_output" type="boolean" truevalue="true" falsevalue="false"
+                   checked="false" label="Create cluster count output"
+                   help="Generate read count summaries" />
+            <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false"
+                   checked="false" label="Create taxa annotations output"
+                   help="Generate taxonomic analysis" />
+            <param name="show_all" type="boolean" truevalue="true" falsevalue="false"
+                   checked="false" label="Show all annotations per cluster"
+                   help="Output all annotations found per cluster in the excel file" />
+            <param name="show_calculated" type="boolean" truevalue="true" falsevalue="false"
+                   checked="false" label="Show calculated annotations per cluster"
+                   help="Output calculated annotations per cluster in the excel file" />
+        </section>
+
+        <section name="plot_params" title="Plot Parameters" expanded="false">
+            <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100"
+                   label="Similarity plot Y-axis minimum"
+                   help="Minimum value for similarity plot Y-axis" />
+            <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100"
+                   label="Similarity plot Y-axis maximum"
+                   help="Maximum value for similarity plot Y-axis" />
+        </section>
+
+        <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false">
+            <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1"
+                   label="Uncertain taxa ratio"
+                   help="Ratio at which uncertain taxa count toward the correct taxa" />
+            <param name="min_to_split" type="float" value="0.45" min="0" max="0.5"
+                   label="Minimum percentage to split"
+                   help="Minimum percentage the second most abundant taxon has to be for taxonomic split" />
+            <param name="min_count_to_split" type="integer" value="10" min="1"
+                   label="Minimum count to split"
+                   help="Minimum count for taxonomic split within clusters" />
+            <param name="min_cluster_support" type="integer" value="1" min="1"
+                   label="Minimum cluster size"
+                   help="Clusters are ignored if they are smaller than this number" />
+        </section>
+    </inputs>
+
+    <outputs>
+        <data name="similarity_txt" format="txt" label="Similarity data" >
+            <filter>output_options['similarity_output']</filter>
+        </data>
+        <data name="similarity_plot" format="png" label="Similarity plot" >
+            <filter>output_options['similarity_output']</filter>
+        </data>
+        <data name="cluster_count" format="txt" label="Count summary" >
+            <filter>output_options['count_output']</filter>
+        </data>
+        <data name="taxa_excel" format="xlsx" label="Taxon output per cluster" >
+            <filter>output_options['taxa_output']</filter>
+        </data>
+        <data name="log_file" format="txt" label="Log file"/>
+    </outputs>
+
+    <tests>
+        <test expect_num_outputs="5">
+            <param name="input_cluster" value="prev_anno.txt" />
+            <param name="input_annotation" value="prev4.xlsx" />
+            <section name="output_options">
+                <param name="similarity_output" value="true" />
+                <param name="count_output" value="true" />
+                <param name="taxa_output" value="true" />
+                <param name="show_all" value="true" />
+                <param name="show_calculated" value="true" />
+            </section>
+            <section name="taxa_params">
+                <param name="uncertain_taxa_use_ratio" value="0.5" />
+                <param name="min_to_split" value="0.45" />
+                <param name="min_count_to_split" value="10" />
+                <param name="min_cluster_support" value="1" />
+            </section>
+            <section name="plot_params">
+                <param name="simi_plot_y_min" value="95" />
+                <param name="simi_plot_y_max" value="100" />
+            </section>
+            <output name="log_file" file="test1_logs.txt"/>
+            <output name="similarity_txt" file="test1_similarity.txt" />
+            <output name="similarity_plot" file="test1_similarity.png" compare="sim_size" />
+            <output name="cluster_count" file="test1_summary.txt" />
+            <output name="taxa_excel" file="test1_un_report.xlsx" decompress="true" />
+         </test>
+        <test expect_num_outputs="5">
+            <param name="input_cluster" value="test2_clusters.txt" />
+            <param name="input_annotation" value="test2_annotations.xlsx" />
+            <section name="output_options">
+                <param name="similarity_output" value="true" />
+                <param name="count_output" value="true" />
+                <param name="taxa_output" value="true" />
+                <param name="show_all" value="true" />
+                <param name="show_calculated" value="true" />
+            </section>
+            <section name="taxa_params">
+                <param name="uncertain_taxa_use_ratio" value="0.5" />
+                <param name="min_to_split" value="0.45" />
+                <param name="min_count_to_split" value="10" />
+                <param name="min_cluster_support" value="1" />
+            </section>
+            <section name="plot_params">
+                <param name="simi_plot_y_min" value="95" />
+                <param name="simi_plot_y_max" value="100" />
+            </section>
+            <output name="log_file" file="test2_logs.txt"/>
+            <output name="similarity_txt" file="test2_similarity.txt" />
+            <output name="similarity_plot" file="test2_similarity.png" compare="sim_size" />
+            <output name="cluster_count" file="test2_summary.txt" />
+            <output name="taxa_excel" file="test2_un_report.xlsx" decompress="true" />
+       </test>
+        <test expect_num_outputs="3">
+            <param name="input_cluster" value="test2_clusters.txt" />
+            <param name="input_annotation" value="test2_annotations.xlsx" />
+            <section name="output_options">
+                <param name="count_output" value="true" />
+                <param name="taxa_output" value="true" />
+                <param name="show_all" value="true" />
+                <param name="show_calculated" value="false" />
+            </section>
+            <section name="taxa_params">
+                <param name="uncertain_taxa_use_ratio" value="0.2" />
+                <param name="min_to_split" value="0.1" />
+                <param name="min_count_to_split" value="3" />
+                <param name="min_cluster_support" value="4" />
+            </section>
+            <section name="plot_params">
+                <param name="simi_plot_y_min" value="95" />
+                <param name="simi_plot_y_max" value="100" />
+            </section>
+            <output name="log_file" file="test3_logs.txt"/>
+            <output name="cluster_count" file="test3_summary.txt" />
+            <output name="taxa_excel" file="test3_un_report.xlsx" decompress="true" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+**CD-HIT Cluster Analysis**
+
+This tool analyzes CD-HIT clustering output together with an annotation Excel file,
+producing similarity statistics, count summaries, and taxonomic assignments.
+
+**Input Files:**
+
+1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences.
+
+2. **Annotation file (.xlsx)**: Required. Excel workbook containing the annotations for each sequence header.
+
+**Output Options:**
+
+- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
+- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
+- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster
+  - **Raw_Taxa_Clusters** — all annotations per cluster
+  - **Processed_Taxa_Clusters** — Annotations per cluster after weighted LCA
+
+**Parameters:**
+
+- **Plot Parameters**: Control the Y-axis range (minimum and maximum) of the similarity plot
+- **Taxonomic Analysis Parameters**: Control the minimum cluster size, how uncertain taxa are weighted, and when clusters are split
+
+**Output Files:**
+
+- **Similarity data**: Tab-separated file with similarity statistics
+- **Similarity plot**: PNG image showing similarity distribution across clusters
+- **Count summary**: Tab-separated file with read counts per cluster
+- **Taxon output per cluster**: Excel file with the taxon annotations per cluster (all annotations and/or calculated annotations, depending on the selected output options)
+- **Log file**: Contains cluster statistics and error logs
+
+**Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".
+
+-------------
+
+.. class:: infomark
+
+**Credits**
+
+Based on a script by Nick Kortleven; translated, modified and wrapped by Onno de Gorter.
+Developed for the New Lights on Old Remedies project, the PhD research of Anja Fischer.
+
+Link to the project website:
+
+* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html
+
+    ]]></help>
+    <creator>
+        <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" />
+        <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/>
+        <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" />
+    </creator>
 </tool>
\ No newline at end of file
author	onnodg
date	Mon, 15 Dec 2025 16:44:40 +0000
parents	c6981ea453ae
children