Mercurial > repos > onnodg > cdhit_analysis
diff cdhit_analysis.xml @ 4:e64af72e1b8f draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
| author | onnodg |
|---|---|
| date | Mon, 15 Dec 2025 16:44:40 +0000 |
| parents | c6981ea453ae |
| children |
line wrap: on
line diff
--- a/cdhit_analysis.xml Fri Oct 24 09:38:24 2025 +0000 +++ b/cdhit_analysis.xml Mon Dec 15 16:44:40 2025 +0000 @@ -1,262 +1,242 @@ -<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.2"> - <description>Analyze CD-HIT clustering results with taxonomic annotation</description> - - <requirements> - <requirement type="package" version="3.12.3">python</requirement> - <requirement type="package" version="3.10.6">matplotlib</requirement> - <requirement type="package" version="2.3.2">pandas</requirement> - <requirement type="package" version="3.1.5">openpyxl</requirement> - </requirements> - - <command detect_errors="exit_code"><![CDATA[ -bash '$__tool_directory__/cdhit_analysis.sh' - --input_cluster '$input_cluster' - --input_annotation '$input_annotation' - - #if $output_options.similarity_output: - --output_similarity_txt '$similarity_txt' - --output_similarity_plot '$similarity_plot' - #end if - - #if $output_options.evalue_output: - --output_evalue_txt '$evalue_txt' - --output_evalue_plot '$evalue_plot' - #end if - - #if $output_options.count_output: - --output_count '$cluster_count' - #end if - - #if $output_options.taxa_output: - --output_taxa_clusters '$cluster_taxa' - --output_taxa_processed '$processed_taxa' - #end if - - --simi_plot_y_min '$plot_params.simi_plot_y_min' - --simi_plot_y_max '$plot_params.simi_plot_y_max' - - --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio' - --min_to_split '$taxa_params.min_to_split' - --min_count_to_split '$taxa_params.min_count_to_split' - - #if $processing_options.show_unannotated_clusters: - --show_unannotated_clusters - #end if - - #if $processing_options.make_taxa_in_cluster_split: - --make_taxa_in_cluster_split - #end if - - #if $processing_options.print_empty_files: - --print_empty_files - #end if - ]]></command> - - <inputs> - <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file" - help="Output cluster file from cd-hit-est" /> - <param 
name="input_annotation" type="data" format="xlsx" - label="Excel Annotations file" - help="Excel workfile with annotations per header" /> - - <section name="output_options" title="Output Options" expanded="true"> - <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" - checked="true" label="Create cluster similarity output" - help="Generate similarity analysis and plots" /> - <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false" - checked="true" label="Create cluster E-value output" - help="Generate E-value analysis and plots" /> - <param name="count_output" type="boolean" truevalue="true" falsevalue="false" - checked="true" label="Create cluster count output" - help="Generate read count summaries" /> - <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" - checked="true" label="Create taxa annotations output" - help="Generate taxonomic analysis" /> - </section> - - <section name="plot_params" title="Plot Parameters" expanded="false"> - <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" - label="Similarity plot Y-axis minimum" - help="Minimum value for similarity plot Y-axis" /> - <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100" - label="Similarity plot Y-axis maximum" - help="Maximum value for similarity plot Y-axis" /> - </section> - - <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false"> - <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1" - label="Uncertain taxa ratio" - help="Ratio at which uncertain taxa count toward the correct taxa" /> - <param name="min_to_split" type="float" value="0.45" min="0" max="1" - label="Minimum percentage to split" - help="Minimum percentage for taxonomic split" /> - <param name="min_count_to_split" type="integer" value="10" min="1" - label="Minimum count to split" - help="Minimum count for taxonomic split" /> - </section> - - <section 
name="processing_options" title="Processing Options" expanded="false"> - <param name="show_unannotated_clusters" type="boolean" truevalue="true" falsevalue="false" - checked="false" label="Show unannotated clusters" - help="Include unannotated clusters in output" /> - <param name="make_taxa_in_cluster_split" type="boolean" truevalue="true" falsevalue="false" - checked="false" label="Split clusters with multiple taxa" - help="Split clusters containing multiple taxa instead of marking as uncertain" /> - <param name="print_empty_files" type="boolean" truevalue="true" falsevalue="false" - checked="false" label="Print empty file messages" - help="Print messages about empty annotation files" /> - </section> - </inputs> - - <outputs> - <data name="similarity_txt" format="txt" label="Similarity data" > - <filter>output_options['similarity_output']</filter> - </data> - - <data name="similarity_plot" format="png" label="Similarity plot" > - <filter>output_options['similarity_output']</filter> - </data> - - <data name="evalue_txt" format="txt" label="E-value data" > - <filter>output_options['evalue_output']</filter> - </data> - - <data name="evalue_plot" format="png" label="E-value plot" > - <filter>output_options['evalue_output']</filter> - </data> - - <data name="cluster_count" format="txt" label="Count summary" > - <filter>output_options['count_output']</filter> - </data> - - <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster" > - <filter>output_options['taxa_output']</filter> - </data> - - <data name="processed_taxa" format="xlsx" label="Processed taxa" > - <filter>output_options['taxa_output']</filter> - </data> - </outputs> - - <tests> - <test expect_num_outputs="7"> - <param name="input_cluster" value="29-test.clstr.txt" /> - <param name="input_annotation" value="header_anno_29_test.xlsx" /> - <section name="output_options"> - <param name="similarity_output" value="true" /> - <param name="evalue_output" value="true" /> - <param name="count_output" 
value="true" /> - <param name="taxa_output" value="true" /> - </section> - <output name="similarity_txt" file="sim_out.txt" /> - <output name="similarity_plot" file="sim_out.png" compare="sim_size"/> - <output name="evalue_txt" file="evalue_out.txt" /> - <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/> - <output name="cluster_count" file="count_out.txt" /> - <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/> - <output name="processed_taxa" file="processed.xlsx" decompress="true"/> - </test> - <test expect_num_outputs="7"> - <param name="input_cluster" value="input2_test.clstr.txt" /> - <param name="input_annotation" value="header_anno_excel.xlsx" /> - <section name="output_options"> - <param name="similarity_output" value="true" /> - <param name="evalue_output" value="true" /> - <param name="count_output" value="true" /> - <param name="taxa_output" value="true" /> - </section> - <output name="similarity_txt" file="test2_sim_out.txt" /> - <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/> - <output name="evalue_txt" file="test2_evalue_out.txt" /> - <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/> - <output name="cluster_count" file="test_2count_out.txt" /> - <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/> - <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/> - </test> - <test expect_num_outputs="5"> - <param name="input_cluster" value="input2_test.clstr.txt" /> - <param name="input_annotation" value="header_anno_excel.xlsx" /> - <section name="output_options"> - <param name="similarity_output" value="true" /> - <param name="count_output" value="true" /> - <param name="taxa_output" value="true" /> - <param name="evalue_output" value="false" /> - </section> - <section name="processing_options"> - <param name="show_unannotated_clusters" value="true"/> - <param name="make_taxa_in_cluster_split" value="true"/> - <param 
name="print_empty_files" value="true"/> - </section> - <section name="taxa_params"> - <param name="uncertain_taxa_use_ratio" value="0.6"/> - <param name="min_to_split" value="0.6"/> - <param name="min_count_to_split" value="6"/> - </section> - <section name="plot_params"> - <param name="simi_plot_y_min" value="0.4" /> - <param name="simi_plot_y_max" value="0.4" /> - </section> - <output name="similarity_txt" file="test2_sim_extra_out.txt" /> - <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/> - <output name="cluster_count" file="test_2count_extra_out.txt" /> - <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/> - <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/> - </test> - </tests> - - <help><![CDATA[ -**CD-HIT Cluster Analysis** - -This tool analyzes CD-HIT clustering results and provides various outputs including taxonomic analysis, similarity analysis, E-value analysis, and read count summaries. - -**Input Files:** - -1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences. - -2. 
**Annotation file (.xlsx)**: Tab-separated file containing sequence annotations with columns: - -**Output Options:** - -- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions -- **Cluster e-value output**: Creates E-value analysis with plots and text files showing E-value distributions -- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster -- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster - -**Parameters:** - -- **Plot Parameters**: Control the size of similarity plots (X and Y-axis limits) -- **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split -- **Processing Options**: Control display of unannotated clusters and verbose output - -**Output Files:** - -- **Similarity data**: Tab-separated file with similarity statistics -- **Similarity plot**: PNG image showing similarity distribution across clusters -- **E-value data**: Tab-separated file with E-value statistics -- **E-value plot**: PNG image showing E-value distribution -- **Count summary**: Tab-separated file with read counts per cluster -- **Raw taxa per cluster**: Excel file showing all taxa found in each cluster -- **Processed taxa**: Excel file with clusters where a taxon was assigned - -**Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". - -------------- - -.. class:: infomark - -**Credits** - -Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter, -Developed for the New light on old remedies project, a PhD research by Anja Fischer. 
- -Link to the project website: - -* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html - - ]]></help> - <creator> - <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" /> - <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/> - <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" /> - </creator> +<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="2.0.0"> + <description>Analyze CD-HIT clustering results with taxonomic annotation</description> + + <requirements> + <requirement type="package" version="3.12.3">python</requirement> + <requirement type="package" version="3.10.6">matplotlib</requirement> + <requirement type="package" version="2.3.2">pandas</requirement> + <requirement type="package" version="3.1.5">openpyxl</requirement> + </requirements> + + <command detect_errors="exit_code"><![CDATA[ +bash '$__tool_directory__/cdhit_analysis.sh' + --input_cluster '$input_cluster' + --input_annotation '$input_annotation' + #if $output_options.similarity_output: + --output_similarity_txt '$similarity_txt' + --output_similarity_plot '$similarity_plot' + #end if + + #if $output_options.count_output: + --output_count '$cluster_count' + #end if + + #if $output_options.taxa_output: + --output_excel '$taxa_excel' + #if $output_options.show_all: + --output_taxa_clusters + #end if + #if $output_options.show_calculated: + --output_taxa_processed + #end if + #end if + --log_file '$log_file' + + --simi_plot_y_min '$plot_params.simi_plot_y_min' + --simi_plot_y_max '$plot_params.simi_plot_y_max' + --min_cluster_support '$taxa_params.min_cluster_support' + --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio' + --min_to_split '$taxa_params.min_to_split' + --min_count_to_split '$taxa_params.min_count_to_split' + + ]]></command> + + <inputs> + <param name="input_cluster" type="data" format="txt" 
label="CD-HIT cluster file" + help="Output cluster file from cd-hit-est" /> + <param name="input_annotation" type="data" format="xlsx" + label="Excel Annotations file" + help="Excel workfile with annotations per header" /> + + <section name="output_options" title="Output Options" expanded="true"> + <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" + checked="false" label="Create cluster similarity output" + help="Generate similarity analysis and plots" /> + <param name="count_output" type="boolean" truevalue="true" falsevalue="false" + checked="false" label="Create cluster count output" + help="Generate read count summaries" /> + <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" + checked="false" label="Create taxa annotations output" + help="Generate taxonomic analysis" /> + <param name="show_all" type="boolean" truevalue="true" falsevalue="false" + checked="false" label="Show all annotations per cluster" + help="Output all annotations found per cluster in the excel file" /> + <param name="show_calculated" type="boolean" truevalue="true" falsevalue="false" + checked="false" label="Show calculated annotations per cluster" + help="Output calculated annotations per cluster in the excel file" /> + </section> + + <section name="plot_params" title="Plot Parameters" expanded="false"> + <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" + label="Similarity plot Y-axis minimum" + help="Minimum value for similarity plot Y-axis" /> + <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100" + label="Similarity plot Y-axis maximum" + help="Maximum value for similarity plot Y-axis" /> + </section> + + <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false"> + <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1" + label="Uncertain taxa ratio" + help="Ratio at which uncertain taxa count toward the correct taxa" /> + <param 
name="min_to_split" type="float" value="0.45" min="0" max="0.5" + label="Minimum percentage to split" + help="Minimum percentage the second most abundant taxon has to be for taxonomic split" /> + <param name="min_count_to_split" type="integer" value="10" min="1" + label="Minimum count to split" + help="Minimum count for taxonomic split within clusters" /> + <param name="min_cluster_support" type="integer" value="1" min="1" + label="Minimum cluster size" + help="Clusters are ignored if they are smaller than this number" /> + </section> + </inputs> + + <outputs> + <data name="similarity_txt" format="txt" label="Similarity data" > + <filter>output_options['similarity_output']</filter> + </data> + <data name="similarity_plot" format="png" label="Similarity plot" > + <filter>output_options['similarity_output']</filter> + </data> + <data name="cluster_count" format="txt" label="Count summary" > + <filter>output_options['count_output']</filter> + </data> + <data name="taxa_excel" format="xlsx" label="Taxon output per cluster" > + <filter>output_options['taxa_output']</filter> + </data> + <data name="log_file" format="txt" label="Log file"/> + </outputs> + + <tests> + <test expect_num_outputs="5"> + <param name="input_cluster" value="prev_anno.txt" /> + <param name="input_annotation" value="prev4.xlsx" /> + <section name="output_options"> + <param name="similarity_output" value="true" /> + <param name="count_output" value="true" /> + <param name="taxa_output" value="true" /> + <param name="show_all" value="true" /> + <param name="show_calculated" value="true" /> + </section> + <section name="taxa_params"> + <param name="uncertain_taxa_use_ratio" value="0.5" /> + <param name="min_to_split" value="0.45" /> + <param name="min_count_to_split" value="10" /> + <param name="min_cluster_support" value="1" /> + </section> + <section name="plot_params"> + <param name="simi_plot_y_min" value="95" /> + <param name="simi_plot_y_max" value="100" /> + </section> + <output name="log_file" 
file="test1_logs.txt"/> + <output name="similarity_txt" file="test1_similarity.txt" /> + <output name="similarity_plot" file="test1_similarity.png" compare="sim_size" /> + <output name="cluster_count" file="test1_summary.txt" /> + <output name="taxa_excel" file="test1_un_report.xlsx" decompress="true" /> + </test> + <test expect_num_outputs="5"> + <param name="input_cluster" value="test2_clusters.txt" /> + <param name="input_annotation" value="test2_annotations.xlsx" /> + <section name="output_options"> + <param name="similarity_output" value="true" /> + <param name="count_output" value="true" /> + <param name="taxa_output" value="true" /> + <param name="show_all" value="true" /> + <param name="show_calculated" value="true" /> + </section> + <section name="taxa_params"> + <param name="uncertain_taxa_use_ratio" value="0.5" /> + <param name="min_to_split" value="0.45" /> + <param name="min_count_to_split" value="10" /> + <param name="min_cluster_support" value="1" /> + </section> + <section name="plot_params"> + <param name="simi_plot_y_min" value="95" /> + <param name="simi_plot_y_max" value="100" /> + </section> + <output name="log_file" file="test2_logs.txt"/> + <output name="similarity_txt" file="test2_similarity.txt" /> + <output name="similarity_plot" file="test2_similarity.png" compare="sim_size" /> + <output name="cluster_count" file="test2_summary.txt" /> + <output name="taxa_excel" file="test2_un_report.xlsx" decompress="true" /> + </test> + <test expect_num_outputs="3"> + <param name="input_cluster" value="test2_clusters.txt" /> + <param name="input_annotation" value="test2_annotations.xlsx" /> + <section name="output_options"> + <param name="count_output" value="true" /> + <param name="taxa_output" value="true" /> + <param name="show_all" value="true" /> + <param name="show_calculated" value="false" /> + </section> + <section name="taxa_params"> + <param name="uncertain_taxa_use_ratio" value="0.2" /> + <param name="min_to_split" value="0.1" /> + <param 
name="min_count_to_split" value="3" /> + <param name="min_cluster_support" value="4" /> + </section> + <section name="plot_params"> + <param name="simi_plot_y_min" value="95" /> + <param name="simi_plot_y_max" value="100" /> + </section> + <output name="log_file" file="test3_logs.txt"/> + <output name="cluster_count" file="test3_summary.txt" /> + <output name="taxa_excel" file="test3_un_report.xlsx" decompress="true" /> + </test> + </tests> + + <help><![CDATA[ +**CD-HIT Cluster Analysis** + +This tool analyzes CD-HIT clustering output together with an annotation Excel file, +producing similarity statistics, count summaries, and taxonomic assignments. + +**Input Files:** + +1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences. + +2. **Annotation file (.xlsx)**: Required. Excel file containing the sequence annotations per header. + +**Output Options:** + +- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions +- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster +- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster + - **Raw_Taxa_Clusters** — all annotations per cluster + - **Processed_Taxa_Clusters** — Annotations per cluster after weighted LCA + +**Parameters:** + +- **Plot Parameters**: Control the Y-axis range (minimum and maximum) of the similarity plot +- **Taxonomic Analysis Parameters**: Control when clusters are valid and when clusters are split + +**Output Files:** + +- **Similarity data**: Tab-separated file with similarity statistics +- **Similarity plot**: PNG image showing similarity distribution across clusters +- **Count summary**: Tab-separated file with read counts per cluster +- **Taxon output per cluster**: Excel file showing all taxa found in each cluster +- **Log file**: Contains cluster statistics and error 
logs + +**Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". + +------------- + +.. class:: infomark + +**Credits** + +Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter, +Developed for the New light on old remedies project, a PhD research by Anja Fischer. + +Link to the project website: + +* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html + + ]]></help> + <creator> + <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" /> + <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/> + <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" /> + </creator> </tool> \ No newline at end of file
