Mercurial > repos > onnodg > cdhit_analysis
view cdhit_analysis.xml @ 3:c6981ea453ae draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit ef31054ae26e19eff2f1b1f6c7979e39c47c0d5b-dirty
| author | onnodg |
|---|---|
| date | Fri, 24 Oct 2025 09:38:24 +0000 |
| parents | 706b7acdb230 |
| children |
line wrap: on
line source
<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.2">
    <description>Analyze CD-HIT clustering results with taxonomic annotation</description>

    <!-- Packages (with pinned versions) this tool depends on. -->
    <requirements>
        <requirement type="package" version="3.12.3">python</requirement>
        <requirement type="package" version="3.10.6">matplotlib</requirement>
        <requirement type="package" version="2.3.2">pandas</requirement>
        <requirement type="package" version="3.1.5">openpyxl</requirement>
    </requirements>

    <!--
        Runs cdhit_analysis.sh from the tool directory.
        Output paths are only passed for the output groups that are enabled in
        the "output_options" section; the same booleans drive the filters in
        the outputs section below, so both must be kept in sync.
        The three "processing_options" booleans are forwarded as bare flags
        when checked. A non-zero exit code marks the job as failed.
    -->
    <command detect_errors="exit_code"><![CDATA[
        bash '$__tool_directory__/cdhit_analysis.sh'
            --input_cluster '$input_cluster'
            --input_annotation '$input_annotation'
            #if $output_options.similarity_output:
                --output_similarity_txt '$similarity_txt'
                --output_similarity_plot '$similarity_plot'
            #end if
            #if $output_options.evalue_output:
                --output_evalue_txt '$evalue_txt'
                --output_evalue_plot '$evalue_plot'
            #end if
            #if $output_options.count_output:
                --output_count '$cluster_count'
            #end if
            #if $output_options.taxa_output:
                --output_taxa_clusters '$cluster_taxa'
                --output_taxa_processed '$processed_taxa'
            #end if
            --simi_plot_y_min '$plot_params.simi_plot_y_min'
            --simi_plot_y_max '$plot_params.simi_plot_y_max'
            --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio'
            --min_to_split '$taxa_params.min_to_split'
            --min_count_to_split '$taxa_params.min_count_to_split'
            #if $processing_options.show_unannotated_clusters:
                --show_unannotated_clusters
            #end if
            #if $processing_options.make_taxa_in_cluster_split:
                --make_taxa_in_cluster_split
            #end if
            #if $processing_options.print_empty_files:
                --print_empty_files
            #end if
    ]]></command>

    <inputs>
        <!-- Both input datasets are mandatory (neither is marked optional). -->
        <param name="input_cluster" type="data" format="txt"
               label="CD-HIT cluster file"
               help="Output cluster file from cd-hit-est" />
        <param name="input_annotation" type="data" format="xlsx"
               label="Excel Annotations file"
               help="Excel workfile with annotations per header" />

        <!-- Each switch enables one group of outputs; all are on by default. -->
        <section name="output_options" title="Output Options" expanded="true">
            <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" checked="true"
                   label="Create cluster similarity output"
                   help="Generate similarity analysis and plots" />
            <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false" checked="true"
                   label="Create cluster E-value output"
                   help="Generate E-value analysis and plots" />
            <param name="count_output" type="boolean" truevalue="true" falsevalue="false" checked="true"
                   label="Create cluster count output"
                   help="Generate read count summaries" />
            <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" checked="true"
                   label="Create taxa annotations output"
                   help="Generate taxonomic analysis" />
        </section>

        <!-- Only the Y-axis range of the similarity plot is configurable. -->
        <section name="plot_params" title="Plot Parameters" expanded="false">
            <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100"
                   label="Similarity plot Y-axis minimum"
                   help="Minimum value for similarity plot Y-axis" />
            <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100"
                   label="Similarity plot Y-axis maximum"
                   help="Maximum value for similarity plot Y-axis" />
        </section>

        <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false">
            <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1"
                   label="Uncertain taxa ratio"
                   help="Ratio at which uncertain taxa count toward the correct taxa" />
            <param name="min_to_split" type="float" value="0.45" min="0" max="1"
                   label="Minimum percentage to split"
                   help="Minimum percentage for taxonomic split, given as a fraction between 0 and 1" />
            <param name="min_count_to_split" type="integer" value="10" min="1"
                   label="Minimum count to split"
                   help="Minimum count for taxonomic split" />
        </section>

        <!-- Optional behaviour switches; all are off by default. -->
        <section name="processing_options" title="Processing Options" expanded="false">
            <param name="show_unannotated_clusters" type="boolean" truevalue="true" falsevalue="false" checked="false"
                   label="Show unannotated clusters"
                   help="Include unannotated clusters in output" />
            <param name="make_taxa_in_cluster_split" type="boolean" truevalue="true" falsevalue="false" checked="false"
                   label="Split clusters with multiple taxa"
                   help="Split clusters containing multiple taxa instead of marking as uncertain" />
            <param name="print_empty_files" type="boolean" truevalue="true" falsevalue="false" checked="false"
                   label="Print empty file messages"
                   help="Print messages about empty annotation files" />
        </section>
    </inputs>

    <!--
        Every dataset is filtered on the output_options switch that also
        controls whether its path is passed on the command line.
    -->
    <outputs>
        <data name="similarity_txt" format="txt" label="Similarity data">
            <filter>output_options['similarity_output']</filter>
        </data>
        <data name="similarity_plot" format="png" label="Similarity plot">
            <filter>output_options['similarity_output']</filter>
        </data>
        <data name="evalue_txt" format="txt" label="E-value data">
            <filter>output_options['evalue_output']</filter>
        </data>
        <data name="evalue_plot" format="png" label="E-value plot">
            <filter>output_options['evalue_output']</filter>
        </data>
        <data name="cluster_count" format="txt" label="Count summary">
            <filter>output_options['count_output']</filter>
        </data>
        <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster">
            <filter>output_options['taxa_output']</filter>
        </data>
        <data name="processed_taxa" format="xlsx" label="Processed taxa">
            <filter>output_options['taxa_output']</filter>
        </data>
    </outputs>

    <tests>
        <!-- Test 1: all output groups enabled, default parameters, first data set. -->
        <test expect_num_outputs="7">
            <param name="input_cluster" value="29-test.clstr.txt" />
            <param name="input_annotation" value="header_anno_29_test.xlsx" />
            <section name="output_options">
                <param name="similarity_output" value="true" />
                <param name="evalue_output" value="true" />
                <param name="count_output" value="true" />
                <param name="taxa_output" value="true" />
            </section>
            <output name="similarity_txt" file="sim_out.txt" />
            <output name="similarity_plot" file="sim_out.png" compare="sim_size"/>
            <output name="evalue_txt" file="evalue_out.txt" />
            <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/>
            <output name="cluster_count" file="count_out.txt" />
            <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/>
            <output name="processed_taxa" file="processed.xlsx" decompress="true"/>
        </test>

        <!-- Test 2: all output groups enabled, default parameters, second data set. -->
        <test expect_num_outputs="7">
            <param name="input_cluster" value="input2_test.clstr.txt" />
            <param name="input_annotation" value="header_anno_excel.xlsx" />
            <section name="output_options">
                <param name="similarity_output" value="true" />
                <param name="evalue_output" value="true" />
                <param name="count_output" value="true" />
                <param name="taxa_output" value="true" />
            </section>
            <output name="similarity_txt" file="test2_sim_out.txt" />
            <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/>
            <output name="evalue_txt" file="test2_evalue_out.txt" />
            <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/>
            <output name="cluster_count" file="test_2count_out.txt" />
            <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/>
            <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/>
        </test>

        <!--
            Test 3: E-value output disabled (hence 5 outputs), all processing
            flags on, and non-default taxa and plot parameters.
        -->
        <test expect_num_outputs="5">
            <param name="input_cluster" value="input2_test.clstr.txt" />
            <param name="input_annotation" value="header_anno_excel.xlsx" />
            <section name="output_options">
                <param name="similarity_output" value="true" />
                <param name="count_output" value="true" />
                <param name="taxa_output" value="true" />
                <param name="evalue_output" value="false" />
            </section>
            <section name="processing_options">
                <param name="show_unannotated_clusters" value="true"/>
                <param name="make_taxa_in_cluster_split" value="true"/>
                <param name="print_empty_files" value="true"/>
            </section>
            <section name="taxa_params">
                <param name="uncertain_taxa_use_ratio" value="0.6"/>
                <param name="min_to_split" value="0.6"/>
                <param name="min_count_to_split" value="6"/>
            </section>
            <!-- NOTE(review): Y-axis minimum and maximum are both 0.4 here; confirm this is intended. -->
            <section name="plot_params">
                <param name="simi_plot_y_min" value="0.4" />
                <param name="simi_plot_y_max" value="0.4" />
            </section>
            <output name="similarity_txt" file="test2_sim_extra_out.txt" />
            <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/>
            <output name="cluster_count" file="test_2count_extra_out.txt" />
            <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/>
            <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/>
        </test>
    </tests>

    <help><![CDATA[
**CD-HIT Cluster Analysis**

This tool analyzes CD-HIT clustering results and provides various outputs including taxonomic analysis, similarity analysis, E-value analysis, and read count summaries.

**Input Files:**

1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences.
2. **Annotation file (.xlsx)**: Required. Excel workbook containing the annotations per sequence header.

**Output Options:**

- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
- **Cluster E-value output**: Creates E-value analysis with plots and text files showing E-value distributions
- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster

**Parameters:**

- **Plot Parameters**: Control the Y-axis limits (minimum and maximum) of the similarity plot
- **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split
- **Processing Options**: Control display of unannotated clusters, splitting of clusters with multiple taxa, and verbose output

**Output Files:**

- **Similarity data**: Tab-separated file with similarity statistics
- **Similarity plot**: PNG image showing similarity distribution across clusters
- **E-value data**: Tab-separated file with E-value statistics
- **E-value plot**: PNG image showing E-value distribution
- **Count summary**: Tab-separated file with read counts per cluster
- **Raw taxa per cluster**: Excel file showing all taxa found in each cluster
- **Processed taxa**: Excel file with clusters where a taxon was assigned

**Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".

-------------

.. class:: infomark

**Credits**

Based on a script by Nick Kortleven; translated, modified and wrapped by Onno de Gorter.
Developed for the New Light on Old Remedies project, a PhD research project by Anja Fischer.

Link to the project website:

* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html
    ]]></help>

    <creator>
        <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" />
        <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/>
        <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" />
    </creator>
</tool>
