Mercurial > repos > onnodg > cdhit_analysis
comparison cdhit_analysis.xml @ 0:00d56396b32a draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:09:46 +0000 |
| parents | |
| children | ff68835adb2b |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:00d56396b32a |
|---|---|
| 1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.0"> | |
| 2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description> | |
| 3 | |
| 4 <!-- Pinned package dependencies for the wrapped script: the Python interpreter plus matplotlib, pandas and openpyxl, each at an exact version. --> <requirements> | |
| 5 <requirement type="package" version="3.12.3">python</requirement> | |
| 6 <requirement type="package" version="3.10.6">matplotlib</requirement> | |
| 7 <requirement type="package" version="2.3.2">pandas</requirement> | |
| 8 <requirement type="package" version="3.1.5">openpyxl</requirement> | |
| 9 </requirements> | |
| 10 | |
| 11 <!-- Runs cdhit_analysis.py from the tool directory with both input datasets. Output path arguments are passed only for the groups enabled in output_options; plot and taxa thresholds are always passed; each processing option is passed as a bare flag when checked. Failure is detected from the exit code. --> <command detect_errors="exit_code"><![CDATA[ | |
| 12 python '$__tool_directory__/cdhit_analysis.py' | |
| 13 --input_cluster '$input_cluster' | |
| 14 --input_annotation '$input_annotation' | |
| 15 | |
| 16 #if $output_options.similarity_output: | |
| 17 --output_similarity_txt '$output_similarity_txt' | |
| 18 --output_similarity_plot '$output_similarity_plot' | |
| 19 #end if | |
| 20 #if $output_options.evalue_output: | |
| 21 --output_evalue_txt '$output_evalue_txt' | |
| 22 --output_evalue_plot '$output_evalue_plot' | |
| 23 #end if | |
| 24 #if $output_options.count_output: | |
| 25 --output_count '$output_count' | |
| 26 #end if | |
| 27 #if $output_options.taxa_output: | |
| 28 --output_taxa_clusters '$output_taxa_clusters' | |
| 29 --output_taxa_processed '$output_taxa_processed' | |
| 30 #end if | |
| 31 | |
| 32 --simi_plot_y_min '$plot_params.simi_plot_y_min' | |
| 33 --simi_plot_y_max '$plot_params.simi_plot_y_max' | |
| 34 | |
| 35 --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio' | |
| 36 --min_to_split '$taxa_params.min_to_split' | |
| 37 --min_count_to_split '$taxa_params.min_count_to_split' | |
| 38 | |
| 39 #if $processing_options.show_unannotated_clusters: | |
| 40 --show_unannotated_clusters | |
| 41 #end if | |
| 42 #if $processing_options.make_taxa_in_cluster_split: | |
| 43 --make_taxa_in_cluster_split | |
| 44 #end if | |
| 45 #if $processing_options.print_empty_files: | |
| 46 --print_empty_files | |
| 47 #end if | |
| 48 ]]></command> | |
| 49 | |
| 50 <inputs> | |
| 51 <!-- Input datasets: the CD-HIT cluster file (txt datatype) and the annotation workbook (xlsx datatype). --> <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file (.clstr/.txt)" | |
| 52 help="Output cluster file from cd-hit-est" /> | |
| 53 <param name="input_annotation" type="data" format="xlsx" | |
| 54 label="Annotation file" | |
| 55 help="Excel workfile with sequence annotations (header, evalue, taxa)" /> | |
| 56 | |
| 57 <!-- One checkbox per output group, all checked by default. The same values gate the matching #if blocks in the command and the filter expressions in the outputs. --> <section name="output_options" title="Output Options" expanded="true"> | |
| 58 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" | |
| 59 checked="true" label="Create similarity output" | |
| 60 help="Generate similarity analysis and plots" /> | |
| 61 <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false" | |
| 62 checked="true" label="Create E-value output" | |
| 63 help="Generate E-value analysis and plots" /> | |
| 64 <param name="count_output" type="boolean" truevalue="true" falsevalue="false" | |
| 65 checked="true" label="Create count output" | |
| 66 help="Generate read count summaries" /> | |
| 67 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" | |
| 68 checked="true" label="Create taxa output" | |
| 69 help="Generate taxonomic analysis" /> | |
| 70 </section> | |
| 71 | |
| 72 <!-- Y-axis limits of the similarity plot, each restricted to 0 through 100, defaulting to 95 and 100. NOTE(review): nothing here enforces minimum below maximum; confirm the script handles that case. --> <section name="plot_params" title="Plot Parameters" expanded="false"> | |
| 73 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" | |
| 74 label="Similarity plot Y-axis minimum" | |
| 75 help="Minimum value for similarity plot Y-axis" /> | |
| 76 <param name="simi_plot_y_max" type="float" value="100.0" min="0" max="100" | |
| 77 label="Similarity plot Y-axis maximum" | |
| 78 help="Maximum value for similarity plot Y-axis" /> | |
| 79 </section> | |
| 80 | |
| 81 <!-- Thresholds forwarded unchanged to the script: two values limited to the range 0 through 1 and an integer count of at least 1. NOTE(review): min_to_split is labelled a percentage but is entered as a 0 to 1 value; confirm the wording against the script. --> <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false"> | |
| 82 <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1" | |
| 83 label="Uncertain taxa ratio" | |
| 84 help="Ratio at which uncertain taxa count toward the correct taxa" /> | |
| 85 <param name="min_to_split" type="float" value="0.45" min="0" max="1" | |
| 86 label="Minimum percentage to split" | |
| 87 help="Minimum percentage for taxonomic split" /> | |
| 88 <param name="min_count_to_split" type="integer" value="10" min="1" | |
| 89 label="Minimum count to split" | |
| 90 help="Minimum count for taxonomic split" /> | |
| 91 </section> | |
| 92 | |
| 93 <!-- Optional switches, all unchecked by default; each one is passed to the script as a bare flag only when checked. --> <section name="processing_options" title="Processing Options" expanded="false"> | |
| 94 <param name="show_unannotated_clusters" type="boolean" truevalue="true" falsevalue="false" | |
| 95 checked="false" label="Show unannotated clusters" | |
| 96 help="Include unannotated clusters in output" /> | |
| 97 <param name="make_taxa_in_cluster_split" type="boolean" truevalue="true" falsevalue="false" | |
| 98 checked="false" label="Split clusters with multiple taxa" | |
| 99 help="Split clusters containing multiple taxa instead of marking as uncertain" /> | |
| 100 <param name="print_empty_files" type="boolean" truevalue="true" falsevalue="false" | |
| 101 checked="false" label="Print empty file messages" | |
| 102 help="Print messages about empty annotation files" /> | |
| 103 </section> | |
| 104 </inputs> | |
| 105 | |
| 106 <!-- Seven datasets in four groups (similarity: txt and png; E-value: txt and png; count: txt; taxa: two xlsx). Each filter keeps a dataset only when the matching output_options checkbox is set. --> <outputs> | |
| 107 <data name="output_similarity_txt" format="txt" label="Similarity data" > | |
| 108 <filter>output_options['similarity_output']</filter> | |
| 109 </data> | |
| 110 | |
| 111 <data name="output_similarity_plot" format="png" label="Similarity plot" > | |
| 112 <filter>output_options['similarity_output']</filter> | |
| 113 </data> | |
| 114 | |
| 115 <data name="output_evalue_txt" format="txt" label="E-value data" > | |
| 116 <filter>output_options['evalue_output']</filter> | |
| 117 </data> | |
| 118 | |
| 119 <data name="output_evalue_plot" format="png" label="E-value plot" > | |
| 120 <filter>output_options['evalue_output']</filter> | |
| 121 </data> | |
| 122 | |
| 123 <data name="output_count" format="txt" label="Count summary" > | |
| 124 <filter>output_options['count_output']</filter> | |
| 125 </data> | |
| 126 | |
| 127 <data name="output_taxa_clusters" format="xlsx" label="Raw taxa per cluster" > | |
| 128 <filter>output_options['taxa_output']</filter> | |
| 129 </data> | |
| 130 | |
| 131 <data name="output_taxa_processed" format="xlsx" label="Processed taxa" > | |
| 132 <filter>output_options['taxa_output']</filter> | |
| 133 </data> | |
| 134 </outputs> | |
| 135 | |
| 136 <tests> | |
| 137 <!-- All four output groups enabled with otherwise default parameters, so all seven datasets are expected. Plots are compared with sim_size; the xlsx outputs set decompress="true". --> <test expect_num_outputs="7"> | |
| 138 <param name="input_cluster" value="29-test.clstr.txt" /> | |
| 139 <param name="input_annotation" value="header_anno_29_test.xlsx" /> | |
| 140 <section name="output_options"> | |
| 141 <param name="similarity_output" value="true" /> | |
| 142 <param name="evalue_output" value="true" /> | |
| 143 <param name="count_output" value="true" /> | |
| 144 <param name="taxa_output" value="true" /> | |
| 145 </section> | |
| 146 <output name="output_similarity_txt" file="sim_out.txt" /> | |
| 147 <output name="output_similarity_plot" file="sim_out.png" compare="sim_size"/> | |
| 148 <output name="output_evalue_txt" file="evalue_out.txt" /> | |
| 149 <output name="output_evalue_plot" file="evalue_out.png" compare="sim_size"/> | |
| 150 <output name="output_count" file="count_out.txt" /> | |
| 151 <output name="output_taxa_clusters" file="taxa_out.xlsx" decompress="true"/> | |
| 152 <output name="output_taxa_processed" file="processed.xlsx" decompress="true"/> | |
| 153 </test> | |
| 154 <!-- Same settings as the first test (all four output groups, default thresholds) applied to a second cluster and annotation pair; all seven datasets are expected. --> <test expect_num_outputs="7"> | |
| 155 <param name="input_cluster" value="input2_test.clstr.txt" /> | |
| 156 <param name="input_annotation" value="header_anno_excel.xlsx" /> | |
| 157 <section name="output_options"> | |
| 158 <param name="similarity_output" value="true" /> | |
| 159 <param name="evalue_output" value="true" /> | |
| 160 <param name="count_output" value="true" /> | |
| 161 <param name="taxa_output" value="true" /> | |
| 162 </section> | |
| 163 <output name="output_similarity_txt" file="test2_sim_out.txt" /> | |
| 164 <output name="output_similarity_plot" file="test2_sim_out.png" compare="sim_size"/> | |
| 165 <output name="output_evalue_txt" file="test2_evalue_out.txt" /> | |
| 166 <output name="output_evalue_plot" file="test2_evalue_out.png" compare="sim_size"/> | |
| 167 <output name="output_count" file="test_2count_out.txt" /> | |
| 168 <output name="output_taxa_clusters" file="test_2taxa_out.xlsx" decompress="true"/> | |
| 169 <output name="output_taxa_processed" file="test_2processed.xlsx" decompress="true"/> | |
| 170 </test> | |
| 171 <!-- Non-default run: E-value outputs are disabled (five datasets remain), all three processing switches are on, and custom taxa thresholds and plot limits are set. NOTE(review): the first processing switch was previously misspelled and so never applied; regenerate the expected files if they were produced without it. NOTE(review): both plot limits are 0.4, giving an empty Y range; confirm this is intended. --> <test expect_num_outputs="5"> | |
| 172 <param name="input_cluster" value="input2_test.clstr.txt" /> | |
| 173 <param name="input_annotation" value="header_anno_excel.xlsx" /> | |
| 174 <section name="output_options"> | |
| 175 <param name="similarity_output" value="true" /> | |
| 176 <param name="count_output" value="true" /> | |
| 177 <param name="taxa_output" value="true" /> | |
| 178 <param name="evalue_output" value="false" /> | |
| 179 </section> | |
| 180 <section name="processing_options"> | |
| 181 <param name="show_unannotated_clusters" value="true"/> | |
| 182 <param name="make_taxa_in_cluster_split" value="true"/> | |
| 183 <param name="print_empty_files" value="true"/> | |
| 184 </section> | |
| 185 <section name="taxa_params"> | |
| 186 <param name="uncertain_taxa_use_ratio" value="0.6"/> | |
| 187 <param name="min_to_split" value="0.6"/> | |
| 188 <param name="min_count_to_split" value="6"/> | |
| 189 </section> | |
| 190 <section name="plot_params"> | |
| 191 <param name="simi_plot_y_min" value="0.4" /> | |
| 192 <param name="simi_plot_y_max" value="0.4" /> | |
| 193 </section> | |
| 194 <output name="output_similarity_txt" file="test2_sim_extra_out.txt" /> | |
| 195 <output name="output_similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/> | |
| 196 <output name="output_count" file="test_2count_extra_out.txt" /> | |
| 197 <output name="output_taxa_clusters" file="test_2taxa_extra_out.xlsx" decompress="true"/> | |
| 198 <output name="output_taxa_processed" file="test_2processed_extra.xlsx" decompress="true"/> | |
| 199 </test> | |
| 200 </tests> | |
| 201 | |
| 202 <help><![CDATA[ | |
| 203 **CD-HIT Cluster Analysis** | |
| 204 | |
| 205 This tool analyzes CD-HIT clustering results and provides various outputs including taxonomic analysis, similarity analysis, E-value analysis, and read count summaries. | |
| 206 | |
| 207 **Input Files:** | |
| 208 | |
| 209 1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences. | |
| 210 | |
| 211 2. **Annotation file (.xlsx)**: Required. Excel workbook containing sequence annotations (header, evalue, taxa). | |
| 212 | |
| 213 **Output Options:** | |
| 214 | |
| 215 - **Similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions | |
| 216 - **E-value output**: Creates E-value analysis with plots and text files showing E-value distributions | |
| 217 - **Count output**: Creates summary tables with annotated/unannotated read counts per cluster | |
| 218 - **Taxa output**: Creates taxonomic analysis determining the most likely taxa for each cluster | |
| 219 | |
| 220 **Parameters:** | |
| 221 | |
| 222 - **Plot Parameters**: Control the Y-axis limits (minimum and maximum) of the similarity plot | |
| 223 - **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split | |
| 224 - **Processing Options**: Control display of unannotated clusters, splitting of clusters with multiple taxa, and messages about empty annotation files | |
| 225 | |
| 226 **Output Files:** | |
| 227 | |
| 228 - **Similarity data**: Tab-separated file with similarity statistics | |
| 229 - **Similarity plot**: PNG image showing similarity distribution across clusters | |
| 230 - **E-value data**: Tab-separated file with E-value statistics | |
| 231 - **E-value plot**: PNG image showing E-value distribution | |
| 232 - **Count summary**: Tab-separated file with read counts per cluster | |
| 233 - **Raw taxa per cluster**: Excel file showing all taxa found in each cluster | |
| 234 - **Processed taxa**: Excel file with clusters where a taxon was assigned | |
| 235 | |
| 236 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". | |
| 237 | |
| 238 **Credits** | |
| 239 Authors = Onno de Gorter, 2025. | |
| 240 Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter, | |
| 241 Developed for the New light on old remedies project, a PhD research by Anja Fischer | |
| 242 ]]></help> | |
| 243 </tool> |
