comparison cdhit_analysis.xml @ 1:ff68835adb2b draft

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit d771f9fbfd42bcdeda1623d954550882a0863847-dirty
author onnodg
date Mon, 20 Oct 2025 12:27:31 +0000
parents 00d56396b32a
children 706b7acdb230
comparison
equal deleted inserted replaced
0:00d56396b32a 1:ff68835adb2b
1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.0"> 1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.1">
2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description> 2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description>
3 3
4 <requirements> 4 <requirements>
5 <requirement type="package" version="3.12.3">python</requirement> 5 <requirement type="package" version="3.12.3">python</requirement>
6 <requirement type="package" version="3.10.6">matplotlib</requirement> 6 <requirement type="package" version="3.10.6">matplotlib</requirement>
12 python '$__tool_directory__/cdhit_analysis.py' 12 python '$__tool_directory__/cdhit_analysis.py'
13 --input_cluster '$input_cluster' 13 --input_cluster '$input_cluster'
14 --input_annotation '$input_annotation' 14 --input_annotation '$input_annotation'
15 15
16 #if $output_options.similarity_output: 16 #if $output_options.similarity_output:
17 --output_similarity_txt '$output_similarity_txt' 17 --output_similarity_txt '$similarity_txt'
18 --output_similarity_plot '$output_similarity_plot' 18 --output_similarity_plot '$similarity_plot'
19 #end if 19 #end if
20 #if $output_options.evalue_output: 20 #if $output_options.evalue_output:
21 --output_evalue_txt '$output_evalue_txt' 21 --output_evalue_txt '$evalue_txt'
22 --output_evalue_plot '$output_evalue_plot' 22 --output_evalue_plot '$evalue_plot'
23 #end if 23 #end if
24 #if $output_options.count_output: 24 #if $output_options.count_output:
25 --output_count '$output_count' 25 --output_count '$cluster_count'
26 #end if 26 #end if
27 #if $output_options.taxa_output: 27 #if $output_options.taxa_output:
28 --output_taxa_clusters '$output_taxa_clusters' 28 --output_taxa_clusters '$cluster_taxa'
29 --output_taxa_processed '$output_taxa_processed' 29 --output_taxa_processed '$processed_taxa'
30 #end if 30 #end if
31 31
32 --simi_plot_y_min '$plot_params.simi_plot_y_min' 32 --simi_plot_y_min '$plot_params.simi_plot_y_min'
33 --simi_plot_y_max '$plot_params.simi_plot_y_max' 33 --simi_plot_y_max '$plot_params.simi_plot_y_max'
34 34
46 --print_empty_files 46 --print_empty_files
47 #end if 47 #end if
48 ]]></command> 48 ]]></command>
49 49
50 <inputs> 50 <inputs>
51 <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file (.clstr/.txt)" 51 <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file"
52 help="Output cluster file from cd-hit-est" /> 52 help="Output cluster file from cd-hit-est" />
53 <param name="input_annotation" type="data" format="xlsx" 53 <param name="input_annotation" type="data" format="xlsx"
54 label="Annotation file" 54 label="Excel Annotations file"
55 help="Excel workfile with sequence annotations (header, evalue, taxa)" /> 55 help="Excel workfile with annotations per header" />
56 56
57 <section name="output_options" title="Output Options" expanded="true"> 57 <section name="output_options" title="Output Options" expanded="true">
58 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" 58 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false"
59 checked="true" label="Create similarity output" 59 checked="true" label="Create cluster similarity output"
60 help="Generate similarity analysis and plots" /> 60 help="Generate similarity analysis and plots" />
61 <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false" 61 <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false"
62 checked="true" label="Create E-value output" 62 checked="true" label="Create cluster E-value output"
63 help="Generate E-value analysis and plots" /> 63 help="Generate E-value analysis and plots" />
64 <param name="count_output" type="boolean" truevalue="true" falsevalue="false" 64 <param name="count_output" type="boolean" truevalue="true" falsevalue="false"
65 checked="true" label="Create count output" 65 checked="true" label="Create cluster count output"
66 help="Generate read count summaries" /> 66 help="Generate read count summaries" />
67 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" 67 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false"
68 checked="true" label="Create taxa output" 68 checked="true" label="Create taxa annotations output"
69 help="Generate taxonomic analysis" /> 69 help="Generate taxonomic analysis" />
70 </section> 70 </section>
71 71
72 <section name="plot_params" title="Plot Parameters" expanded="false"> 72 <section name="plot_params" title="Plot Parameters" expanded="false">
73 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" 73 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100"
102 help="Print messages about empty annotation files" /> 102 help="Print messages about empty annotation files" />
103 </section> 103 </section>
104 </inputs> 104 </inputs>
105 105
106 <outputs> 106 <outputs>
107 <data name="output_similarity_txt" format="txt" label="Similarity data" > 107 <data name="similarity_txt" format="txt" label="Similarity data" >
108 <filter>output_options['similarity_output']</filter> 108 <filter>output_options['similarity_output']</filter>
109 </data> 109 </data>
110 110
111 <data name="output_similarity_plot" format="png" label="Similarity plot" > 111 <data name="similarity_plot" format="png" label="Similarity plot" >
112 <filter>output_options['similarity_output']</filter> 112 <filter>output_options['similarity_output']</filter>
113 </data> 113 </data>
114 114
115 <data name="output_evalue_txt" format="txt" label="E-value data" > 115 <data name="evalue_txt" format="txt" label="E-value data" >
116 <filter>output_options['evalue_output']</filter> 116 <filter>output_options['evalue_output']</filter>
117 </data> 117 </data>
118 118
119 <data name="output_evalue_plot" format="png" label="E-value plot" > 119 <data name="evalue_plot" format="png" label="E-value plot" >
120 <filter>output_options['evalue_output']</filter> 120 <filter>output_options['evalue_output']</filter>
121 </data> 121 </data>
122 122
123 <data name="output_count" format="txt" label="Count summary" > 123 <data name="cluster_count" format="txt" label="Count summary" >
124 <filter>output_options['count_output']</filter> 124 <filter>output_options['count_output']</filter>
125 </data> 125 </data>
126 126
127 <data name="output_taxa_clusters" format="xlsx" label="Raw taxa per cluster" > 127 <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster" >
128 <filter>output_options['taxa_output']</filter> 128 <filter>output_options['taxa_output']</filter>
129 </data> 129 </data>
130 130
131 <data name="output_taxa_processed" format="xlsx" label="Processed taxa" > 131 <data name="processed_taxa" format="xlsx" label="Processed taxa" >
132 <filter>output_options['taxa_output']</filter> 132 <filter>output_options['taxa_output']</filter>
133 </data> 133 </data>
134 </outputs> 134 </outputs>
135 135
136 <tests> 136 <tests>
141 <param name="similarity_output" value="true" /> 141 <param name="similarity_output" value="true" />
142 <param name="evalue_output" value="true" /> 142 <param name="evalue_output" value="true" />
143 <param name="count_output" value="true" /> 143 <param name="count_output" value="true" />
144 <param name="taxa_output" value="true" /> 144 <param name="taxa_output" value="true" />
145 </section> 145 </section>
146 <output name="output_similarity_txt" file="sim_out.txt" /> 146 <output name="similarity_txt" file="sim_out.txt" />
147 <output name="output_similarity_plot" file="sim_out.png" compare="sim_size"/> 147 <output name="similarity_plot" file="sim_out.png" compare="sim_size"/>
148 <output name="output_evalue_txt" file="evalue_out.txt" /> 148 <output name="evalue_txt" file="evalue_out.txt" />
149 <output name="output_evalue_plot" file="evalue_out.png" compare="sim_size"/> 149 <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/>
150 <output name="output_count" file="count_out.txt" /> 150 <output name="cluster_count" file="count_out.txt" />
151 <output name="output_taxa_clusters" file="taxa_out.xlsx" decompress="true"/> 151 <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/>
152 <output name="output_taxa_processed" file="processed.xlsx" decompress="true"/> 152 <output name="processed_taxa" file="processed.xlsx" decompress="true"/>
153 </test> 153 </test>
154 <test expect_num_outputs="7"> 154 <test expect_num_outputs="7">
155 <param name="input_cluster" value="input2_test.clstr.txt" /> 155 <param name="input_cluster" value="input2_test.clstr.txt" />
156 <param name="input_annotation" value="header_anno_excel.xlsx" /> 156 <param name="input_annotation" value="header_anno_excel.xlsx" />
157 <section name="output_options"> 157 <section name="output_options">
158 <param name="similarity_output" value="true" /> 158 <param name="similarity_output" value="true" />
159 <param name="evalue_output" value="true" /> 159 <param name="evalue_output" value="true" />
160 <param name="count_output" value="true" /> 160 <param name="count_output" value="true" />
161 <param name="taxa_output" value="true" /> 161 <param name="taxa_output" value="true" />
162 </section> 162 </section>
163 <output name="output_similarity_txt" file="test2_sim_out.txt" /> 163 <output name="similarity_txt" file="test2_sim_out.txt" />
164 <output name="output_similarity_plot" file="test2_sim_out.png" compare="sim_size"/> 164 <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/>
165 <output name="output_evalue_txt" file="test2_evalue_out.txt" /> 165 <output name="evalue_txt" file="test2_evalue_out.txt" />
166 <output name="output_evalue_plot" file="test2_evalue_out.png" compare="sim_size"/> 166 <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/>
167 <output name="output_count" file="test_2count_out.txt" /> 167 <output name="cluster_count" file="test_2count_out.txt" />
168 <output name="output_taxa_clusters" file="test_2taxa_out.xlsx" decompress="true"/> 168 <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/>
169 <output name="output_taxa_processed" file="test_2processed.xlsx" decompress="true"/> 169 <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/>
170 </test> 170 </test>
171 <test expect_num_outputs="5"> 171 <test expect_num_outputs="5">
172 <param name="input_cluster" value="input2_test.clstr.txt" /> 172 <param name="input_cluster" value="input2_test.clstr.txt" />
173 <param name="input_annotation" value="header_anno_excel.xlsx" /> 173 <param name="input_annotation" value="header_anno_excel.xlsx" />
174 <section name="output_options"> 174 <section name="output_options">
176 <param name="count_output" value="true" /> 176 <param name="count_output" value="true" />
177 <param name="taxa_output" value="true" /> 177 <param name="taxa_output" value="true" />
178 <param name="evalue_output" value="false" /> 178 <param name="evalue_output" value="false" />
179 </section> 179 </section>
180 <section name="processing_options"> 180 <section name="processing_options">
181 <param name="show_unnanotated_clusters" value="true"/> 181 <param name="show_unannotated_clusters" value="true"/>
182 <param name="make_taxa_in_cluster_split" value="true"/> 182 <param name="make_taxa_in_cluster_split" value="true"/>
183 <param name="print_empty_files" value="true"/> 183 <param name="print_empty_files" value="true"/>
184 </section> 184 </section>
185 <section name="taxa_params"> 185 <section name="taxa_params">
186 <param name="uncertain_taxa_use_ratio" value="0.6"/> 186 <param name="uncertain_taxa_use_ratio" value="0.6"/>
187 <param name="min_to_split" value="0.6"/> 187 <param name="min_to_split" value="0.6"/>
188 <param name="min_count_to_split" value="6"/> 188 <param name="min_count_to_split" value="6"/>
189 </section> 189 </section>
190 <section name="plot_params" title="Plot Parameters" expanded="false"> 190 <section name="plot_params">
191 <param name="simi_plot_y_min" value="0.4" /> 191 <param name="simi_plot_y_min" value="0.4" />
192 <param name="simi_plot_y_max" value="0.4" /> 192 <param name="simi_plot_y_max" value="0.4" />
193 </section> 193 </section>
194 <output name="output_similarity_txt" file="test2_sim_extra_out.txt" /> 194 <output name="similarity_txt" file="test2_sim_extra_out.txt" />
195 <output name="output_similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/> 195 <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/>
196 <output name="output_count" file="test_2count_extra_out.txt" /> 196 <output name="cluster_count" file="test_2count_extra_out.txt" />
197 <output name="output_taxa_clusters" file="test_2taxa_extra_out.xlsx" decompress="true"/> 197 <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/>
198 <output name="output_taxa_processed" file="test_2processed_extra.xlsx" decompress="true"/> 198 <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/>
199 </test> 199 </test>
200 </tests> 200 </tests>
201 201
202 <help><![CDATA[ 202 <help><![CDATA[
203 **CD-HIT Cluster Analysis** 203 **CD-HIT Cluster Analysis**
210 210
211 2. **Annotation file (.xlsx)**: Excel file containing sequence annotations with columns: 211 2. **Annotation file (.xlsx)**: Excel file containing sequence annotations with columns:
212 212
213 **Output Options:** 213 **Output Options:**
214 214
215 - **Similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions 215 - **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
216 - **E-value output**: Creates E-value analysis with plots and text files showing E-value distributions 216 - **Cluster E-value output**: Creates E-value analysis with plots and text files showing E-value distributions
217 - **Count output**: Creates summary tables with annotated/unannotated read counts per cluster 217 - **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
218 - **Taxa output**: Creates taxonomic analysis determining the most likely taxa for each cluster 218 - **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster
219 219
220 **Parameters:** 220 **Parameters:**
221 221
222 - **Plot Parameters**: Control the size of similarity plots (X and Y-axis limits) 222 - **Plot Parameters**: Control the size of similarity plots (X and Y-axis limits)
223 - **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split 223 - **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split
233 - **Raw taxa per cluster**: Excel file showing all taxa found in each cluster 233 - **Raw taxa per cluster**: Excel file showing all taxa found in each cluster
234 - **Processed taxa**: Excel file with clusters where a taxon was assigned 234 - **Processed taxa**: Excel file with clusters where a taxon was assigned
235 235
236 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". 236 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".
237 237
238 -------------
239
240 .. class:: infomark
241
238 **Credits** 242 **Credits**
239 Authors = Onno de Gorter, 2025. 243
240 Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter, 244 Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter,
241 Developed for the New light on old remedies project, a PhD research by Anja Fischer 245 developed for the New Lights on Old Remedies project, a PhD research project by Anja Fischer.
246
247 Link to the project website:
248
249 * https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html
250
242 ]]></help> 251 ]]></help>
252 <creator>
253 <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" />
254 <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/>
255 <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" />
256 </creator>
243 </tool> 257 </tool>