comparison cdhit_analysis.xml @ 4:e64af72e1b8f draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:44:40 +0000
parents c6981ea453ae
children
comparison
equal deleted inserted replaced
3:c6981ea453ae 4:e64af72e1b8f
1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.2"> 1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="2.0.0">
2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description> 2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description>
3 3
4 <requirements> 4 <requirements>
5 <requirement type="package" version="3.12.3">python</requirement> 5 <requirement type="package" version="3.12.3">python</requirement>
6 <requirement type="package" version="3.10.6">matplotlib</requirement> 6 <requirement type="package" version="3.10.6">matplotlib</requirement>
10 10
11 <command detect_errors="exit_code"><![CDATA[ 11 <command detect_errors="exit_code"><![CDATA[
12 bash '$__tool_directory__/cdhit_analysis.sh' 12 bash '$__tool_directory__/cdhit_analysis.sh'
13 --input_cluster '$input_cluster' 13 --input_cluster '$input_cluster'
14 --input_annotation '$input_annotation' 14 --input_annotation '$input_annotation'
15
16 #if $output_options.similarity_output: 15 #if $output_options.similarity_output:
17 --output_similarity_txt '$similarity_txt' 16 --output_similarity_txt '$similarity_txt'
18 --output_similarity_plot '$similarity_plot' 17 --output_similarity_plot '$similarity_plot'
19 #end if 18 #end if
20 19
21 #if $output_options.evalue_output:
22 --output_evalue_txt '$evalue_txt'
23 --output_evalue_plot '$evalue_plot'
24 #end if
25
26 #if $output_options.count_output: 20 #if $output_options.count_output:
27 --output_count '$cluster_count' 21 --output_count '$cluster_count'
28 #end if 22 #end if
29 23
30 #if $output_options.taxa_output: 24 #if $output_options.taxa_output:
31 --output_taxa_clusters '$cluster_taxa' 25 --output_excel '$taxa_excel'
32 --output_taxa_processed '$processed_taxa' 26 #if $output_options.show_all:
27 --output_taxa_clusters
28 #end if
29 #if $output_options.show_calculated:
30 --output_taxa_processed
31 #end if
33 #end if 32 #end if
33 --log_file '$log_file'
34 34
35 --simi_plot_y_min '$plot_params.simi_plot_y_min' 35 --simi_plot_y_min '$plot_params.simi_plot_y_min'
36 --simi_plot_y_max '$plot_params.simi_plot_y_max' 36 --simi_plot_y_max '$plot_params.simi_plot_y_max'
37 37 --min_cluster_support '$taxa_params.min_cluster_support'
38 --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio' 38 --uncertain_taxa_use_ratio '$taxa_params.uncertain_taxa_use_ratio'
39 --min_to_split '$taxa_params.min_to_split' 39 --min_to_split '$taxa_params.min_to_split'
40 --min_count_to_split '$taxa_params.min_count_to_split' 40 --min_count_to_split '$taxa_params.min_count_to_split'
41 41
42 #if $processing_options.show_unannotated_clusters:
43 --show_unannotated_clusters
44 #end if
45
46 #if $processing_options.make_taxa_in_cluster_split:
47 --make_taxa_in_cluster_split
48 #end if
49
50 #if $processing_options.print_empty_files:
51 --print_empty_files
52 #end if
53 ]]></command> 42 ]]></command>
54 43
55 <inputs> 44 <inputs>
56 <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file" 45 <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file"
57 help="Output cluster file from cd-hit-est" /> 46 help="Output cluster file from cd-hit-est" />
59 label="Excel Annotations file" 48 label="Excel Annotations file"
60 help="Excel workfile with annotations per header" /> 49 help="Excel workfile with annotations per header" />
61 50
62 <section name="output_options" title="Output Options" expanded="true"> 51 <section name="output_options" title="Output Options" expanded="true">
63 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" 52 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false"
64 checked="true" label="Create cluster similarity output" 53 checked="false" label="Create cluster similarity output"
65 help="Generate similarity analysis and plots" /> 54 help="Generate similarity analysis and plots" />
66 <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false"
67 checked="true" label="Create cluster E-value output"
68 help="Generate E-value analysis and plots" />
69 <param name="count_output" type="boolean" truevalue="true" falsevalue="false" 55 <param name="count_output" type="boolean" truevalue="true" falsevalue="false"
70 checked="true" label="Create cluster count output" 56 checked="false" label="Create cluster count output"
71 help="Generate read count summaries" /> 57 help="Generate read count summaries" />
72 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" 58 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false"
73 checked="true" label="Create taxa annotations output" 59 checked="false" label="Create taxa annotations output"
74 help="Generate taxonomic analysis" /> 60 help="Generate taxonomic analysis" />
61 <param name="show_all" type="boolean" truevalue="true" falsevalue="false"
62 checked="false" label="Show all annotations per cluster"
63 help="Output all annotations found per cluster in the excel file" />
64 <param name="show_calculated" type="boolean" truevalue="true" falsevalue="false"
65 checked="false" label="Show calculated annotations per cluster"
66 help="Output calculated annotations per cluster in the excel file" />
75 </section> 67 </section>
76 68
77 <section name="plot_params" title="Plot Parameters" expanded="false"> 69 <section name="plot_params" title="Plot Parameters" expanded="false">
78 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" 70 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100"
79 label="Similarity plot Y-axis minimum" 71 label="Similarity plot Y-axis minimum"
85 77
86 <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false"> 78 <section name="taxa_params" title="Taxonomic Analysis Parameters" expanded="false">
87 <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1" 79 <param name="uncertain_taxa_use_ratio" type="float" value="0.5" min="0" max="1"
88 label="Uncertain taxa ratio" 80 label="Uncertain taxa ratio"
89 help="Ratio at which uncertain taxa count toward the correct taxa" /> 81 help="Ratio at which uncertain taxa count toward the correct taxa" />
90 <param name="min_to_split" type="float" value="0.45" min="0" max="1" 82 <param name="min_to_split" type="float" value="0.45" min="0" max="0.5"
91 label="Minimum percentage to split" 83 label="Minimum percentage to split"
92 help="Minimum percentage for taxonomic split" /> 84 help="Minimum percentage the second most abundant taxon has to be for taxonomic split" />
93 <param name="min_count_to_split" type="integer" value="10" min="1" 85 <param name="min_count_to_split" type="integer" value="10" min="1"
94 label="Minimum count to split" 86 label="Minimum count to split"
95 help="Minimum count for taxonomic split" /> 87 help="Minimum count for taxonomic split within clusters" />
96 </section> 88 <param name="min_cluster_support" type="integer" value="1" min="1"
97 89 label="Minimum cluster size"
98 <section name="processing_options" title="Processing Options" expanded="false"> 90 help="Clusters are ignored if they are smaller than this number" />
99 <param name="show_unannotated_clusters" type="boolean" truevalue="true" falsevalue="false"
100 checked="false" label="Show unannotated clusters"
101 help="Include unannotated clusters in output" />
102 <param name="make_taxa_in_cluster_split" type="boolean" truevalue="true" falsevalue="false"
103 checked="false" label="Split clusters with multiple taxa"
104 help="Split clusters containing multiple taxa instead of marking as uncertain" />
105 <param name="print_empty_files" type="boolean" truevalue="true" falsevalue="false"
106 checked="false" label="Print empty file messages"
107 help="Print messages about empty annotation files" />
108 </section> 91 </section>
109 </inputs> 92 </inputs>
110 93
111 <outputs> 94 <outputs>
112 <data name="similarity_txt" format="txt" label="Similarity data" > 95 <data name="similarity_txt" format="txt" label="Similarity data" >
113 <filter>output_options['similarity_output']</filter> 96 <filter>output_options['similarity_output']</filter>
114 </data> 97 </data>
115
116 <data name="similarity_plot" format="png" label="Similarity plot" > 98 <data name="similarity_plot" format="png" label="Similarity plot" >
117 <filter>output_options['similarity_output']</filter> 99 <filter>output_options['similarity_output']</filter>
118 </data> 100 </data>
119
120 <data name="evalue_txt" format="txt" label="E-value data" >
121 <filter>output_options['evalue_output']</filter>
122 </data>
123
124 <data name="evalue_plot" format="png" label="E-value plot" >
125 <filter>output_options['evalue_output']</filter>
126 </data>
127
128 <data name="cluster_count" format="txt" label="Count summary" > 101 <data name="cluster_count" format="txt" label="Count summary" >
129 <filter>output_options['count_output']</filter> 102 <filter>output_options['count_output']</filter>
130 </data> 103 </data>
131 104 <data name="taxa_excel" format="xlsx" label="Taxon output per cluster" >
132 <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster" >
133 <filter>output_options['taxa_output']</filter> 105 <filter>output_options['taxa_output']</filter>
134 </data> 106 </data>
135 107 <data name="log_file" format="txt" label="Log file"/>
136 <data name="processed_taxa" format="xlsx" label="Processed taxa" >
137 <filter>output_options['taxa_output']</filter>
138 </data>
139 </outputs> 108 </outputs>
140 109
141 <tests> 110 <tests>
142 <test expect_num_outputs="7">
143 <param name="input_cluster" value="29-test.clstr.txt" />
144 <param name="input_annotation" value="header_anno_29_test.xlsx" />
145 <section name="output_options">
146 <param name="similarity_output" value="true" />
147 <param name="evalue_output" value="true" />
148 <param name="count_output" value="true" />
149 <param name="taxa_output" value="true" />
150 </section>
151 <output name="similarity_txt" file="sim_out.txt" />
152 <output name="similarity_plot" file="sim_out.png" compare="sim_size"/>
153 <output name="evalue_txt" file="evalue_out.txt" />
154 <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/>
155 <output name="cluster_count" file="count_out.txt" />
156 <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/>
157 <output name="processed_taxa" file="processed.xlsx" decompress="true"/>
158 </test>
159 <test expect_num_outputs="7">
160 <param name="input_cluster" value="input2_test.clstr.txt" />
161 <param name="input_annotation" value="header_anno_excel.xlsx" />
162 <section name="output_options">
163 <param name="similarity_output" value="true" />
164 <param name="evalue_output" value="true" />
165 <param name="count_output" value="true" />
166 <param name="taxa_output" value="true" />
167 </section>
168 <output name="similarity_txt" file="test2_sim_out.txt" />
169 <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/>
170 <output name="evalue_txt" file="test2_evalue_out.txt" />
171 <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/>
172 <output name="cluster_count" file="test_2count_out.txt" />
173 <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/>
174 <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/>
175 </test>
176 <test expect_num_outputs="5"> 111 <test expect_num_outputs="5">
177 <param name="input_cluster" value="input2_test.clstr.txt" /> 112 <param name="input_cluster" value="prev_anno.txt" />
178 <param name="input_annotation" value="header_anno_excel.xlsx" /> 113 <param name="input_annotation" value="prev4.xlsx" />
179 <section name="output_options"> 114 <section name="output_options">
180 <param name="similarity_output" value="true" /> 115 <param name="similarity_output" value="true" />
181 <param name="count_output" value="true" /> 116 <param name="count_output" value="true" />
182 <param name="taxa_output" value="true" /> 117 <param name="taxa_output" value="true" />
183 <param name="evalue_output" value="false" /> 118 <param name="show_all" value="true" />
184 </section> 119 <param name="show_calculated" value="true" />
185 <section name="processing_options">
186 <param name="show_unannotated_clusters" value="true"/>
187 <param name="make_taxa_in_cluster_split" value="true"/>
188 <param name="print_empty_files" value="true"/>
189 </section> 120 </section>
190 <section name="taxa_params"> 121 <section name="taxa_params">
191 <param name="uncertain_taxa_use_ratio" value="0.6"/> 122 <param name="uncertain_taxa_use_ratio" value="0.5" />
192 <param name="min_to_split" value="0.6"/> 123 <param name="min_to_split" value="0.45" />
193 <param name="min_count_to_split" value="6"/> 124 <param name="min_count_to_split" value="10" />
125 <param name="min_cluster_support" value="1" />
194 </section> 126 </section>
195 <section name="plot_params"> 127 <section name="plot_params">
196 <param name="simi_plot_y_min" value="0.4" /> 128 <param name="simi_plot_y_min" value="95" />
197 <param name="simi_plot_y_max" value="0.4" /> 129 <param name="simi_plot_y_max" value="100" />
198 </section> 130 </section>
199 <output name="similarity_txt" file="test2_sim_extra_out.txt" /> 131 <output name="log_file" file="test1_logs.txt"/>
200 <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/> 132 <output name="similarity_txt" file="test1_similarity.txt" />
201 <output name="cluster_count" file="test_2count_extra_out.txt" /> 133 <output name="similarity_plot" file="test1_similarity.png" compare="sim_size" />
202 <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/> 134 <output name="cluster_count" file="test1_summary.txt" />
203 <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/> 135 <output name="taxa_excel" file="test1_un_report.xlsx" decompress="true" />
136 </test>
137 <test expect_num_outputs="5">
138 <param name="input_cluster" value="test2_clusters.txt" />
139 <param name="input_annotation" value="test2_annotations.xlsx" />
140 <section name="output_options">
141 <param name="similarity_output" value="true" />
142 <param name="count_output" value="true" />
143 <param name="taxa_output" value="true" />
144 <param name="show_all" value="true" />
145 <param name="show_calculated" value="true" />
146 </section>
147 <section name="taxa_params">
148 <param name="uncertain_taxa_use_ratio" value="0.5" />
149 <param name="min_to_split" value="0.45" />
150 <param name="min_count_to_split" value="10" />
151 <param name="min_cluster_support" value="1" />
152 </section>
153 <section name="plot_params">
154 <param name="simi_plot_y_min" value="95" />
155 <param name="simi_plot_y_max" value="100" />
156 </section>
157 <output name="log_file" file="test2_logs.txt"/>
158 <output name="similarity_txt" file="test2_similarity.txt" />
159 <output name="similarity_plot" file="test2_similarity.png" compare="sim_size" />
160 <output name="cluster_count" file="test2_summary.txt" />
161 <output name="taxa_excel" file="test2_un_report.xlsx" decompress="true" />
162 </test>
163 <test expect_num_outputs="3">
164 <param name="input_cluster" value="test2_clusters.txt" />
165 <param name="input_annotation" value="test2_annotations.xlsx" />
166 <section name="output_options">
167 <param name="count_output" value="true" />
168 <param name="taxa_output" value="true" />
169 <param name="show_all" value="true" />
170 <param name="show_calculated" value="false" />
171 </section>
172 <section name="taxa_params">
173 <param name="uncertain_taxa_use_ratio" value="0.2" />
174 <param name="min_to_split" value="0.1" />
175 <param name="min_count_to_split" value="3" />
176 <param name="min_cluster_support" value="4" />
177 </section>
178 <section name="plot_params">
179 <param name="simi_plot_y_min" value="95" />
180 <param name="simi_plot_y_max" value="100" />
181 </section>
182 <output name="log_file" file="test3_logs.txt"/>
183 <output name="cluster_count" file="test3_summary.txt" />
184 <output name="taxa_excel" file="test3_un_report.xlsx" decompress="true" />
204 </test> 185 </test>
205 </tests> 186 </tests>
206 187
207 <help><![CDATA[ 188 <help><![CDATA[
208 **CD-HIT Cluster Analysis** 189 **CD-HIT Cluster Analysis**
209 190
210 This tool analyzes CD-HIT clustering results and provides various outputs including taxonomic analysis, similarity analysis, E-value analysis, and read count summaries. 191 This tool analyzes CD-HIT clustering output together with an annotation Excel file,
192 producing similarity statistics, count summaries, and taxonomic assignments.
211 193
212 **Input Files:** 194 **Input Files:**
213 195
214 1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences. 196 1. **CD-HIT cluster file (.txt/.clstr)**: Required. The cluster file output from cd-hit-est containing clustered sequences.
215 197
216 2. **Annotation file (.xlsx)**: Excel workbook containing the annotations for each sequence header. 198 2. **Annotation file (.xlsx)**: Excel workbook containing the annotations for each sequence header.
217 199
218 **Output Options:** 200 **Output Options:**
219 201
220 - **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions 202 - **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
221 - **Cluster e-value output**: Creates E-value analysis with plots and text files showing E-value distributions
222 - **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster 203 - **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
223 - **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster 204 - **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster
205 - **Raw_Taxa_Clusters** — all annotations per cluster
206 - **Processed_Taxa_Clusters** — Annotations per cluster after weighted LCA
224 207
225 **Parameters:** 208 **Parameters:**
226 209
227 - **Plot Parameters**: Control the Y-axis range (minimum and maximum) of the similarity plot 210 - **Plot Parameters**: Control the Y-axis range (minimum and maximum) of the similarity plot
228 - **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split 211 - **Taxonomic Analysis Parameters**: Control when clusters are valid and when clusters are split
229 - **Processing Options**: Control display of unannotated clusters and verbose output
230 212
231 **Output Files:** 213 **Output Files:**
232 214
233 - **Similarity data**: Tab-separated file with similarity statistics 215 - **Similarity data**: Tab-separated file with similarity statistics
234 - **Similarity plot**: PNG image showing similarity distribution across clusters 216 - **Similarity plot**: PNG image showing similarity distribution across clusters
235 - **E-value data**: Tab-separated file with E-value statistics
236 - **E-value plot**: PNG image showing E-value distribution
237 - **Count summary**: Tab-separated file with read counts per cluster 217 - **Count summary**: Tab-separated file with read counts per cluster
238 - **Raw taxa per cluster**: Excel file showing all taxa found in each cluster 218 - **Taxon output per cluster**: Excel file showing all taxa found in each cluster
239 - **Processed taxa**: Excel file with clusters where a taxon was assigned 219 - **Log file**: Contains cluster statistics and error logs
240 220
241 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". 221 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".
242 222
243 ------------- 223 -------------
244 224