Mercurial > repos > onnodg > cdhit_analysis
comparison cdhit_analysis.xml @ 1:ff68835adb2b draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit d771f9fbfd42bcdeda1623d954550882a0863847-dirty
| author | onnodg |
|---|---|
| date | Mon, 20 Oct 2025 12:27:31 +0000 |
| parents | 00d56396b32a |
| children | 706b7acdb230 |
comparison
equal
deleted
inserted
replaced
| 0:00d56396b32a | 1:ff68835adb2b |
|---|---|
| 1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.0"> | 1 <tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.1"> |
| 2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description> | 2 <description>Analyze CD-HIT clustering results with taxonomic annotation</description> |
| 3 | 3 |
| 4 <requirements> | 4 <requirements> |
| 5 <requirement type="package" version="3.12.3">python</requirement> | 5 <requirement type="package" version="3.12.3">python</requirement> |
| 6 <requirement type="package" version="3.10.6">matplotlib</requirement> | 6 <requirement type="package" version="3.10.6">matplotlib</requirement> |
| 12 python '$__tool_directory__/cdhit_analysis.py' | 12 python '$__tool_directory__/cdhit_analysis.py' |
| 13 --input_cluster '$input_cluster' | 13 --input_cluster '$input_cluster' |
| 14 --input_annotation '$input_annotation' | 14 --input_annotation '$input_annotation' |
| 15 | 15 |
| 16 #if $output_options.similarity_output: | 16 #if $output_options.similarity_output: |
| 17 --output_similarity_txt '$output_similarity_txt' | 17 --output_similarity_txt '$similarity_txt' |
| 18 --output_similarity_plot '$output_similarity_plot' | 18 --output_similarity_plot '$similarity_plot' |
| 19 #end if | 19 #end if |
| 20 #if $output_options.evalue_output: | 20 #if $output_options.evalue_output: |
| 21 --output_evalue_txt '$output_evalue_txt' | 21 --output_evalue_txt '$evalue_txt' |
| 22 --output_evalue_plot '$output_evalue_plot' | 22 --output_evalue_plot '$evalue_plot' |
| 23 #end if | 23 #end if |
| 24 #if $output_options.count_output: | 24 #if $output_options.count_output: |
| 25 --output_count '$output_count' | 25 --output_count '$cluster_count' |
| 26 #end if | 26 #end if |
| 27 #if $output_options.taxa_output: | 27 #if $output_options.taxa_output: |
| 28 --output_taxa_clusters '$output_taxa_clusters' | 28 --output_taxa_clusters '$cluster_taxa' |
| 29 --output_taxa_processed '$output_taxa_processed' | 29 --output_taxa_processed '$processed_taxa' |
| 30 #end if | 30 #end if |
| 31 | 31 |
| 32 --simi_plot_y_min '$plot_params.simi_plot_y_min' | 32 --simi_plot_y_min '$plot_params.simi_plot_y_min' |
| 33 --simi_plot_y_max '$plot_params.simi_plot_y_max' | 33 --simi_plot_y_max '$plot_params.simi_plot_y_max' |
| 34 | 34 |
| 46 --print_empty_files | 46 --print_empty_files |
| 47 #end if | 47 #end if |
| 48 ]]></command> | 48 ]]></command> |
| 49 | 49 |
| 50 <inputs> | 50 <inputs> |
| 51 <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file (.clstr/.txt)" | 51 <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file" |
| 52 help="Output cluster file from cd-hit-est" /> | 52 help="Output cluster file from cd-hit-est" /> |
| 53 <param name="input_annotation" type="data" format="xlsx" | 53 <param name="input_annotation" type="data" format="xlsx" |
| 54 label="Annotation file" | 54 label="Excel Annotations file" |
| 55 help="Excel workfile with sequence annotations (header, evalue, taxa)" /> | 55 help="Excel workfile with annotations per header" /> |
| 56 | 56 |
| 57 <section name="output_options" title="Output Options" expanded="true"> | 57 <section name="output_options" title="Output Options" expanded="true"> |
| 58 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" | 58 <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false" |
| 59 checked="true" label="Create similarity output" | 59 checked="true" label="Create cluster similarity output" |
| 60 help="Generate similarity analysis and plots" /> | 60 help="Generate similarity analysis and plots" /> |
| 61 <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false" | 61 <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false" |
| 62 checked="true" label="Create E-value output" | 62 checked="true" label="Create cluster E-value output" |
| 63 help="Generate E-value analysis and plots" /> | 63 help="Generate E-value analysis and plots" /> |
| 64 <param name="count_output" type="boolean" truevalue="true" falsevalue="false" | 64 <param name="count_output" type="boolean" truevalue="true" falsevalue="false" |
| 65 checked="true" label="Create count output" | 65 checked="true" label="Create cluster count output" |
| 66 help="Generate read count summaries" /> | 66 help="Generate read count summaries" /> |
| 67 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" | 67 <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false" |
| 68 checked="true" label="Create taxa output" | 68 checked="true" label="Create taxa annotations output" |
| 69 help="Generate taxonomic analysis" /> | 69 help="Generate taxonomic analysis" /> |
| 70 </section> | 70 </section> |
| 71 | 71 |
| 72 <section name="plot_params" title="Plot Parameters" expanded="false"> | 72 <section name="plot_params" title="Plot Parameters" expanded="false"> |
| 73 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" | 73 <param name="simi_plot_y_min" type="float" value="95.0" min="0" max="100" |
| 102 help="Print messages about empty annotation files" /> | 102 help="Print messages about empty annotation files" /> |
| 103 </section> | 103 </section> |
| 104 </inputs> | 104 </inputs> |
| 105 | 105 |
| 106 <outputs> | 106 <outputs> |
| 107 <data name="output_similarity_txt" format="txt" label="Similarity data" > | 107 <data name="similarity_txt" format="txt" label="Similarity data" > |
| 108 <filter>output_options['similarity_output']</filter> | 108 <filter>output_options['similarity_output']</filter> |
| 109 </data> | 109 </data> |
| 110 | 110 |
| 111 <data name="output_similarity_plot" format="png" label="Similarity plot" > | 111 <data name="similarity_plot" format="png" label="Similarity plot" > |
| 112 <filter>output_options['similarity_output']</filter> | 112 <filter>output_options['similarity_output']</filter> |
| 113 </data> | 113 </data> |
| 114 | 114 |
| 115 <data name="output_evalue_txt" format="txt" label="E-value data" > | 115 <data name="evalue_txt" format="txt" label="E-value data" > |
| 116 <filter>output_options['evalue_output']</filter> | 116 <filter>output_options['evalue_output']</filter> |
| 117 </data> | 117 </data> |
| 118 | 118 |
| 119 <data name="output_evalue_plot" format="png" label="E-value plot" > | 119 <data name="evalue_plot" format="png" label="E-value plot" > |
| 120 <filter>output_options['evalue_output']</filter> | 120 <filter>output_options['evalue_output']</filter> |
| 121 </data> | 121 </data> |
| 122 | 122 |
| 123 <data name="output_count" format="txt" label="Count summary" > | 123 <data name="cluster_count" format="txt" label="Count summary" > |
| 124 <filter>output_options['count_output']</filter> | 124 <filter>output_options['count_output']</filter> |
| 125 </data> | 125 </data> |
| 126 | 126 |
| 127 <data name="output_taxa_clusters" format="xlsx" label="Raw taxa per cluster" > | 127 <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster" > |
| 128 <filter>output_options['taxa_output']</filter> | 128 <filter>output_options['taxa_output']</filter> |
| 129 </data> | 129 </data> |
| 130 | 130 |
| 131 <data name="output_taxa_processed" format="xlsx" label="Processed taxa" > | 131 <data name="processed_taxa" format="xlsx" label="Processed taxa" > |
| 132 <filter>output_options['taxa_output']</filter> | 132 <filter>output_options['taxa_output']</filter> |
| 133 </data> | 133 </data> |
| 134 </outputs> | 134 </outputs> |
| 135 | 135 |
| 136 <tests> | 136 <tests> |
| 141 <param name="similarity_output" value="true" /> | 141 <param name="similarity_output" value="true" /> |
| 142 <param name="evalue_output" value="true" /> | 142 <param name="evalue_output" value="true" /> |
| 143 <param name="count_output" value="true" /> | 143 <param name="count_output" value="true" /> |
| 144 <param name="taxa_output" value="true" /> | 144 <param name="taxa_output" value="true" /> |
| 145 </section> | 145 </section> |
| 146 <output name="output_similarity_txt" file="sim_out.txt" /> | 146 <output name="similarity_txt" file="sim_out.txt" /> |
| 147 <output name="output_similarity_plot" file="sim_out.png" compare="sim_size"/> | 147 <output name="similarity_plot" file="sim_out.png" compare="sim_size"/> |
| 148 <output name="output_evalue_txt" file="evalue_out.txt" /> | 148 <output name="evalue_txt" file="evalue_out.txt" /> |
| 149 <output name="output_evalue_plot" file="evalue_out.png" compare="sim_size"/> | 149 <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/> |
| 150 <output name="output_count" file="count_out.txt" /> | 150 <output name="cluster_count" file="count_out.txt" /> |
| 151 <output name="output_taxa_clusters" file="taxa_out.xlsx" decompress="true"/> | 151 <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/> |
| 152 <output name="output_taxa_processed" file="processed.xlsx" decompress="true"/> | 152 <output name="processed_taxa" file="processed.xlsx" decompress="true"/> |
| 153 </test> | 153 </test> |
| 154 <test expect_num_outputs="7"> | 154 <test expect_num_outputs="7"> |
| 155 <param name="input_cluster" value="input2_test.clstr.txt" /> | 155 <param name="input_cluster" value="input2_test.clstr.txt" /> |
| 156 <param name="input_annotation" value="header_anno_excel.xlsx" /> | 156 <param name="input_annotation" value="header_anno_excel.xlsx" /> |
| 157 <section name="output_options"> | 157 <section name="output_options"> |
| 158 <param name="similarity_output" value="true" /> | 158 <param name="similarity_output" value="true" /> |
| 159 <param name="evalue_output" value="true" /> | 159 <param name="evalue_output" value="true" /> |
| 160 <param name="count_output" value="true" /> | 160 <param name="count_output" value="true" /> |
| 161 <param name="taxa_output" value="true" /> | 161 <param name="taxa_output" value="true" /> |
| 162 </section> | 162 </section> |
| 163 <output name="output_similarity_txt" file="test2_sim_out.txt" /> | 163 <output name="similarity_txt" file="test2_sim_out.txt" /> |
| 164 <output name="output_similarity_plot" file="test2_sim_out.png" compare="sim_size"/> | 164 <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/> |
| 165 <output name="output_evalue_txt" file="test2_evalue_out.txt" /> | 165 <output name="evalue_txt" file="test2_evalue_out.txt" /> |
| 166 <output name="output_evalue_plot" file="test2_evalue_out.png" compare="sim_size"/> | 166 <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/> |
| 167 <output name="output_count" file="test_2count_out.txt" /> | 167 <output name="cluster_count" file="test_2count_out.txt" /> |
| 168 <output name="output_taxa_clusters" file="test_2taxa_out.xlsx" decompress="true"/> | 168 <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/> |
| 169 <output name="output_taxa_processed" file="test_2processed.xlsx" decompress="true"/> | 169 <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/> |
| 170 </test> | 170 </test> |
| 171 <test expect_num_outputs="5"> | 171 <test expect_num_outputs="5"> |
| 172 <param name="input_cluster" value="input2_test.clstr.txt" /> | 172 <param name="input_cluster" value="input2_test.clstr.txt" /> |
| 173 <param name="input_annotation" value="header_anno_excel.xlsx" /> | 173 <param name="input_annotation" value="header_anno_excel.xlsx" /> |
| 174 <section name="output_options"> | 174 <section name="output_options"> |
| 176 <param name="count_output" value="true" /> | 176 <param name="count_output" value="true" /> |
| 177 <param name="taxa_output" value="true" /> | 177 <param name="taxa_output" value="true" /> |
| 178 <param name="evalue_output" value="false" /> | 178 <param name="evalue_output" value="false" /> |
| 179 </section> | 179 </section> |
| 180 <section name="processing_options"> | 180 <section name="processing_options"> |
| 181 <param name="show_unnanotated_clusters" value="true"/> | 181 <param name="show_unannotated_clusters" value="true"/> |
| 182 <param name="make_taxa_in_cluster_split" value="true"/> | 182 <param name="make_taxa_in_cluster_split" value="true"/> |
| 183 <param name="print_empty_files" value="true"/> | 183 <param name="print_empty_files" value="true"/> |
| 184 </section> | 184 </section> |
| 185 <section name="taxa_params"> | 185 <section name="taxa_params"> |
| 186 <param name="uncertain_taxa_use_ratio" value="0.6"/> | 186 <param name="uncertain_taxa_use_ratio" value="0.6"/> |
| 187 <param name="min_to_split" value="0.6"/> | 187 <param name="min_to_split" value="0.6"/> |
| 188 <param name="min_count_to_split" value="6"/> | 188 <param name="min_count_to_split" value="6"/> |
| 189 </section> | 189 </section> |
| 190 <section name="plot_params" title="Plot Parameters" expanded="false"> | 190 <section name="plot_params"> |
| 191 <param name="simi_plot_y_min" value="0.4" /> | 191 <param name="simi_plot_y_min" value="0.4" /> |
| 192 <param name="simi_plot_y_max" value="0.4" /> | 192 <param name="simi_plot_y_max" value="0.4" /> |
| 193 </section> | 193 </section> |
| 194 <output name="output_similarity_txt" file="test2_sim_extra_out.txt" /> | 194 <output name="similarity_txt" file="test2_sim_extra_out.txt" /> |
| 195 <output name="output_similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/> | 195 <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/> |
| 196 <output name="output_count" file="test_2count_extra_out.txt" /> | 196 <output name="cluster_count" file="test_2count_extra_out.txt" /> |
| 197 <output name="output_taxa_clusters" file="test_2taxa_extra_out.xlsx" decompress="true"/> | 197 <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/> |
| 198 <output name="output_taxa_processed" file="test_2processed_extra.xlsx" decompress="true"/> | 198 <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/> |
| 199 </test> | 199 </test> |
| 200 </tests> | 200 </tests> |
| 201 | 201 |
| 202 <help><![CDATA[ | 202 <help><![CDATA[ |
| 203 **CD-HIT Cluster Analysis** | 203 **CD-HIT Cluster Analysis** |
| 210 | 210 |
| 211 2. **Annotation file (.xlsx)**: Excel file containing sequence annotations with columns: | 211 2. **Annotation file (.xlsx)**: Excel file containing sequence annotations with columns: |
| 212 | 212 |
| 213 **Output Options:** | 213 **Output Options:** |
| 214 | 214 |
| 215 - **Similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions | 215 - **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions |
| 216 - **E-value output**: Creates E-value analysis with plots and text files showing E-value distributions | 216 - **Cluster E-value output**: Creates E-value analysis with plots and text files showing E-value distributions |
| 217 - **Count output**: Creates summary tables with annotated/unannotated read counts per cluster | 217 - **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster |
| 218 - **Taxa output**: Creates taxonomic analysis determining the most likely taxa for each cluster | 218 - **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster |
| 219 | 219 |
| 220 **Parameters:** | 220 **Parameters:** |
| 221 | 221 |
| 222 - **Plot Parameters**: Control the size of similarity plots (X and Y-axis limits) | 222 - **Plot Parameters**: Control the size of similarity plots (X and Y-axis limits) |
| 223 - **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split | 223 - **Taxonomic Analysis Parameters**: Control how uncertain taxa are handled and when clusters are split |
| 233 - **Raw taxa per cluster**: Excel file showing all taxa found in each cluster | 233 - **Raw taxa per cluster**: Excel file showing all taxa found in each cluster |
| 234 - **Processed taxa**: Excel file with clusters where a taxon was assigned | 234 - **Processed taxa**: Excel file with clusters where a taxon was assigned |
| 235 | 235 |
| 236 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". | 236 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)". |
| 237 | 237 |
| 238 ------------- | |
| 239 | |
| 240 .. class:: infomark | |
| 241 | |
| 238 **Credits** | 242 **Credits** |
| 239 Authors = Onno de Gorter, 2025. | 243 |
| 240 Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter. | 244 Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter. |
| 241 Developed for the New Lights on Old Remedies project, a PhD research project by Anja Fischer | 245 Developed for the New Lights on Old Remedies project, a PhD research project by Anja Fischer. |
| 246 | |
| 247 Link to the project website: | |
| 248 | |
| 249 * https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html | |
| 250 | |
| 242 ]]></help> | 251 ]]></help> |
| 252 <creator> | |
| 253 <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" /> | |
| 254 <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/> | |
| 255 <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" /> | |
| 256 </creator> | |
| 243 </tool> | 257 </tool> |
