Mercurial > repos > ecology > ecoregion_cluster_estimate
diff Nb_cluster.xml @ 0:0f6542d0986e draft
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
author | ecology |
---|---|
date | Wed, 18 Oct 2023 09:59:06 +0000 |
parents | |
children | e94a25eed489 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Nb_cluster.xml Wed Oct 18 09:59:06 2023 +0000 @@ -0,0 +1,103 @@ +<tool id="ecoregion_cluster_estimate" name="ClusterEstimate" version="0.1.0+galaxy0" profile="22.05"> + <description>Find an optimal number of cluster with SIH index</description> + <requirements> + <requirement type="package" version="4.2.3">r-base</requirement> + <requirement type="package" version="2.1.4">r-cluster</requirement> + <requirement type="package" version="1.1.1">r-dplyr</requirement> + <requirement type="package" version="2.0.0">r-tidyverse</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + Rscript + '$__tool_directory__/nb_clust_G.R' + '$envfile' + '$taxafile' + '$predictionfile' + '$max_k' + '$metric' + '$sample' + '$output1' + '$output2' + '$output3' + ]]> + </command> + <inputs> + <param name="envfile" type="data" format="txt,csv,tabular" label="Environment file"/> + <param name="taxafile" type="data" format="txt" label="Taxa selected file (List of taxa from TaxaSeeker tool)"/> + <param name="predictionfile" type="data" format="txt" multiple="true" label="Prediction files"/> + <param name="max_k" type="integer" value="2" min="1" label="Number of Cluster to test"/> + <param name="metric" type="select" label="What metric to use to calculate dissimilarities between observations ?"> + <option value="manhattan">manhattan</option> + <option value="euclidean">euclidean</option> + <option value="jaccard">jaccard</option> + </param> + <param name="sample" type="integer" label="The number of samples to be drawn from the dataset" min="5" value="10"/> + </inputs> + <outputs> + <data name="output1" from_work_dir="Indices_SIH.png" format="png" label="SIH index plot"/> + <data name="output2" from_work_dir="data_to_clus.tsv" format="tsv" label="Data to cluster"/> + <data name="output3" from_work_dir="data_bio.tsv" format="tsv" label="Data.bio table "/> + </outputs> + <tests> + <test> + <param name="envfile" value="ceamarc_env.csv"/> + <param name="taxafile" value="List_of_taxa.txt"/> + <param name="predictionfile" value="1_brts_pred_ceamarc.txt"/> + <param name='max_k' value="2"/> + <param name='metric' value="manhattan"/> + <param name='sample' value="10"/> + <output name='output1' value="SIH_index_plot.png"/> + <output name='output2' value="Data_to_cluster.tsv"/> + <output name='output3' value="Data.bio_table.tsv"/> + </test> + </tests> + <help><![CDATA[ +================== +**What it does ?** +================== + +The tool enables the determination of the optimal number of clusters for partition-based clustering, along with generating files used in the subsequent ecoregionalization workflow. + +=================== +**How to use it ?** +=================== + +The tool takes three inputs files: a file containing the environmental parameter values for each environment layer pixel (latitude-longitude), a file containing the list of selected taxa from previous step of the workflow and the file containing the BRT predictions. See example below. + +Then there are few parameters : + +- the maximum number of clusters to be tested with a minimum of two clusters + +- the metric used to calculate the dissimilarities between the observations: Manhattan, Euclidean and Jaccard + +- the sample size that will be used to perform clustering. Indeed, the clara function is used to clustering large data using a representative sample rather than the entire data set. This will speed up the clustering process and make the calculation more efficient. A fairly high value representative of the data is recommended. It is important to note that using too small a sample may result in loss of information compared to using the entire data set. + +The tool will produce three outputs. The first two are files that will be used in the rest of the workflow: a file containing four pieces of information, latitude, longitude, presence prediction and corresponding taxon, and a file containing the data to be partitioned. The third output corresponds to the main information of the tool, a graph presenting the value of the HIS index according to the number of clusters. The silhouette index provides a measure of the separation between clusters and the compactness within each cluster. The silhouette index ranges from -1 to 1. Values close to 1 indicate that objects are well grouped and separated from other clusters, while values close to -1 indicate that objects are poorly grouped and may be closer to other clusters. A value close to 0 indicates a situation where objects are located at the border between two neighboring clusters. + +**Example of the environemental file :** + ++------+------+---------+------+--------------+-----+ +| long | lat | Carbo | Grav | Maxbearing | ... | ++------+------+---------+------+--------------+-----+ +|139.22|-65.57| 0.88 |28.59 | 3.67 | ... | ++------+------+---------+------+--------------+-----+ +|139.22|-65.57| 0.88 |28.61 | 3.64 | ... | ++------+------+---------+------+--------------+-----+ +| ... | ... | ... | ... | ... | ... | ++------+------+---------+------+--------------+-----+ + +**Example of the Brt prediction file :** + ++-----------+----------+-----------------------+-------------+ +| lat | long | Prediction.index | spe | ++-----------+----------+-----------------------+-------------+ +| -65.57 | 139.22 | 0.122438487221909 | Acarnidae | ++-----------+----------+-----------------------+-------------+ +| -65.57 | 139.32 | 0.119154535627801 | Acarnidae | ++-----------+----------+-----------------------+-------------+ +| ... | ... | ... | ... | ++-----------+----------+-----------------------+-------------+ + + ]]> + </help> +</tool> +