comparison Nb_cluster.xml @ 1:e94a25eed489 draft

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 459ba1277acd7d8d4a02f90dbd7ff444bf8eac92
author ecology
date Wed, 24 Jan 2024 15:53:32 +0000
parents 0f6542d0986e
children 001d7d101915
comparison
equal deleted inserted replaced
0:0f6542d0986e 1:e94a25eed489
19 '$output2' 19 '$output2'
20 '$output3' 20 '$output3'
21 ]]> 21 ]]>
22 </command> 22 </command>
23 <inputs> 23 <inputs>
24 <param name="envfile" type="data" format="txt,csv,tabular" label="Environment file"/> 24 <param name="envfile" type="data" format="tabular" label="Environment file (tabular format only)" help="See example below"/>
25 <param name="taxafile" type="data" format="txt" label="Taxa selected file (List of taxa from TaxaSeeker tool)"/> 25 <param name="taxafile" type="data" format="txt" label="Taxa selected file (File 'List of taxa' from TaxaSeeker tool)"/>
26 <param name="predictionfile" type="data" format="txt" multiple="true" label="Prediction files"/> 26 <param name="predictionfile" type="data" format="tabular" multiple="true" label="Prediction files"/>
27 <param name="max_k" type="integer" value="2" min="1" label="Number of Cluster to test"/> 27 <param name="max_k" type="integer" value="2" min="1" label="Number of Cluster to test"/>
28 <param name="metric" type="select" label="What metric to use to calculate dissimilarities between observations ?"> 28 <param name="metric" type="select" label="What metric to use to calculate dissimilarities between observations ?">
29 <option value="manhattan">manhattan</option> 29 <option value="manhattan">manhattan</option>
30 <option value="euclidean">euclidean</option> 30 <option value="euclidean">euclidean</option>
31 <option value="jaccard">jaccard</option> 31 <option value="jaccard">jaccard</option>
32 </param> 32 </param>
33 <param name="sample" type="integer" label="The number of samples to be drawn from the dataset" min="5" value="10"/> 33 <param name="sample" type="integer" label="The number of samples to be drawn from the dataset" min="5" value="10"/>
34 </inputs> 34 </inputs>
35 <outputs> 35 <outputs>
36 <data name="output1" from_work_dir="Indices_SIH.png" format="png" label="SIH index plot"/> 36 <data name="output1" from_work_dir="Indices_SIH.png" format="png" label="SIH index plot"/>
37 <data name="output2" from_work_dir="data_to_clus.tsv" format="tsv" label="Data to cluster"/> 37 <data name="output2" from_work_dir="data_to_clus.tsv" format="tabular" label="Data to cluster"/>
38 <data name="output3" from_work_dir="data_bio.tsv" format="tsv" label="Data.bio table "/> 38 <data name="output3" from_work_dir="data_bio.tsv" format="tabular" label="Data.bio table "/>
39 </outputs> 39 </outputs>
40 <tests> 40 <tests>
41 <test> 41 <test>
42 <param name="envfile" value="ceamarc_env.csv"/> 42 <param name="envfile" value="ceamarc_env.tsv"/>
43 <param name="taxafile" value="List_of_taxa.txt"/> 43 <param name="taxafile" value="List_of_taxa.txt"/>
44 <param name="predictionfile" value="1_brts_pred_ceamarc.txt"/> 44 <param name="predictionfile" value="1_brts_pred_ceamarc.tsv"/>
45 <param name='max_k' value="2"/> 45 <param name='max_k' value="2"/>
46 <param name='metric' value="manhattan"/> 46 <param name='metric' value="manhattan"/>
47 <param name='sample' value="10"/> 47 <param name='sample' value="10"/>
48 <output name='output1' value="SIH_index_plot.png"/> 48 <output name='output1'>
49 <assert_contents>
50 <has_size value="4297" delta="500"/>
51 </assert_contents>
52 </output>
49 <output name='output2' value="Data_to_cluster.tsv"/> 53 <output name='output2' value="Data_to_cluster.tsv"/>
50 <output name='output3' value="Data.bio_table.tsv"/> 54 <output name='output3' value="Data.bio_table.tsv"/>
51 </test> 55 </test>
52 </tests> 56 </tests>
53 <help><![CDATA[ 57 <help><![CDATA[
69 73
70 - the metric used to calculate the dissimilarities between the observations: Manhattan, Euclidean and Jaccard 74 - the metric used to calculate the dissimilarities between the observations: Manhattan, Euclidean and Jaccard
71 75
72 - the sample size that will be used to perform clustering. Indeed, the clara function is used to cluster large data using a representative sample rather than the entire data set. This will speed up the clustering process and make the calculation more efficient. A fairly high value representative of the data is recommended. It is important to note that using too small a sample may result in loss of information compared to using the entire data set. 76 - the sample size that will be used to perform clustering. Indeed, the clara function is used to cluster large data using a representative sample rather than the entire data set. This will speed up the clustering process and make the calculation more efficient. A fairly high value representative of the data is recommended. It is important to note that using too small a sample may result in loss of information compared to using the entire data set.
73 77
74 The tool will produce three outputs. The first two are files that will be used in the rest of the workflow: a file containing four pieces of information, latitude, longitude, presence prediction and corresponding taxon, and a file containing the data to be partitioned. The third output corresponds to the main information of the tool, a graph presenting the value of the HIS index according to the number of clusters. The silhouette index provides a measure of the separation between clusters and the compactness within each cluster. The silhouette index ranges from -1 to 1. Values close to 1 indicate that objects are well grouped and separated from other clusters, while values close to -1 indicate that objects are poorly grouped and may be closer to other clusters. A value close to 0 indicates a situation where objects are located at the border between two neighboring clusters. 78 The tool will produce three outputs. The first two are files that will be used in the rest of the workflow: a file containing four pieces of information, latitude, longitude, presence prediction and corresponding taxon, and a file containing the data to be partitioned. The third output corresponds to the main information of the tool, a graph presenting the value of the SIH index according to the number of clusters. The silhouette index provides a measure of the separation between clusters and the compactness within each cluster. The silhouette index ranges from -1 to 1. Values close to 1 indicate that objects are well grouped and separated from other clusters, while values close to -1 indicate that objects are poorly grouped and may be closer to other clusters. A value close to 0 indicates a situation where objects are located at the border between two neighboring clusters.
75 79
76 **Example of the environmental file :** 80 **Example of the environmental file :**
77 81
78 +------+------+---------+------+--------------+-----+ 82 +------+------+---------+------+--------------+-----+
79 | long | lat | Carbo | Grav | Maxbearing | ... | 83 | long | lat | Carbo | Grav | Maxbearing | ... |