diff nb_clust_G.R @ 0:e3cd588fd14a draft

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
author ecology
date Wed, 18 Oct 2023 09:58:17 +0000
parents
children 9dc992f80c25
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nb_clust_G.R	Wed Oct 18 09:58:17 2023 +0000
@@ -0,0 +1,73 @@
+# Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering
+
+#load packages
+library(cluster)
+library(dplyr)
+library(tidyverse)
+
+#load arguments
+args = commandArgs(trailingOnly=TRUE) 
+if (length(args)==0)
+{
+    stop("This tool needs at least one argument")
+}else{
+    enviro <- args[1]
+    taxa_list <- args[2]
+    preds <- args[3]
+    max_k <- as.numeric(args[4])
+    metric <- args[5]
+    sample <- as.numeric(args[6])
+}
+
+#load data 
+
+env.data <- read.table(enviro, header = TRUE, dec = ".", na.strings = "-9999.00") 
+
+##List of modelled taxa used for clustering
+tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA") 
+names(tv) <- c("a")
+
+################Grouping of taxa if multiple prediction files entered ################
+
+data_split = str_split(preds,",")
+data.bio = NULL
+
+for (i in 1:length(data_split[[1]])) {
+data.bio1 <- read.table(data_split[[1]][i], dec=".", sep=" ", header=T, na.strings = "NA")
+data.bio <- rbind(data.bio,data.bio1)
+remove(data.bio1)
+}
+
+names(data.bio) <- c("lat", "long", "pred", "taxon")
+
+#keep selected taxa
+data.bio <- data.bio[which(data.bio$taxon %in% tv$a),]
+
+write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F)
+
+#format data
+
+test3 <- matrix(data.bio$pred , nrow = nrow(env.data),  ncol = nrow(data.bio)/nrow(env.data))
+test3 <- data.frame(test3)
+names(test3) <- unique(data.bio$taxon)
+
+write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F)
+
+#Max number of clusters to test
+max_k <- max_k
+
+# Initialization of vectors to store SIH indices
+sih_values <- rep(0, max_k)
+
+# Calculation of the SIH index for each number of clusters
+for (k in 2:max_k) {
+  # Clara execution
+  clara_res <- clara(test3, k,  metric =metric,  samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k))
+  # Calculation of the SIH index
+  sih_values[k] <- clara_res$silinfo$avg.width
+}
+
+# Plot SIH Index Chart by Number of Clusters
+png("Indices_SIH.png")
+plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Nombre de clusters", ylab = "Indice SIH")
+dev.off()