Repository 'ecoregion_taxa_seeker'
hg clone https://toolshed.g2.bx.psu.edu/repos/ecology/ecoregion_taxa_seeker

Changeset 5:68d98dbd8bfc (2025-05-14)
Previous changeset 4:2152f424e7a2 (2024-10-16) Next changeset 6:14bad9452425 (2025-05-28)
Commit message:
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
added:
claraguess.R
test-data/enviro.tabular
test-data/preds.tabular
test-data/taxas.tabular
b
diff -r 2152f424e7a2 -r 68d98dbd8bfc claraguess.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/claraguess.R Wed May 14 13:49:16 2025 +0000
[
@@ -0,0 +1,75 @@
+##30/04/2025
+##Jean Le Cras
+### Clustering with Clara algorithm with an option to determine the optimal number of clusters based on SIH index
+
+#load libraries
+library(cluster)
+library(dplyr)
+library(tidyverse)
+
+#load arguments
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args)==0) {
+  stop("This tool needs at least one argument")
+}
+
+#load data
+enviro_path <- args[1]
+preds_path <- args[2]
+taxas_path <- args[3]
+type <- args[4]
+k <- as.integer(args[5])
+metric <- args[6]
+samples <- as.integer(args[7])
+env.data <- read.table(enviro_path, sep = "\t", header = TRUE, dec = ".", na.strings = "-9999")
+
+data_split = str_split(preds_path, ",")
+preds.data = NULL
+
+for (i in 1:length(data_split[[1]])) {
+  df <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings="NA")
+  preds.data <- rbind(preds.data, df)
+  remove(df)
+}
+
+names(preds.data) <- c("lat", "long", "pred", "taxa")
+
+development_traits <- str_split(readLines(taxas_path), "\t")
+
+#select the clara model with the optimal number of clusters
+model <- NULL
+
+if (type == "auto") {
+  sih_scores <- c()
+  models <- list()
+  
+  for (i in 2:k) {
+    models[[i]] <- clara(preds.data$pred, i, metric = metric, samples = samples, stand = TRUE)
+    sih_scores[i] <- models[[i]]$silinfo$avg.width
+  }
+  png("sih_scores.png")
+  plot(2:k, sih_scores[2:k], type = "b", xlab = "Number of clusters", ylab = "SIH index")
+  dev.off()
+  
+  best_k <- which.max(sih_scores[3:k]) + 2
+  model <- models[[best_k]]
+  k <- best_k
+} else {
+  model <- clara(preds.data$pred, k, metric = metric, samples = samples, stand = TRUE)
+}
+
+#saving results
+png("silhouette_plot.png")
+plot(silhouette(model), main = paste("Silhouette plot for k =", k))
+dev.off()
+
+data.test <- matrix(preds.data$pred, nrow = nrow(env.data), ncol = nrow(preds.data) / nrow(env.data))
+data.test <- data.frame(data.test)
+names(data.test) <- unique(preds.data$development)
+
+full.data <- cbind(preds.data[1:nrow(data.test), 1:2], model$clustering)
+names(full.data) <- c("lat", "long", "cluster")
+full.data <- cbind(full.data, data.test, env.data[, 3:ncol(env.data)])
+
+write.table(full.data[1:3], file = "data_cluster.tabular", quote = FALSE, sep = "\t", row.names = FALSE)
+write.table(full.data, file = "clustered_taxas_env.tabular", quote = FALSE, sep = "\t", row.names = FALSE)
\ No newline at end of file
b
diff -r 2152f424e7a2 -r 68d98dbd8bfc test-data/enviro.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/enviro.tabular Wed May 14 13:49:16 2025 +0000
b
@@ -0,0 +1,11 @@
+"long" "lat" "Carbo" "Grav"
+"152250" 145.35 -66.91 1.06 0.92
+"12010" 139.91 -65.75 1.6 25.77
+"88982" 139.8 -66.34 1.16 2.91
+"109687" 144.67 -66.48 1.23 13.74
+"124999" 142.27 -66.59 3.31 3.13
+"119451" 141.04 -66.55 2.99 3.31
+"128864" 140.8 -66.62 2.76 2.95
+"125833" 142.26 -66.6 3.3 3.12
+"113403" 144.09 -66.5 1.4 8.95
+"56804" 140.83 -66.11 2.33 9.83
b
diff -r 2152f424e7a2 -r 68d98dbd8bfc test-data/preds.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/preds.tabular Wed May 14 13:49:16 2025 +0000
b
@@ -0,0 +1,11 @@
+"lat" "long" "Prediction.index" "spe"
+"148207" -66.83 144.41 0.127117028637464 "Development_direct.development"
+"175359" -65.83 141.48 0.000126665227113262 "Development_lecithotrophic"
+"226550" -66.2 144.88 4.3718554237373e-05 "Development_lecithotrophic"
+"132855" -66.66 142.98 0.512219435270445 "Development_direct.development"
+"380560" -66.19 145.47 0.455627207042825 "Development_planktotrophic"
+"361759" -66.06 140.61 0.529983873632234 "Development_planktotrophic"
+"345639" -65.95 140.92 0.471266742957099 "Development_planktotrophic"
+"27420" -65.9 144.56 0.0224605950764499 "Development_direct.development"
+"207288" -66.07 143.27 4.36554587548896e-05 "Development_lecithotrophic"
+"349308" -65.98 140.02 0.402308937370468 "Development_planktotrophic"
b
diff -r 2152f424e7a2 -r 68d98dbd8bfc test-data/taxas.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxas.tabular Wed May 14 13:49:16 2025 +0000
b
@@ -0,0 +1,3 @@
+Development_direct.development
+Development_lecithotrophic
+Development_planktotrophic