annotate claraguess.R @ 2:8ac53593cef6 draft default tip

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 9dee19b2d28b61a81f2a89d4c7d35678e31a9927
author ecology
date Wed, 23 Jul 2025 14:37:13 +0000
parents adeb719a267f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
1 ##30/04/2025
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
2 ##Jean Le Cras
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
3 ### Clustering with Clara algorithm with an option to determine the optimal number of clusters based on SIH index
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
4
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
5 #load libraries
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
6 library(cluster)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
7 library(dplyr)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
8 library(tidyverse)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
9
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
10 #load arguments
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
11 args <- commandArgs(trailingOnly = TRUE)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
12 if (length(args)==0) {
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
13 stop("This tool needs at least one argument")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
14 }
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
15
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
16 #load data
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
17 enviro_path <- args[1]
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
18 preds_path <- args[2]
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
19 taxas_path <- args[3]
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
20 type <- args[4]
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
21 k <- as.integer(args[5])
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
22 metric <- args[6]
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
23 samples <- as.integer(args[7])
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
24 env.data <- read.table(enviro_path, sep = "\t", header = TRUE, dec = ".", na.strings = "-9999")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
25
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
26 data_split = str_split(preds_path, ",")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
27 preds.data = NULL
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
28
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
29 for (i in 1:length(data_split[[1]])) {
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
30 df <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings="NA")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
31 preds.data <- rbind(preds.data, df)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
32 remove(df)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
33 }
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
34
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
35 names(preds.data) <- c("lat", "long", "pred", "taxa")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
36
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
37 development_traits <- str_split(readLines(taxas_path), "\t")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
38
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
39 #select the clara model with the optimal number of clusters
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
40 model <- NULL
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
41
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
42 if (type == "auto") {
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
43 sih_scores <- c()
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
44 models <- list()
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
45
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
46 for (i in 2:k) {
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
47 models[[i]] <- clara(preds.data$pred, i, metric = metric, samples = samples, stand = TRUE)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
48 sih_scores[i] <- models[[i]]$silinfo$avg.width
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
49 }
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
50 png("sih_scores.png")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
51 plot(2:k, sih_scores[2:k], type = "b", xlab = "Number of clusters", ylab = "SIH index")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
52 dev.off()
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
53
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
54 best_k <- which.max(sih_scores[3:k]) + 2
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
55 model <- models[[best_k]]
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
56 k <- best_k
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
57 } else {
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
58 model <- clara(preds.data$pred, k, metric = metric, samples = samples, stand = TRUE)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
59 }
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
60
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
61 #saving results
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
62 png("silhouette_plot.png")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
63 plot(silhouette(model), main = paste("Silhouette plot for k =", k))
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
64 dev.off()
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
65
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
66 data.test <- matrix(preds.data$pred, nrow = nrow(env.data), ncol = nrow(preds.data) / nrow(env.data))
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
67 data.test <- data.frame(data.test)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
68 names(data.test) <- unique(preds.data$development)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
69
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
70 full.data <- cbind(preds.data[1:nrow(data.test), 1:2], model$clustering)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
71 names(full.data) <- c("lat", "long", "cluster")
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
72 full.data <- cbind(full.data, data.test, env.data[, 3:ncol(env.data)])
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
73
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
74 write.table(full.data[1:3], file = "data_cluster.tabular", quote = FALSE, sep = "\t", row.names = FALSE)
adeb719a267f planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 1f5e22a210b8a395f1c7b48f54e03e781a1b34c4
ecology
parents:
diff changeset
75 write.table(full.data, file = "clustered_taxas_env.tabular", quote = FALSE, sep = "\t", row.names = FALSE)