annotate nb_clust_G.R @ 1:36637718c51d draft

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
author ecology
date Wed, 11 Sep 2024 09:19:58 +0000
parents 5cde56683579
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
1 # Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
2
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
3 #load packages
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
4 library(cluster)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
5 library(dplyr)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
6 library(tidyverse)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
7
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
8 #load arguments
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
9 args = commandArgs(trailingOnly=TRUE)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
10 if (length(args)==0)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
11 {
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
12 stop("This tool needs at least one argument")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
13 }else{
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
14 enviro <- args[1]
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
15 taxa_list <- args[2]
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
16 preds <- args[3]
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
17 max_k <- as.numeric(args[4])
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
18 metric <- args[5]
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
19 sample <- as.numeric(args[6])
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
20 }
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
21
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
22 #load data
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
23
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
24 env.data <- read.table(enviro, sep="\t", header = TRUE, dec = ".", na.strings = "-9999")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
25
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
26 ##List of modelled taxa used for clustering
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
27 tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
28 names(tv) <- c("a")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
29
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
30 ################Grouping of taxa if multiple prediction files entered ################
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
31
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
32 data_split = str_split(preds,",")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
33 data.bio = NULL
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
34
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
35 for (i in 1:length(data_split[[1]])) {
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
36 data.bio1 <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings = "NA")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
37 data.bio <- rbind(data.bio,data.bio1)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
38 remove(data.bio1)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
39 }
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
40
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
41 names(data.bio) <- c("lat", "long", "pred", "taxon")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
42
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
43 #keep selected taxa
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
44 data.bio <- data.bio[which(data.bio$taxon %in% tv$a),]
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
45
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
46 write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
47
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
48 #format data
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
49
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
50 test3 <- matrix(data.bio$pred , nrow = nrow(env.data), ncol = nrow(data.bio)/nrow(env.data))
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
51 test3 <- data.frame(test3)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
52 names(test3) <- unique(data.bio$taxon)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
53
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
54 write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
55
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
56 #Max number of clusters to test
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
57 max_k <- max_k
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
58
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
59 # Initialization of vectors to store SIH indices
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
60 sih_values <- rep(0, max_k)
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
61
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
62 # Calculation of the SIH index for each number of clusters
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
63 for (k in 2:max_k) {
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
64 # Clara execution
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
65 clara_res <- clara(test3, k, metric =metric, samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k))
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
66 # Calculation of the SIH index
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
67 sih_values[k] <- clara_res$silinfo$avg.width
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
68 }
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
69
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
70 # Plot SIH Index Chart by Number of Clusters
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
71 png("Indices_SIH.png")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
72 plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Number of clusters", ylab = "SIH index")
5cde56683579 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
ecology
parents:
diff changeset
73 dev.off()