comparison nb_clust_G.R @ 0:5cde56683579 draft default tip

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 5d48df67919fbc9d77b98a8243d438c397f61a0e
author ecology
date Thu, 21 Mar 2024 14:05:01 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5cde56683579
1 # Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering
2
3 #load packages
4 library(cluster)
5 library(dplyr)
6 library(tidyverse)
7
8 #load arguments
9 args = commandArgs(trailingOnly=TRUE)
10 if (length(args)==0)
11 {
12 stop("This tool needs at least one argument")
13 }else{
14 enviro <- args[1]
15 taxa_list <- args[2]
16 preds <- args[3]
17 max_k <- as.numeric(args[4])
18 metric <- args[5]
19 sample <- as.numeric(args[6])
20 }
21
22 #load data
23
24 env.data <- read.table(enviro, sep="\t", header = TRUE, dec = ".", na.strings = "-9999")
25
26 ##List of modelled taxa used for clustering
27 tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA")
28 names(tv) <- c("a")
29
30 ################Grouping of taxa if multiple prediction files entered ################
31
32 data_split = str_split(preds,",")
33 data.bio = NULL
34
35 for (i in 1:length(data_split[[1]])) {
36 data.bio1 <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings = "NA")
37 data.bio <- rbind(data.bio,data.bio1)
38 remove(data.bio1)
39 }
40
41 names(data.bio) <- c("lat", "long", "pred", "taxon")
42
43 #keep selected taxa
44 data.bio <- data.bio[which(data.bio$taxon %in% tv$a),]
45
46 write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F)
47
48 #format data
49
50 test3 <- matrix(data.bio$pred , nrow = nrow(env.data), ncol = nrow(data.bio)/nrow(env.data))
51 test3 <- data.frame(test3)
52 names(test3) <- unique(data.bio$taxon)
53
54 write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F)
55
56 #Max number of clusters to test
57 max_k <- max_k
58
59 # Initialization of vectors to store SIH indices
60 sih_values <- rep(0, max_k)
61
62 # Calculation of the SIH index for each number of clusters
63 for (k in 2:max_k) {
64 # Clara execution
65 clara_res <- clara(test3, k, metric =metric, samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k))
66 # Calculation of the SIH index
67 sih_values[k] <- clara_res$silinfo$avg.width
68 }
69
70 # Plot SIH Index Chart by Number of Clusters
71 png("Indices_SIH.png")
72 plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Number of clusters", ylab = "SIH index")
73 dev.off()