Mercurial > repos > ecology > ecoregion_taxa_seeker
annotate nb_clust_G.R @ 1:9dc992f80c25 draft
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 459ba1277acd7d8d4a02f90dbd7ff444bf8eac92
author | ecology |
---|---|
date | Wed, 24 Jan 2024 15:52:43 +0000 |
parents | e3cd588fd14a |
children |
rev | line source |
---|---|
0
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
1 # Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
2 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
3 #load packages |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
4 library(cluster) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
5 library(dplyr) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
6 library(tidyverse) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
7 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
8 #load arguments |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
9 args = commandArgs(trailingOnly=TRUE) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
10 if (length(args)==0) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
11 { |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
12 stop("This tool needs at least one argument") |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
13 }else{ |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
14 enviro <- args[1] |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
15 taxa_list <- args[2] |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
16 preds <- args[3] |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
17 max_k <- as.numeric(args[4]) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
18 metric <- args[5] |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
19 sample <- as.numeric(args[6]) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
20 } |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
21 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
22 #load data |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
23 |
1
9dc992f80c25
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 459ba1277acd7d8d4a02f90dbd7ff444bf8eac92
ecology
parents:
0
diff
changeset
|
24 env.data <- read.table(enviro, sep="\t", header = TRUE, dec = ".", na.strings = "-9999") |
0
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
25 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
26 ##List of modelled taxa used for clustering |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
27 tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA") |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
28 names(tv) <- c("a") |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
29 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
30 ################Grouping of taxa if multiple prediction files entered ################ |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
31 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
32 data_split = str_split(preds,",") |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
33 data.bio = NULL |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
34 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
35 for (i in 1:length(data_split[[1]])) { |
1
9dc992f80c25
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 459ba1277acd7d8d4a02f90dbd7ff444bf8eac92
ecology
parents:
0
diff
changeset
|
36 data.bio1 <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings = "NA") |
0
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
37 data.bio <- rbind(data.bio,data.bio1) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
38 remove(data.bio1) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
39 } |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
40 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
41 names(data.bio) <- c("lat", "long", "pred", "taxon") |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
42 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
43 #keep selected taxa |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
44 data.bio <- data.bio[which(data.bio$taxon %in% tv$a),] |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
45 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
46 write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
47 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
48 #format data |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
49 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
50 test3 <- matrix(data.bio$pred , nrow = nrow(env.data), ncol = nrow(data.bio)/nrow(env.data)) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
51 test3 <- data.frame(test3) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
52 names(test3) <- unique(data.bio$taxon) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
53 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
54 write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
55 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
56 #Max number of clusters to test |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
57 max_k <- max_k |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
58 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
59 # Initialization of vectors to store SIH indices |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
60 sih_values <- rep(0, max_k) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
61 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
62 # Calculation of the SIH index for each number of clusters |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
63 for (k in 2:max_k) { |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
64 # Clara execution |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
65 clara_res <- clara(test3, k, metric =metric, samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k)) |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
66 # Calculation of the SIH index |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
67 sih_values[k] <- clara_res$silinfo$avg.width |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
68 } |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
69 |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
70 # Plot SIH Index Chart by Number of Clusters |
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
71 png("Indices_SIH.png") |
1
9dc992f80c25
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 459ba1277acd7d8d4a02f90dbd7ff444bf8eac92
ecology
parents:
0
diff
changeset
|
72 plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Number of clusters", ylab = "SIH index") |
0
e3cd588fd14a
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit 2a2ae892fa2dbc1eff9c6a59c3ad8f3c27c1c78d
ecology
parents:
diff
changeset
|
73 dev.off() |