comparison wormsmeasurements.R @ 1:6f75ab89587a draft default tip

planemo upload for repository https://github.com/jeanlecras/tools-ecology/tree/master/tools/WormsMeasurements commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
author ecology
date Wed, 28 May 2025 10:13:42 +0000
parents 23b963a1284e
children
comparison
equal deleted inserted replaced
0:23b963a1284e 1:6f75ab89587a
11 args <- commandArgs(trailingOnly = TRUE) 11 args <- commandArgs(trailingOnly = TRUE)
12 if (length(args) == 0) { 12 if (length(args) == 0) {
13 stop("This tool needs at least one argument") 13 stop("This tool needs at least one argument")
14 } 14 }
15 15
16 occurrence <- read.csv(args[1], header=T, sep="\t") %>% arrange(scientificName) 16 scientificName_name <- args[3]
17 occurrence <- read.csv(args[1], header=T, sep="\t") %>%
18 arrange(.[[scientificName_name]])
17 measurement_types <- unlist(str_split(args[2], ",")) 19 measurement_types <- unlist(str_split(args[2], ","))
18 include_inherited <- ifelse(args[4]=="true", T, F) 20 include_inherited <- ifelse(args[4]=="true", T, F)
19 pivot_wider <- ifelse(args[5]=="true", T, F) 21 pivot_wider <- ifelse(args[5]=="true", T, F)
20 scientificName_name <- args[3] 22 exclude_NA <- ifelse(args[6]=="true", T, F)
23
24 # regex to only keep genus and specific epithet from scientific names
25 regex_find <- "^([A-Z][^A-Z(]+)(.*)$"
26 regex_replace <- "\\1"
21 27
22 28
23 ### 29 # function to extract the measurement values from the attributes data tibble
24 extract_traits_values <- function(traits_data) { 30 extract_traits_values <- function(traits_data) {
25 result <- setNames(rep(NA, length(measurement_types)), measurement_types) 31 result <- setNames(rep(NA, length(measurement_types)), measurement_types)
26 32
27 if (is.null(traits_data) || nrow(traits_data) == 0) { 33 if (is.null(traits_data) || nrow(traits_data) == 0) {
28 return(result) 34 return(result)
40 result[traits_filtered$measurementType[i]] <- traits_filtered$measurementValue[i] 46 result[traits_filtered$measurementType[i]] <- traits_filtered$measurementValue[i]
41 } 47 }
42 return(result) 48 return(result)
43 } 49 }
44 50
51 # function to call the call the WoRMS API and get the measurement values
45 get_life_history_traits <- function(scientific_name) { 52 get_life_history_traits <- function(scientific_name) {
46 if (scientific_name %in% names(cache)) { 53 clean_scientific_name <- trimws(gsub(regex_find, regex_replace, scientific_name))
47 return(cache[[scientific_name]]) 54
55 if (clean_scientific_name %in% names(cache)) {
56 return(cache[[clean_scientific_name]])
48 } 57 }
49 58
50 worms_id <- tryCatch( 59 worms_id <- tryCatch(
51 wm_name2id(name = scientific_name), 60 wm_name2id(name = clean_scientific_name),
52 error = function(e) NA 61 error = function(e) NA
53 ) 62 )
54 63
55 if (is.na(worms_id) || length(worms_id) == 0) { 64 if (is.na(worms_id) || length(worms_id) == 0) {
56 cache[[scientific_name]] <<- NULL 65 cache[[clean_scientific_name]] <<- NULL
57 return(NULL) 66 return(NULL)
58 } 67 }
59 68
60 data_attr <- tryCatch( 69 data_attr <- tryCatch(
61 wm_attr_data(worms_id, include_inherited=include_inherited), 70 wm_attr_data(worms_id, include_inherited=include_inherited),
62 error = function(e) NULL 71 error = function(e) NULL
63 ) 72 )
64 73
65 if (is.null(data_attr)) { 74 if (is.null(data_attr)) {
66 cache[[scientific_name]] <<- NULL 75 cache[[clean_scientific_name]] <<- NULL
67 return(NULL) 76 return(NULL)
68 } 77 }
69 78
70 traits <- extract_traits_values(data_attr) 79 traits <- extract_traits_values(data_attr)
71 cache[[scientific_name]] <<- traits 80 cache[[clean_scientific_name]] <<- traits
72 return(traits) 81 return(traits)
73 } 82 }
74 83
84 # a cache to limit API calls
75 cache <- list() 85 cache <- list()
76 86
87 # add a columns conataining the lists of values of the measurments requested
77 trait_data <- occurrence %>% 88 trait_data <- occurrence %>%
78 mutate(life_history_traits = map(.data[[scientificName_name]], ~ get_life_history_traits(.x))) 89 mutate(life_history_traits = map(.data[[scientificName_name]], ~ get_life_history_traits(.x)))
79 90
80 view(trait_data) 91 # convert the column of lists to multiple columns of unique values
81 trait_data <- trait_data %>% 92 trait_data <- trait_data %>%
82 unnest_wider(life_history_traits) 93 unnest_wider(life_history_traits)
83 94
84 if (pivot_wider) { 95 # make sur each measurement type has a column
85 trait_data <- dummy_cols(trait_data, select_columns = measurement_types, remove_selected_columns=T, ignore_na=T) 96 for (col in measurement_types) {
86 97 if (!(col %in% names(trait_data))) {
98 trait_data[[col]] <- NA
99 }
87 } 100 }
88 101
102 # list of quantitativ measurements
103 numeric_cols <- c()
104
105 # try to convert columns to numeric and remember them
106 trait_data <- trait_data %>%
107 mutate(across(all_of(measurement_types), ~ {
108 numeric_col <- suppressWarnings(as.numeric(.))
109 if (all(is.na(.) == is.na(numeric_col))) {
110 numeric_cols <<- c(numeric_cols, cur_column())
111 numeric_col
112 } else {
113 .
114 }
115 }))
116
117 # filter NA but only in the added columns
118 if (exclude_NA) {
119 trait_data <- trait_data[complete.cases(trait_data[, measurement_types]),]
120 }
121
122 # determine what are the qualitativ columns to be one hot encoded
123 factor_cols <- setdiff(measurement_types, numeric_cols)
124
125 # one hot encode quantitativ columns
126 if (pivot_wider & length(factor_cols) > 0) {
127 trait_data <- dummy_cols(trait_data, select_columns = factor_cols, remove_selected_columns=T, ignore_na=T)
128 }
129
130 # write the enriched dataset as tabular
89 write.table(trait_data, "enriched_data.tabular", sep="\t", row.names = FALSE) 131 write.table(trait_data, "enriched_data.tabular", sep="\t", row.names = FALSE)