changeset 1:6f75ab89587a draft default tip

planemo upload for repository https://github.com/jeanlecras/tools-ecology/tree/master/tools/WormsMeasurements commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
author ecology
date Wed, 28 May 2025 10:13:42 +0000
parents 23b963a1284e
children
files test-data/enriched_data.tabular test-data/enriched_data_inherited.tabular test-data/enriched_data_inherited_ohe.tabular test-data/sample.tabular wormsmeasurements.R wormsmeasurements.xml
diffstat 6 files changed, 93 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/enriched_data.tabular	Wed May 14 15:08:00 2025 +0000
+++ b/test-data/enriched_data.tabular	Wed May 28 10:13:42 2025 +0000
@@ -1,4 +1,4 @@
-"decimalLatitude"	"decimalLongitude"	"scientificName"	"Development"	" Fecundity"
+"decimalLatitude"	"decimalLongitude"	"scientificName"	"Development"	"Fecundity"
 -49.355	70.215	"Abatus cordatus"	NA	NA
 -66.963303	163.223297	"Ctenocidaris spinosa"	NA	NA
 -42.45	-74.75833333	"Loxechinus albus"	NA	NA
--- a/test-data/enriched_data_inherited.tabular	Wed May 14 15:08:00 2025 +0000
+++ b/test-data/enriched_data_inherited.tabular	Wed May 28 10:13:42 2025 +0000
@@ -1,4 +1,4 @@
-"decimalLatitude"	"decimalLongitude"	"scientificName"	"Development"	" Fecundity"
+"decimalLatitude"	"decimalLongitude"	"scientificName"	"Development"	"Fecundity"
 -49.355	70.215	"Abatus cordatus"	"planktotrophic"	NA
 -66.963303	163.223297	"Ctenocidaris spinosa"	"direct development"	NA
 -42.45	-74.75833333	"Loxechinus albus"	"planktotrophic"	NA
--- a/test-data/enriched_data_inherited_ohe.tabular	Wed May 14 15:08:00 2025 +0000
+++ b/test-data/enriched_data_inherited_ohe.tabular	Wed May 28 10:13:42 2025 +0000
@@ -1,11 +1,11 @@
-"decimalLatitude"	"decimalLongitude"	"scientificName"	"Development_direct development"	"Development_planktotrophic"	" Fecundity_"
--49.355	70.215	"Abatus cordatus"	0	1	0
--66.963303	163.223297	"Ctenocidaris spinosa"	1	0	0
--42.45	-74.75833333	"Loxechinus albus"	0	1	0
--37.606167	176.705167	"Ogmocidaris benhami"	0	1	0
--36.201698	175.834198	"Peronella hinemoae"	0	1	0
--37.494667	176.672501	"Phormosoma bursarium"	0	1	0
--43.469	173.572	"Pseudechinus huttoni"	0	1	0
--47.7	179.45	"Pseudechinus novaezealandiae"	0	1	0
--74.72	164.2183333	"Sterechinus neumayeri"	0	1	0
--70.51166667	-8.801	"Sterechinus sp"	NA	NA	0
+"decimalLatitude"	"decimalLongitude"	"scientificName"	"Fecundity"	"Development_direct development"	"Development_planktotrophic"
+-49.355	70.215	"Abatus cordatus"	NA	0	1
+-66.963303	163.223297	"Ctenocidaris spinosa"	NA	1	0
+-42.45	-74.75833333	"Loxechinus albus"	NA	0	1
+-37.606167	176.705167	"Ogmocidaris benhami"	NA	0	1
+-36.201698	175.834198	"Peronella hinemoae"	NA	0	1
+-37.494667	176.672501	"Phormosoma bursarium"	NA	0	1
+-43.469	173.572	"Pseudechinus huttoni"	NA	0	1
+-47.7	179.45	"Pseudechinus novaezealandiae"	NA	0	1
+-74.72	164.2183333	"Sterechinus neumayeri"	NA	0	1
+-70.51166667	-8.801	"Sterechinus sp"	NA	NA	NA
--- a/test-data/sample.tabular	Wed May 14 15:08:00 2025 +0000
+++ b/test-data/sample.tabular	Wed May 28 10:13:42 2025 +0000
@@ -1,11 +1,11 @@
 "decimalLatitude"	"decimalLongitude"	"scientificName"
-"1838"	-66.963303	163.223297	"Ctenocidaris spinosa"
-"7923"	-70.51166667	-8.801	"Sterechinus sp"
-"4410"	-37.494667	176.672501	"Phormosoma bursarium"
-"10574"	-37.606167	176.705167	"Ogmocidaris benhami"
-"2050"	-74.72	164.2183333	"Sterechinus neumayeri"
-"2531"	-47.7	179.45	"Pseudechinus novaezealandiae"
-"4145"	-49.355	70.215	"Abatus cordatus"
-"9337"	-36.201698	175.834198	"Peronella hinemoae"
-"7481"	-43.469	173.572	"Pseudechinus huttoni"
-"9986"	-42.45	-74.75833333	"Loxechinus albus"
+-66.963303	163.223297	"Ctenocidaris spinosa"
+-70.51166667	-8.801	"Sterechinus sp"
+-37.494667	176.672501	"Phormosoma bursarium"
+-37.606167	176.705167	"Ogmocidaris benhami"
+-74.72	164.2183333	"Sterechinus neumayeri"
+-47.7	179.45	"Pseudechinus novaezealandiae"
+-49.355	70.215	"Abatus cordatus"
+-36.201698	175.834198	"Peronella hinemoae"
+-43.469	173.572	"Pseudechinus huttoni"
+-42.45	-74.75833333	"Loxechinus albus"
--- a/wormsmeasurements.R	Wed May 14 15:08:00 2025 +0000
+++ b/wormsmeasurements.R	Wed May 28 10:13:42 2025 +0000
@@ -13,14 +13,20 @@
     stop("This tool needs at least one argument")
 }
 
-occurrence <- read.csv(args[1], header=T, sep="\t") %>% arrange(scientificName)
+scientificName_name <- args[3]
+occurrence <- read.csv(args[1], header=T, sep="\t") %>% 
+  arrange(.[[scientificName_name]])
 measurement_types <- unlist(str_split(args[2], ","))
 include_inherited <- ifelse(args[4]=="true", T, F)
 pivot_wider <- ifelse(args[5]=="true", T, F)
-scientificName_name <- args[3]
+exclude_NA <- ifelse(args[6]=="true", T, F)
+
+# regex to only keep genus and specific epithet from scientific names
+regex_find <- "^([A-Z][^A-Z(]+)(.*)$"
+regex_replace <- "\\1"
 
 
-### 
+# function to extract the measurement values from the attributes data tibble
 extract_traits_values <- function(traits_data) {
   result <- setNames(rep(NA, length(measurement_types)), measurement_types)
   
@@ -42,18 +48,21 @@
   return(result)
 }
 
+# function to call the WoRMS API and get the measurement values
 get_life_history_traits <- function(scientific_name) {
-  if (scientific_name %in% names(cache)) { 
-    return(cache[[scientific_name]])  
+  clean_scientific_name <- trimws(gsub(regex_find, regex_replace, scientific_name))
+
+  if (clean_scientific_name %in% names(cache)) { 
+    return(cache[[clean_scientific_name]])  
   }
   
   worms_id <- tryCatch(
-    wm_name2id(name = scientific_name),
+    wm_name2id(name = clean_scientific_name),
     error = function(e) NA
   )
   
   if (is.na(worms_id) || length(worms_id) == 0) {
-    cache[[scientific_name]] <<- NULL
+    cache[[clean_scientific_name]] <<- NULL
     return(NULL)
   }
   
@@ -63,27 +72,60 @@
   )
   
   if (is.null(data_attr)) {
-    cache[[scientific_name]] <<- NULL
+    cache[[clean_scientific_name]] <<- NULL
     return(NULL)
   }
   
   traits <- extract_traits_values(data_attr)
-  cache[[scientific_name]] <<- traits
+  cache[[clean_scientific_name]] <<- traits
   return(traits)
 }
 
+# a cache to limit API calls
 cache <- list()
 
+# add a column containing the lists of values of the requested measurements
 trait_data <- occurrence %>%
   mutate(life_history_traits = map(.data[[scientificName_name]], ~ get_life_history_traits(.x)))
 
-view(trait_data)
+# convert the column of lists to multiple columns of unique values
 trait_data <- trait_data %>%
   unnest_wider(life_history_traits)
 
-if (pivot_wider) {
-  trait_data <- dummy_cols(trait_data, select_columns = measurement_types, remove_selected_columns=T, ignore_na=T)
-
+# make sure each measurement type has a column
+for (col in measurement_types) {
+  if (!(col %in% names(trait_data))) {
+    trait_data[[col]] <- NA
+  }
 }
 
+# list of quantitative measurements
+numeric_cols <- c()
+
+# try to convert columns to numeric and remember them
+trait_data <- trait_data %>%
+  mutate(across(all_of(measurement_types), ~ {
+    numeric_col <- suppressWarnings(as.numeric(.))
+    if (all(is.na(.) == is.na(numeric_col))) {
+      numeric_cols <<- c(numeric_cols, cur_column())
+      numeric_col
+    } else {
+      .
+    }
+  }))
+
+# filter NA but only in the added columns
+if (exclude_NA) {
+  trait_data <- trait_data[complete.cases(trait_data[, measurement_types]),]
+}
+
+# determine which columns are qualitative and should be one-hot encoded
+factor_cols <- setdiff(measurement_types, numeric_cols)
+
+# one-hot encode the qualitative columns
+if (pivot_wider & length(factor_cols) > 0) {
+  trait_data <- dummy_cols(trait_data, select_columns = factor_cols, remove_selected_columns=T, ignore_na=T)
+}
+
+# write the enriched dataset as tabular
 write.table(trait_data, "enriched_data.tabular", sep="\t", row.names = FALSE)
\ No newline at end of file
--- a/wormsmeasurements.xml	Wed May 14 15:08:00 2025 +0000
+++ b/wormsmeasurements.xml	Wed May 28 10:13:42 2025 +0000
@@ -1,4 +1,4 @@
-<tool id="WormsMeasurements" name="Enrich dataset with WoRMS" version="0.1.1" profile="23.2">
+<tool id="WormsMeasurements" name="Enrich dataset with WoRMS" version="0.1.2" profile="23.2">
     <description>Enrich dataset with measurement type data from WoRMS</description>
 
     <requirements>
@@ -16,15 +16,20 @@
     '$scientic_name'
     '$include_inherited'
     '$pivot_wider'
+    '$exclude_NA'
     '$output'
     ]]></command>
 
     <inputs>
         <param name="data" type="data" format="tabular" label="occurrence data"/>
-        <param name="measurement_types" type="text" format="txt" label="list of measurement types"/>
-        <param name="scientic_name" type="text" format="txt" label="scientific names column name" value="scientificName"/>
+        <param name="measurement_types" type="text" format="txt" label="list of measurement types">
+        </param>
+        <param name="scientic_name" type="text" format="txt" label="scientific names column name" value="scientificName" optional="false">
+            <validator type="regex" message="this field can't be empty">.+</validator>
+        </param>
         <param name="include_inherited" type="boolean" label="include attributes inherited from parent taxon" checked="false"/>        
         <param name="pivot_wider" type="boolean" label="one hot encoding on the measurement types" checked="false"/>
+        <param name="exclude_NA" type="boolean" label="exclude lines with missing values (NA)" checked="false"/>
     </inputs>
 
     <outputs>
@@ -34,26 +39,29 @@
     <tests>
         <test>
             <param name="data" value="sample.tabular"/>
-            <param name="measurement_types" value="Development, Fecundity"/>
+            <param name="measurement_types" value="Development,Fecundity"/>
             <param name="scientic_name" value="scientificName"/>
             <param name="include_inherited" value="true"/>
             <param name="pivot_wider" value="false"/>
+            <param name="exclude_NA" value="false"/>
             <output name="output" file="enriched_data_inherited.tabular"/>
         </test>
         <test>
             <param name="data" value="sample.tabular"/>
-            <param name="measurement_types" value="Development, Fecundity"/>
+            <param name="measurement_types" value="Development,Fecundity"/>
             <param name="scientic_name" value="scientificName"/>
             <param name="include_inherited" value="false"/>
             <param name="pivot_wider" value="false"/>
+            <param name="exclude_NA" value="false"/>
             <output name="output" file="enriched_data.tabular"/>
         </test>
         <test>
             <param name="data" value="sample.tabular"/>
-            <param name="measurement_types" value="Development, Fecundity"/>
+            <param name="measurement_types" value="Development,Fecundity"/>
             <param name="include_inherited" value="true"/>
             <param name="scientic_name" value="scientificName"/>
             <param name="pivot_wider" value="true"/>
+            <param name="exclude_NA" value="false"/>
             <output name="output" file="enriched_data_inherited_ohe.tabular"/>
         </test>
     </tests>
@@ -76,6 +84,7 @@
 - **scientific names column name**: the name of column in the dataset containing scientific names.
 - **include attributes inherited from parent taxon**: usefull when the data you are looking for are incomplete.
 - **one hot encoding on the measurement types**: each possible values of a measurementType becomes a column encoded in 0/1
+- **exclude lines with missing values (NA)**: exclude lines where any requested measurement value is missing
 
 ## Outputs: