comparison FunctExePlotGLMGalaxy.r @ 0:3ab852a7ff53 draft

"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 04381ca7162ec3ec68419e308194b91d11cacb04"
author ecology
date Mon, 16 Nov 2020 11:02:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3ab852a7ff53
1 #Rscript
2
3 #####################################################################################################################
4 #####################################################################################################################
5 ###################################### Create a plot from your community data #######################################
6 #####################################################################################################################
7 #####################################################################################################################
8
9 ###################### Packages
10 suppressMessages(library(ggplot2))
11 suppressMessages(library(boot))
12
13 ###################### Load arguments and declaring variables
14
15 args <- commandArgs(trailingOnly = TRUE)
16
17
18 if (length(args) < 2) {
19 stop("At least 3 arguments must be supplied input dataset file with GLM results (.tabular)", call. = FALSE) #if no args -> error and exit1
20
21 } else {
22 import_data <- args[1] ###### file name : glm results table
23 data_tab <- args[2] ###### file name : Metrics table
24 unitobs_tab <- args[3] ###### file name : Unitobs table
25 source(args[4]) ###### Import functions
26
27 }
28
29 #Import data
30 glmtable <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
31 datatable <- read.table(data_tab, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
32 unitobs <- read.table(unitobs_tab, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
33
34 #Check files
35
36 vars_data1 <- c("analysis", "Interest.var", "distribution")
37 err_msg_data1 <- "The input GLM results dataset doesn't have the right format. It needs to have at least the following 3 fields :\n- analysis\n- Interest.var\n- distribution\n"
38 check_file(glmtable, err_msg_data1, vars_data1, 4)
39
40 if (length(grep("[0-2][0|9][0-9][0-9].[Estimate|Pvalue]", colnames(glmtable))) == 0) {
41 stop("The input GLM results dataset doesn't have the right format or informations. This tool is to represent temporal trends, if your GLM doesn't take the year variable as a fixed effect this tool is not proper to make any representation of it. It needs to have at least estimates and p-value for every year from your time series GLM as columns with name formated as : yyyy Estimate (example : 2020 Estimate) and yyyy Pvalue (example : 2020 Pvalue).")
42 }
43
44 if (length(grep("[0-2][0|9][0-9][0-9].IC_[up|inf]", colnames(glmtable))) == 0) {
45 assess_ic <- FALSE
46 }else{
47 assess_ic <- TRUE
48 }
49
50 metric <- as.character(glmtable[1, "Interest.var"])
51
52 vars_data2 <- c("observation.unit", "location", metric)
53 err_msg_data2 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 3 fields :\n- observation.unit\n- location\n- the name of the interest metric\n"
54 check_file(datatable, err_msg_data2, vars_data2, 4)
55
56 vars_data3 <- c("observation.unit", "year")
57 err_msg_data3 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 fields :\n- observation.unit\n- year\n"
58 check_file(unitobs, err_msg_data3, vars_data3, 2)
59 if (length(grep("[0-2][0|9][0-9][0-9]", unitobs$year)) == 0) {
60 stop("The year column in the input unitobs dataset doesn't have the right format. Years must be fully written as : yyyy (example : 2020).")
61 }
62
63 if (all(is.na(match(datatable[, "observation.unit"], unitobs[, "observation.unit"])))) {
64 stop("Observation units doesn't match in the inputs metrics dataset and unitobs dataset")
65 }
66
67 ####################################################################################################################
68 ######################### Creating plot from time series GLM data ## Function : ggplot_glm #########################
69 ####################################################################################################################
70 ggplot_glm <- function(glmtable, datatable, unitobs, metric = metric, sp, description = TRUE,
71 trend_on_graph = TRUE, assess_ic = TRUE) {
72 ## Purpose: Creating plot from time series GLM data
73 ## ----------------------------------------------------------------------
74 ## Arguments: glmtable : GLM(s) results table
75 ## datatable : Metrics table
76 ## unitobs : Unitobs table
77 ## metric : Interest variable in GLM(s)
78 ## sp : name of processed GLM
79 ## description : Two graphs ?
80 ## trend_on_graph : Write global trend of the time series on graph ?
81 ## assess_ic : Assess confidence intervals ?
82 ## ----------------------------------------------------------------------
83 ## Author: Coline ROYAUX 13 october 2020
84
85 s_signif <- 0.05 ## threshold when pvalue is considered significant
86 distrib <- as.character(glmtable[1, "distribution"]) ## extract GLM distribution
87
88 col <- c("observation.unit", "location", metric) ## names of needed columns in metrics table to construct the 2nd panel of the graph
89
90 if (colnames(glmtable)[length(glmtable)] == "separation") { ## if GLM is a community analysis
91 cut <- as.character(glmtable[1, "separation"])
92 if (cut != "None") { ## if there is plural GLM analysis performed
93 if (! is.element(as.character(cut), colnames(unitobs))) {
94 stop("The input unitobs dataset doesn't have the right format. If plural GLM analysis have been performed it needs to have the field used as separation factor.")
95 }
96 datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), c("year", cut)]) ## extracting 'year' and analysis separation factor columns from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
97 colnames(datatable) <- c(col, "year", cut)
98 }else{
99 datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), "year"]) ## extracting 'year' column from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
100 colnames(datatable) <- c(col, "year")
101 }
102
103 }else{ ## GLM is a population analysis
104 cut <- "species.code"
105 if (! is.element("species.code", colnames(datatable))) {
106 stop("The input metrics dataset or the GLM results dataset doesn't have the right format. If the GLM is a population analysis, the field species.code needs to be informed in the metrics dataset. If the GLM is a community analysis the field separation (None if only one analysis and name of the separation factor if several analysis) needs to be informed in the last column of the GLM results dataset.")
107 }
108 col <- c(col, cut)
109 datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), "year"]) ## extracting 'year' column from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
110 colnames(datatable) <- c(col, "year")
111 }
112
113 ##vpan vector of names of the two panels in the ggplot
114
115 switch(as.character(metric),
116 "number" = vpan <- c("Abundance variation", "Raw abundance"),
117 "presence_absence" = vpan <- c("Presence-absence variation", "% presence in location"),
118 vpan <- c(paste(metric, " variation"), paste("Mean ", metric)))
119
120 ##Cut table for 1 analysis
121 glmtab <- glmtable[glmtable[, "analysis"] == sp, ]
122
123 glmtab <- glmtab[, grep("FALSE", is.na(glmtab[1, ]))] ## Supress columns with NA only
124
125 ## specification of temporal variable necessary for the analyses
126 an <- as.numeric(unlist(strsplit(gsub("X", "", paste(colnames(glmtab)[grep("[0-2][0|9][0-9][0-9].Estimate", colnames(glmtab))], collapse = " ")), split = ".Estimate"))) ## Extract list of years studied in the GLM
127
128 year <- sort(c(min(an) - 1, an)) ## Add the first level, "reference" in the GLM
129 nbans <- length(year)
130 pasdetemps <- nbans - 1
131
132 ##### Table 1
133
134 coefan <- unlist(lapply(an, FUN = function(x) {
135 if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Estimate"), colnames(glmtab))]) > 0) {
136 glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Estimate"), colnames(glmtab))]
137 }else{
138 NA
139 }
140 })) ## extract estimates for each years to contruct graph 1
141
142 ic_inf <- unlist(lapply(an, FUN = function(x) {
143 if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_inf"), colnames(glmtab))]) > 0) {
144 glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_inf"), colnames(glmtab))]
145 }else{
146 NA
147 }
148 }))
149
150 ic_up <- unlist(lapply(an, FUN = function(x) {
151 if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_up"), colnames(glmtab))]) > 0) {
152 glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_up"), colnames(glmtab))]
153 }else{
154 NA
155 }
156 }))
157
158 switch(distrib, ## Applying the reciprocal of the link function to coefficients and confidence intervals depending on distribution law
159 "poisson" = {
160 coefyear <- c(1, exp(as.numeric(coefan))) ## link function : log
161 if (assess_ic) {
162 ic_inf_sim <- c(1, exp(as.numeric(ic_inf)))
163 ic_sup_sim <- c(1, exp(as.numeric(ic_up)))
164 } else {
165 ic_inf_sim <- NA
166 ic_sup_sim <- NA
167 }},
168 "quasipoisson" = {
169 coefyear <- c(1, exp(as.numeric(coefan))) ## link function : log
170 if (assess_ic) {
171 ic_inf_sim <- c(1, exp(as.numeric(ic_inf)))
172 ic_sup_sim <- c(1, exp(as.numeric(ic_up)))
173 } else {
174 ic_inf_sim <- NA
175 ic_sup_sim <- NA
176 }},
177 "inverse.gaussian" = {
178 coefyear <- c(0, as.numeric(coefan) ^ (- 1 / 2)) ## link function : x^ - 2
179 if (assess_ic) {
180 ic_inf_sim <- c(0, as.numeric(ic_inf) ^ (- 1 / 2))
181 ic_sup_sim <- c(0, as.numeric(ic_up) ^ (- 1 / 2))
182 } else {
183 ic_inf_sim <- NA
184 ic_sup_sim <- NA
185 }},
186 "binomial" = {
187 coefyear <- c(1, boot::inv.logit(as.numeric(coefan))) ## link function : logit
188 if (assess_ic) {
189 ic_inf_sim <- c(1, boot::inv.logit(as.numeric(ic_inf)))
190 ic_sup_sim <- c(1, boot::inv.logit(as.numeric(ic_up)))
191 } else {
192 ic_inf_sim <- NA
193 ic_sup_sim <- NA
194 }},
195 "quasibinomial" = {
196 coefyear <- c(1, boot::inv.logit(as.numeric(coefan))) ## link function : logit
197 if (assess_ic) {
198 ic_inf_sim <- c(1, boot::inv.logit(as.numeric(ic_inf)))
199 ic_sup_sim <- c(1, boot::inv.logit(as.numeric(ic_up)))
200 } else {
201 ic_inf_sim <- NA
202 ic_sup_sim <- NA
203 }},
204 "Gamma" = {
205 coefyear <- c(1, as.numeric(coefan) ^ (- 1)) ## link function : -x^ - 1
206 if (assess_ic) {
207 ic_inf_sim <- c(1, as.numeric(ic_inf) ^ (- 1))
208 ic_sup_sim <- c(1, as.numeric(ic_up) ^ (- 1))
209 } else {
210 ic_inf_sim <- NA
211 ic_sup_sim <- NA
212 }}
213 , {
214 coefyear <- c(0, as.numeric(coefan))
215 if (assess_ic) {
216 ic_inf_sim <- c(0, as.numeric(ic_inf))
217 ic_sup_sim <- c(0, as.numeric(ic_up))
218 } else {
219 ic_inf_sim <- NA
220 ic_sup_sim <- NA
221 }})
222
223 pval <- c(1, unlist(lapply(an, FUN = function(x) {
224 if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Pvalue"), colnames(glmtab))]) > 0) {
225 glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Pvalue"), colnames(glmtab))]
226 }else{
227 NA
228 }
229 }))) ## extract p value for each year
230
231
232 tab1 <- data.frame(year, val = coefyear, ## table for the graphical output 1
233 ll = unlist(ic_inf_sim), ul = unlist(ic_sup_sim),
234 catPoint = ifelse(pval < s_signif, "significatif", NA), pval,
235 courbe = vpan[1],
236 panel = vpan[1])
237 ## cleaning of wrong or biaised measures of the confidence interval
238 if (assess_ic) {
239 tab1$ul <- ifelse(tab1$ul == Inf, NA, tab1$ul)
240 tab1$ul <- ifelse(tab1$ul > 1.000000e+20, NA, tab1$ul)
241 tab1$val <- ifelse(tab1$val > 1.000000e+20, 1.000000e+20, tab1$val)
242 }
243
244 coefancontinu <- as.numeric(as.character(glmtab[glmtab[, 1] == sp, grep("year.Estimate", colnames(glmtab))])) ## tendency of population evolution on the studied period = regression coefficient of the variable year as a continuous variable in the GLM
245 switch(distrib, ## Applying the reciprocal of the link function to coefficients and confidence intervals depending on distribution law
246 "poisson" = {
247 trend <- round(exp(as.numeric(coefancontinu)), 3) ## link function : log
248 pourcentage <- round((exp(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
249 },
250 "quasipoisson" = {
251 trend <- round(exp(as.numeric(coefancontinu)), 3) ## link function : log
252 pourcentage <- round((exp(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
253 },
254 "inverse.gaussian" = {
255 trend <- round(as.numeric(coefancontinu) ^ (- 1 / 2), 3) ## link function : x^ - 2
256 pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps)) ^ (- 1 / 2)) - 1) * 100, 2)
257 },
258 "binomial" = {
259 trend <- round(boot::inv.logit(as.numeric(coefancontinu)), 3) ## link function : logit
260 pourcentage <- round((boot::inv.logit(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
261 },
262 "quasibinomial" = {
263 trend <- round(boot::inv.logit(as.numeric(coefancontinu)), 3) ## link function : logit
264 pourcentage <- round((boot::inv.logit(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
265 },
266 "Gamma" = {
267 trend <- round(as.numeric(coefancontinu) ^ (- 1), 3) ## link function : -x^ - 1
268 pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps)) ^ (- 1)) - 1) * 100, 2)
269 }
270 , {
271 trend <- round(as.numeric(coefancontinu), 3)
272 pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps))) - 1) * 100, 2)
273 })
274
275 pval <- as.numeric(as.character(glmtab[glmtab[, 1] == sp, grep("year.Pvalue", colnames(glmtab))])) ## Extract p value
276
277 ## table for global trend on the whole time series
278 tab1t <- NULL
279 if (length(pval) > 0) {
280 tab1t <- data.frame(Est = trend, pourcent = pourcentage, signif = pval < s_signif, pval)
281 }
282
283 ##### Table 2
284
285 if (sp == "global") { ## prepare metrics table to extract variables used in graph 2
286 datatablecut <- datatable[grep("FALSE", is.na(datatable[, as.character(metric)])), ]
287
288 }else{
289
290 datatablecut <- datatable[datatable[, as.character(cut)] == sp, ]
291 datatablecut <- datatablecut[grep("FALSE", is.na(datatablecut[, as.character(metric)])), ]
292 }
293
294 switch(as.character(metric), ## Different value represented graph 2 depending on the nature of the metric
295 "number" = {
296 valplot <- lapply(sort(year), FUN = function(x) {
297 sum(na.omit(as.numeric(subset(datatablecut, year == x)[, as.character(metric)])))
298 })
299 }, ## sum if abundance
300 "presence_absence" = {
301 nb_loc <- lapply(sort(year), FUN = function(x) {
302 length(unique(subset(datatablecut, year == x)[, "location"]))
303 }) ## number of plots per year
304 nb_loc_presence <- lapply(sort(year), FUN = function(x) {
305 length(unique(subset(datatablecut[datatablecut[, metric] > 0, ], year == x)[, "location"]))
306 }) ## number of plots where the species were observed
307 valplot <- (na.omit(as.numeric(nb_loc_presence)) / na.omit(as.numeric(nb_loc))) * 100
308 } ## % of presence in observered plots if presence / absence
309 , {
310 valplot <- lapply(sort(year), FUN = function(x) {
311 mean(na.omit(as.numeric(subset(datatablecut, year == x)[, as.character(metric)])))
312 })
313 } ## mean if any other metric
314 )
315
316 tab2 <- data.frame(year, val = round(as.numeric(valplot), 2), ll = NA, ul = NA, catPoint = NA, pval = NA,
317 courbe = vpan[2], panel = vpan[2])
318
319 ## Creating ggplots
320
321 dgg <- tab1
322
323 figname <- paste(sp, ".png", sep = "")
324
325 ## coord for horizontal lines in graphs
326 hline_data1 <- data.frame(z = tab1$val[1], panel = c(vpan[1]), couleur = "var estimates", type = "var estimates")
327 hline_data3 <- data.frame(z = 0, panel = vpan[2], couleur = "seuil", type = "seuil")
328 hline_data <- rbind(hline_data1, hline_data3)
329 titre <- paste(sp)
330
331 ## text for the population evolution trend
332
333 pasdetemps <- max(dgg$year) - min(dgg$year) + 1
334 if (! is.null(tab1t)) {
335 if (assess_ic) {
336 txt_pente1 <- paste("Global trend : ", tab1t$Est,
337 ifelse(tab1t$signif, " *", ""),
338 ifelse(tab1t$signif, paste("\n", ifelse(tab1t$pourcent > 0, "+ ", "- "),
339 abs(tab1t$pourcent), " % in ", pasdetemps, " years", sep = ""), ""), sep = "")
340 }else{
341 txt_pente1 <- ifelse(tab1t$signif, paste("\n", ifelse(tab1t$pourcent > 0, "+ ", "- "),
342 abs(tab1t$pourcent), " % in ", pasdetemps, " years", sep = ""), "")
343 }
344 }else{
345 trend_on_graph <- FALSE
346 }
347
348 ## table of the text for the population evolution trend
349 tab_text_pent <- data.frame(y = c(max(c(dgg$val, dgg$ul), na.rm = TRUE) * .9),
350 x = median(dgg$year),
351 txt = ifelse(trend_on_graph, c(txt_pente1), ""),
352 courbe = c(vpan[1]), panel = c(vpan[1]))
353
354 dgg <- rbind(tab1, tab2)
355
356 ## colors for plots
357 vec_col_point <- c("#ffffff", "#eeb40f", "#ee0f59")
358 names(vec_col_point) <- c("significatif", "infSeuil", "0")
359 vec_col_courbe <- c("#3c47e0", "#5b754d", "#55bb1d", "#973ce0")
360 names(vec_col_courbe) <- c(vpan[1], "loc", "presence", vpan[2])
361 vec_col_hline <- c("#ffffff", "#e76060")
362 names(vec_col_hline) <- c("var estimates", "seuil")
363
364 col <- c(vec_col_point, vec_col_courbe, vec_col_hline)
365 names(col) <- c(names(vec_col_point), names(vec_col_courbe), names(vec_col_hline))
366
367 p <- ggplot2::ggplot(data = dgg, mapping = ggplot2::aes_string(x = "year", y = "val"))
368 ## Titles and scales
369 p <- p + facet_grid(panel ~ ., scale = "free") +
370 theme(legend.position = "none",
371 panel.grid.minor = element_blank(),
372 panel.grid.major.y = element_blank(),
373 axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
374 ylab("") + xlab("year") + ggtitle(titre) +
375 scale_colour_manual(values = col, name = "",
376 breaks = names(col)) +
377 scale_x_continuous(breaks = min(dgg$year):max(dgg$year))
378 p <- p + ggplot2::geom_hline(data = hline_data, mapping = ggplot2::aes_string(yintercept = "z", colour = "couleur", linetype = "type"),
379 alpha = 1, size = 1.2)
380 if (assess_ic) { ############# ONLY FOR THE CONFIDENCE INTERVAL
381 p <- p + ggplot2::geom_ribbon(mapping = ggplot2::aes_string(ymin = "ll", ymax = "ul"), fill = col[vpan[1]], alpha = .2)
382 p <- p + ggplot2::geom_pointrange(mapping = ggplot2::aes_string(y = "val", ymin = "ll", ymax = "ul"), fill = col[vpan[1]], alpha = .2)
383 }
384
385 p <- p + ggplot2::geom_line(mapping = ggplot2::aes_string(colour = "courbe"), size = 1.5)
386 p <- p + ggplot2::geom_point(mapping = ggplot2::aes_string(colour = "courbe"), size = 3)
387 alph <- ifelse(!is.na(dgg$catPoint), 1, 0)
388 p <- p + ggplot2::geom_point(mapping = ggplot2::aes_string(colour = "catPoint", alpha = alph), size = 2)
389 p <- p + ggplot2::geom_text(data = tab_text_pent, mapping = ggplot2::aes_string("x", "y", label = "txt"), parse = FALSE, color = col[vpan[1]], fontface = 2, size = 4)
390 ggplot2::ggsave(figname, p, width = 16, height = 15, units = "cm")
391 }
392 ############################################################################################################ fin fonction graphique / end of function for graphical output
393
394 ################# Analysis
395
396 for (sp in glmtable[, 1]) {
397
398 if (!all(is.na(glmtable[glmtable[, 1] == sp, 4:(length(glmtable) - 1)]))) { ##ignore lines with only NA
399 ggplot_glm(glmtable = glmtable, datatable = datatable, unitobs = unitobs, metric = metric, sp = sp, description = TRUE, trend_on_graph = TRUE, assess_ic = assess_ic)
400 }
401 }
402
403 sink("stdout.txt", split = TRUE)