Mercurial > repos > ecology > pampa_glmcomm
diff FunctExeCalcGLMGalaxy.r @ 4:44b9069775ca draft
"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 3df1978827a91be30e815dee2ed83a92862d1b1c"
author | ecology |
---|---|
date | Sun, 22 Nov 2020 18:40:23 +0000 |
parents | f0dc3958e65d |
children | 8db02073b671 |
line wrap: on
line diff
--- a/FunctExeCalcGLMGalaxy.r Mon Jul 27 09:52:47 2020 -0400 +++ b/FunctExeCalcGLMGalaxy.r Sun Nov 22 18:40:23 2020 +0000 @@ -1,4 +1,4 @@ -#Rscript +#Rscript ##################################################################################################################### ##################################################################################################################### @@ -8,270 +8,253 @@ ###################### Packages suppressMessages(library(multcomp)) +suppressMessages(library(DHARMa)) suppressMessages(library(glmmTMB)) ###Version: 0.2.3 -suppressMessages(library(gap)) +suppressMessages(library(gap)) + ###################### Load arguments and declaring variables -args = commandArgs(trailingOnly=TRUE) -#options(encoding = "UTF-8") +args <- commandArgs(trailingOnly = TRUE) if (length(args) < 10) { - stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call.=FALSE) #si pas d'arguments -> affiche erreur et quitte / if no args -> error and exit1 + stop("At least 4 arguments must be supplied : \n- two input dataset files (.tabular) : metrics table and unitobs table \n- Interest variable field from metrics table \n- Response variable from unitobs table.", call. = FALSE) # if no args -> error and exit1 } else { - Importdata <- args[1] ###### file name : metrics table - ImportUnitobs <- args[2] ###### file name : unitobs informations + import_data <- args[1] ###### file name : metrics table + import_unitobs <- args[2] ###### file name : unitobs informations colmetric <- as.numeric(args[3]) ###### Selected interest metric for GLM - listFact <- strsplit(args [4],",")[[1]] ###### Selected response factors for GLM - listRand <- strsplit(args [5],",")[[1]] ###### Selected randomized response factors for GLM - colFactAna <- args[6] ####### (optional) Selected splitting factors for GLMs - Distrib <- args[7] ###### (optional) Selected distribution for GLM - log <- args[8] ###### (Optional) Log on interest metric ? + list_fact <- strsplit(args [4], ",")[[1]] ###### Selected response factors for GLM + list_rand <- strsplit(args [5], ",")[[1]] ###### Selected randomized response factors for GLM + col_fact_ana <- args[6] ####### (optional) Selected splitting factors for GLMs + distrib <- args[7] ###### (optional) Selected distribution for GLM + glm_out <- args[8] ###### (Optional) GLM object as Rdata output ? aggreg <- args[9] ###### Aggregation level of the data table source(args[10]) ###### Import functions } -#### Data must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number") + + +#### d_ata must be a dataframe with at least 3 variables : unitobs representing location and year ("observation.unit"), species code ("species.code") and abundance ("number") -#Import des données / Import data -obs<- read.table(Importdata,sep="\t",dec=".",header=TRUE,encoding="UTF-8") # -obs[obs == -999] <- NA +#Import des données / Import data +obs <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") # +obs[obs == -999] <- NA metric <- colnames(obs)[colmetric] -tabUnitobs <- read.table(ImportUnitobs,sep="\t",dec=".",header=TRUE,encoding="UTF-8") -tabUnitobs[tabUnitobs == -999] <- NA +tab_unitobs <- read.table(import_unitobs, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") +tab_unitobs[tab_unitobs == -999] <- NA -if (colFactAna != "None") -{ - FactAna <- colnames(tabUnitobs)[as.numeric(colFactAna)] - if (class(tabUnitobs[FactAna]) == "numeric" || FactAna == "observation.unit"){stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor")} +if (col_fact_ana != "None") { + fact_ana <- colnames(tab_unitobs)[as.numeric(col_fact_ana)] + if (class(tab_unitobs[fact_ana]) == "numeric" || fact_ana == "observation.unit") { + stop("Wrong chosen separation factor : Analysis can't be separated by observation unit or numeric factor") + } }else{ - FactAna <- colFactAna + fact_ana <- col_fact_ana +} + +vars_data1 <- NULL +err_msg_data1 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- numeric or integer metric\n" +check_file(obs, err_msg_data1, vars_data1, 2) + +vars_data2 <- c("observation.unit", list_fact, list_rand) +err_msg_data2 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n" +check_file(tab_unitobs, err_msg_data2, vars_data2[vars_data2 != "None"], 2) + +if (all(c(list_fact, list_rand) == "None")) { + stop("GLM needs to have at least one response variable.") +} + +if (list_fact[1] == "None" || all(is.element(list_fact, list_rand))) { + stop("GLM can't have only random effects.") } -#factors <- fact.det.f(Obs=obs) - -vars_data1<- NULL -err_msg_data1<-"The input metrics dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- numeric or integer metric\n" -check_file(obs,err_msg_data1,vars_data1,2) - -vars_data2 <- c(listFact,listRand) -err_msg_data2<-"The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 variables :\n- observation.unit (or year and site)\n- factors used in GLM (habitat, year and/or site)\n" -check_file(tabUnitobs,err_msg_data2,vars_data2[vars_data2 != "None"],2) - #################################################################################################### -########## Computing Generalized Linear Model ## Function : modeleLineaireWP2.unitobs.f ############ +######### Computing Generalized Linear Model ## Function : glm_community ############ #################################################################################################### -modeleLineaireWP2.unitobs.f <- function(metrique, listFact, listRand, FactAna, Distrib, log=FALSE, tabMetrics, tableMetrique, tabUnitobs, unitobs="observation.unit", nbName="number") -{ - ## Purpose: Monitoring steps for GLM on unitobs +glm_community <- function(metrique, list_fact, list_rand, fact_ana, distrib, tab_metrics, tab_metrique, tab_unitobs, unitobs = "observation.unit", nb_name = "number") { + ## Purpose: Monitoring steps for GLM on community data ## ---------------------------------------------------------------------- ## Arguments: metrique : selected metric - ## listFact : Factors for GLM - ## listRand : Random factors for GLM - ## factAna : Separation factor for GLMs - ## Distrib : selected distribution for model - ## log : log transformation on data ? boolean - ## tabMetrics : data table metrics - ## tableMetrique : data table's name - ## tabUnitobs : data table unitobs + ## list_fact : Factors for GLM + ## list_rand : Random factors for GLM + ## fact_ana : Separation factor for GLMs + ## distrib : selected distribution for model + ## tab_metrics : data table metrics + ## tab_metrique : data table's name + ## tab_unitobs : data table unitobs ## ---------------------------------------------------------------------- ## Author: Yves Reecht, Date: 18 août 2010, 15:59 modified by Coline ROYAUX 04 june 2020 - tmpData <- tabMetrics + tmpd_ata <- tab_metrics - if (listRand[1] != "None") - { - if (all(is.element(listFact,listRand)) || listFact[1] == "None") - { - RespFact <- paste("(1|",paste(listRand,collapse=") + (1|"),")") - listF <- NULL - listFact <- listRand - }else{ - listF <- listFact[!is.element(listFact,listRand)] - RespFact <- paste(paste(listF, collapse=" + ")," + (1|",paste(listRand,collapse=") + (1|"),")") - listFact <- c(listF,listRand) - } - }else{ - listF <- listFact - RespFact <- paste(listFact, collapse=" + ") - } + out_fact <- .GlobalEnv$organise_fact(list_rand = list_rand, list_fact = list_fact) + resp_fact <- out_fact[[1]] + list_f <- out_fact[[2]] + list_fact <- out_fact[[3]] ##Creating model's expression : - - #if (log == FALSE) { - exprML <- eval(parse(text=paste(metrique, "~", RespFact))) - #}else{ - # exprML <- eval(parse(text=paste("log(",metrique,")", "~", RespFact))) - #} + expr_lm <- eval(parse(text = paste(metrique, "~", resp_fact))) ##Creating analysis table : - listFactTab <- c(listFact, FactAna) - listFactTab <- listFactTab[listFactTab != "None"] + list_fact_tab <- c(list_fact, fact_ana) + list_fact_tab <- list_fact_tab[list_fact_tab != "None"] - if (all(is.na(match(tmpData[,unitobs],tabUnitobs[,unitobs])))) {stop("Observation units doesn't match in the two input tables")} + if (all(is.na(match(tmpd_ata[, unitobs], tab_unitobs[, unitobs])))) { + stop("Observation units doesn't match in the two input tables") + } - if(! is.element("species.code",colnames(tmpData))) - { - col <- c(unitobs,metrique) - tmpData <- cbind(tmpData[,col], tabUnitobs[match(tmpData[,unitobs],tabUnitobs[,unitobs]),listFactTab]) - colnames(tmpData) <- c(col,listFactTab) + if (! is.element("species.code", colnames(tmpd_ata))) { + col <- c(unitobs, metrique) + tmpd_ata <- cbind(tmpd_ata[, col], tab_unitobs[match(tmpd_ata[, unitobs], tab_unitobs[, unitobs]), list_fact_tab]) + colnames(tmpd_ata) <- c(col, list_fact_tab) - for (i in listFactTab) { + for (i in list_fact_tab) { switch(i, - tmpData[,i] <- as.factor(tmpData[,i])) + tmpd_ata[, i] <- as.factor(tmpd_ata[, i])) } }else{ stop("Warning : wrong data frame, data frame should be aggregated by observation unit (year and site)") } ## Suppress unsed 'levels' : - tmpData <- dropLevels.f(tmpData) + tmpd_ata <- .GlobalEnv$drop_levels_f(tmpd_ata) ## Automatic choice of distribution if none is selected by user : - if (Distrib == "None") - { - switch(class(tmpData[,metrique]), - "integer"={loiChoisie <- "poisson"}, - "numeric"={loiChoisie <- "gaussian"}, - stop("Selected metric class doesn't fit, you should select an integer or a numeric variable")) + + chose_distrib <- .GlobalEnv$distrib_choice(distrib = distrib, metrique = metrique, data = tmpd_ata) + + if (fact_ana != "None" && nlevels(tmpd_ata[, fact_ana]) > 1) { + ana_cut <- levels(tmpd_ata[, fact_ana]) }else{ - loiChoisie <- Distrib - } - - - ## Compute Model(s) : - - if (FactAna != "None" && nlevels(tmpData[,FactAna]) > 1) - { - Anacut <- levels(tmpData[,FactAna]) - }else{ - Anacut <- NULL + ana_cut <- NULL } - ##Create results table : - lev <- unlist(lapply(listF,FUN=function(x){levels(tmpData[,x])})) - - if (listRand[1] != "None") ## if random effects - { - TabSum <- data.frame(analysis=c("global", Anacut),AIC=NA,BIC=NA,logLik=NA, deviance=NA,df.resid=NA) - colrand <- unlist(lapply(listRand, - FUN=function(x){lapply(c("Std.Dev","NbObservation","NbLevels"), - FUN=function(y){paste(x,y,collapse = ":") - }) - })) - TabSum[,colrand] <- NA + ##Create results table : + lev <- unlist(lapply(list_f, FUN = function(x) { + levels(tmpd_ata[, x]) + })) + row <- c("global", ana_cut) - if (! is.null(lev)) ## if fixed effects + random effects - { - colcoef <- unlist(lapply(c("(Intercept)",lev), - FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"), - FUN=function(y){paste(x,y,collapse = ":") - }) - })) - }else{ ## if no fixed effects - colcoef <- NULL - } + if (is.element("year", list_f) && ! is.element("year", list_rand)) { + tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = unlist(c("year", lev)), distrib = chose_distrib) + }else{ + tab_sum <- .GlobalEnv$create_res_table(list_rand = list_rand, list_fact = list_fact, row = row, lev = lev, distrib = chose_distrib) + } + ### creating rate table + tab_rate <- data.frame(analysis = row, complete_plan = NA, balanced_plan = NA, NA_proportion_OK = NA, no_residual_dispersion = NA, uniform_residuals = NA, outliers_proportion_OK = NA, no_zero_inflation = NA, observation_factor_ratio_OK = NA, enough_levels_random_effect = NA, rate = NA) - }else{ ## if no random effects - TabSum <- data.frame(analysis=c("global", Anacut),AIC=NA,Resid.deviance=NA,df.resid=NA,Null.deviance=NA,df.null=NA) + ##plural analysis + for (cut in ana_cut) { + cutd_ata <- tmpd_ata[grep(cut, tmpd_ata[, fact_ana]), ] + cutd_ata <- .GlobalEnv$drop_levels_f(cutd_ata) + + res <- "" + resy <- "" - switch(loiChoisie, - "gaussian"={colcoef <- unlist(lapply(c("(Intercept)",lev), - FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"), - FUN=function(y){paste(x,y,collapse = ":") - }) - }))}, - "quasipoisson"={colcoef <- unlist(lapply(c("(Intercept)",lev), - FUN=function(x){lapply(c("Estimate","Std.Err","Tvalue","Pvalue","signif"), - FUN=function(y){paste(x,y,collapse = ":") - }) - }))}, - colcoef <- unlist(lapply(c("(Intercept)",lev), - FUN=function(x){lapply(c("Estimate","Std.Err","Zvalue","Pvalue","signif"), - FUN=function(y){paste(x,y,collapse = ":") - }) - }))) + if (list_rand[1] != "None") { + res <- tryCatch(glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) { + }) + if (is.element("year", list_f) && ! is.element("year", list_rand)) { #Model with year as continuous + cutd_ata$year <- as.numeric(cutd_ata$year) + resy <- tryCatch(glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) { + }) + cutd_ata$year <- as.factor(cutd_ata$year) + }else{ + resy <- "" + } - } - - TabSum[,colcoef] <- NA - - ### creating rate table - TabRate <- data.frame(analysis=c("global", Anacut), complete_plan=NA, balanced_plan=NA, NA_proportion_OK=NA, no_residual_dispersion=NA, uniform_residuals=NA, outliers_proportion_OK=NA, no_zero_inflation=NA, observation_factor_ratio_OK=NA, enough_levels_random_effect=NA, rate=NA) + }else{ + res <- tryCatch(glm(expr_lm, data = cutd_ata, family = chose_distrib), error = function(e) { + }) + if (is.element("year", list_f)) { #Model with year as continuous + cutd_ata$year <- as.numeric(cutd_ata$year) + resy <- tryCatch(glm(expr_lm, family = chose_distrib, data = cutd_ata), error = function(e) { + }) + cutd_ata$year <- as.factor(cutd_ata$year) + }else{ + resy <- "" + } - for (cut in Anacut) - { - cutData <- tmpData[grep(cut,tmpData[,FactAna]),] - cutData <- dropLevels.f(cutData) - - res <-"" - - if (listRand[1] != "None") - { - res <- tryCatch(glmmTMB(exprML,family=loiChoisie, data=cutData), error=function(e){}) - }else{ - res <- tryCatch(glm(exprML,data=cutData,family=loiChoisie), error=function(e){}) } ## Write results : - if (! is.null(res)) - { - TabSum <- sortiesLM.f(objLM=res, TabSum=TabSum, metrique=metrique, - factAna=factAna, cut=cut, colAna="analysis", lev=lev, #modSel=iFactGraphSel, listFactSel=listFactSel, - listFact=listFact, - Data=cutData, #Log=Log, - type=ifelse(tableMetrique == "unitSpSz" && factAna != "size.class", - "CL_unitobs", - "unitobs")) + if (! is.null(res)) { + file_save_glm_cut <- paste("GLM_", cut, ".Rdata", sep = "") + save(res, file = file_save_glm_cut) - TabRate[TabRate[,"analysis"]==cut,c(2:11)] <- noteGLM.f(data=cutData, objLM=res, metric=metrique, listFact=listFact, details=TRUE) + tab_sum <- sorties_lm_f(obj_lm = res, obj_lmy = resy, tab_sum = tab_sum, metrique = metrique, + fact_ana = fact_ana, cut = cut, col_ana = "analysis", lev = lev, #modSel = iFactGraphSel, list_fact_sel = list_fact_sel, + list_fact = list_fact, list_rand = list_rand, + d_ata = cutd_ata) + + tab_rate[tab_rate[, "analysis"] == cut, c(2:11)] <- .GlobalEnv$note_glm_f(data = cutd_ata, obj_lm = res, metric = metrique, list_fact = list_fact, details = TRUE) }else{ - cat("\nCannot compute GLM for level",cut,"Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n") + cat("\nCannot compute GLM for level", cut, "Check if one or more factor(s) have only one level, or try with another distribution for the model in advanced settings \n\n") } } - ## Global analysis : + ## Global analysis : + + res_g <- "" + res_gy <- "" - if (listRand[1] != "None") - { - resG <- glmmTMB(exprML,family=loiChoisie, data=tmpData) + if (list_rand[1] != "None") { + res_g <- glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = tmpd_ata) + if (is.element("year", list_fact) && ! is.element("year", list_rand)) { #Model with year as continuous + yr <- tmpd_ata$year + tmpd_ata$year <- as.numeric(tmpd_ata$year) + res_gy <- glmmTMB::glmmTMB(expr_lm, family = chose_distrib, data = tmpd_ata) + tmpd_ata$year <- as.factor(tmpd_ata$year) + tmpd_ata$year <- yr + }else{ + res_gy <- "" + } + }else{ - resG <- glm(exprML,data=tmpData,family=loiChoisie) + res_g <- glm(expr_lm, data = tmpd_ata, family = chose_distrib) + if (is.element("year", list_fact)) { #Model with year as continuous + yr <- tmpd_ata$year + tmpd_ata$year <- as.numeric(tmpd_ata$year) + res_gy <- glm(expr_lm, family = chose_distrib, data = tmpd_ata) + tmpd_ata$year <- yr + }else{ + res_gy <- "" + } } ## write results : - TabSum <- sortiesLM.f(objLM=resG, TabSum=TabSum, metrique=metrique, - factAna=factAna, cut="global", colAna="analysis", lev=lev, #modSel=iFactGraphSel, listFactSel=listFactSel, - listFact=listFact, - Data=tmpData, #Log=Log, - type=ifelse(tableMetrique == "unitSpSz" && factAna != "size.class", - "CL_unitobs", - "unitobs")) + + save(res_g, file = "global_GLM.Rdata") - TabRate[TabRate[,"analysis"]=="global",c(2:11)] <- noteGLM.f(data=tmpData, objLM=resG, metric=metrique, listFact=listFact, details=TRUE) - noteGLMs.f(tabRate=TabRate,exprML=exprML,objLM=resG, file_out=TRUE) + tab_sum <- sorties_lm_f(obj_lm = res_g, obj_lmy = res_gy, tab_sum = tab_sum, metrique = metrique, + fact_ana = fact_ana, cut = "global", col_ana = "analysis", lev = lev, #modSel = iFactGraphSel, list_fact_sel = list_fact_sel, + list_fact = list_fact, list_rand = list_rand, + d_ata = tmpd_ata) + + tab_rate[tab_rate[, "analysis"] == "global", c(2:11)] <- .GlobalEnv$note_glm_f(data = tmpd_ata, obj_lm = res_g, metric = metrique, list_fact = list_fact, details = TRUE) + .GlobalEnv$note_glms_f(tab_rate = tab_rate, expr_lm = expr_lm, obj_lm = res_g, file_out = TRUE) + ## simple statistics and infos : filename <- "GLMSummaryFull.txt" + info_stats_f(filename = filename, d_ata = tmpd_ata, agreg_level = aggreg, type = "stat", + metrique = metrique, fact_graph = fact_ana, #fact_graph_sel = modSel, + list_fact = list_fact)#, list_fact_sel = list_fact_sel) - ## Save data on model : - - infoStats.f(filename=filename, Data=tmpData, agregLevel=aggreg, type="stat", - metrique=metrique, factGraph=factAna, #factGraphSel=modSel, - listFact=listFact)#, listFactSel=listFactSel) + tab_sum$separation <- fact_ana - return(TabSum) + return(tab_sum) } ################# Analysis -Tab <- modeleLineaireWP2.unitobs.f(metrique=metric, listFact=listFact, listRand=listRand, FactAna=FactAna, Distrib=Distrib, log=log, tabMetrics=obs, tableMetrique=aggreg, tabUnitobs=tabUnitobs, nbName="number") +tab <- glm_community(metrique = metric, list_fact = list_fact, list_rand = list_rand, fact_ana = fact_ana, distrib = distrib, tab_metrics = obs, tab_metrique = aggreg, tab_unitobs = tab_unitobs, nb_name = "number") -write.table(Tab,"GLMSummary.tabular", row.names=FALSE, sep="\t", dec=".",fileEncoding="UTF-8") +write.table(tab, "GLMSummary.tabular", row.names = FALSE, sep = "\t", dec = ".", fileEncoding = "UTF-8")