view FunctExePlotGLMGalaxy.r @ 0:3ab852a7ff53 draft

"planemo upload for repository https://github.com/ColineRoyaux/PAMPA-Galaxy commit 04381ca7162ec3ec68419e308194b91d11cacb04"
author ecology
date Mon, 16 Nov 2020 11:02:43 +0000
parents
children
line wrap: on
line source

#Rscript

#####################################################################################################################
#####################################################################################################################
###################################### Create a plot from your community data #######################################
#####################################################################################################################
#####################################################################################################################

###################### Packages
suppressMessages(library(ggplot2))
suppressMessages(library(boot))

###################### Load arguments and declaring variables

args <- commandArgs(trailingOnly = TRUE)


if (length(args) < 2) {
    stop("At least 3 arguments must be supplied input dataset file with GLM results (.tabular)", call. = FALSE) #if no args -> error and exit1

} else {
    import_data <- args[1] ###### file name : glm results table
    data_tab <- args[2] ###### file name : Metrics table
    unitobs_tab <- args[3] ###### file name : Unitobs table
    source(args[4]) ###### Import functions

}

#Import data
glmtable <- read.table(import_data, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
datatable <- read.table(data_tab, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #
unitobs <- read.table(unitobs_tab, sep = "\t", dec = ".", header = TRUE, encoding = "UTF-8") #

#Check files

vars_data1 <- c("analysis", "Interest.var", "distribution")
err_msg_data1 <- "The input GLM results dataset doesn't have the right format. It needs to have at least the following 3 fields :\n- analysis\n- Interest.var\n- distribution\n"
check_file(glmtable, err_msg_data1, vars_data1, 4)

if (length(grep("[0-2][0|9][0-9][0-9].[Estimate|Pvalue]", colnames(glmtable))) == 0) {
    stop("The input GLM results dataset doesn't have the right format or informations. This tool is to represent temporal trends, if your GLM doesn't take the year variable as a fixed effect this tool is not proper to make any representation of it. It needs to have at least estimates and p-value for every year from your time series GLM as columns with name formated as : yyyy Estimate (example : 2020 Estimate) and  yyyy Pvalue (example : 2020 Pvalue).")
}

if (length(grep("[0-2][0|9][0-9][0-9].IC_[up|inf]", colnames(glmtable))) == 0) {
    assess_ic <- FALSE
}else{
    assess_ic <- TRUE
}

metric <- as.character(glmtable[1, "Interest.var"])

vars_data2 <- c("observation.unit", "location", metric)
err_msg_data2 <- "The input metrics dataset doesn't have the right format. It needs to have at least the following 3 fields :\n- observation.unit\n- location\n- the name of the interest metric\n"
check_file(datatable, err_msg_data2, vars_data2, 4)

vars_data3 <- c("observation.unit", "year")
err_msg_data3 <- "The input unitobs dataset doesn't have the right format. It needs to have at least the following 2 fields :\n- observation.unit\n- year\n"
check_file(unitobs, err_msg_data3, vars_data3, 2)
if (length(grep("[0-2][0|9][0-9][0-9]", unitobs$year)) == 0) {
    stop("The year column in the input unitobs dataset doesn't have the right format. Years must be fully written as : yyyy (example : 2020).")
}

if (all(is.na(match(datatable[, "observation.unit"], unitobs[, "observation.unit"])))) {
    stop("Observation units doesn't match in the inputs metrics dataset and unitobs dataset")
}

####################################################################################################################
######################### Creating plot from time series GLM data ## Function : ggplot_glm #########################
####################################################################################################################
ggplot_glm <- function(glmtable, datatable, unitobs, metric = metric, sp, description = TRUE,
                       trend_on_graph = TRUE, assess_ic = TRUE) {
    ## Purpose: Creating plot from time series GLM data
    ## ----------------------------------------------------------------------
    ## Arguments: glmtable : GLM(s) results table
    ##            datatable : Metrics table
    ##            unitobs : Unitobs table
    ##            metric : Interest variable in GLM(s)
    ##            sp : name of processed GLM
    ##            description : Two graphs ?
    ##            trend_on_graph : Write global trend of the time series on graph ?
    ##            assess_ic : Assess confidence intervals ?
    ## ----------------------------------------------------------------------
    ## Author: Coline ROYAUX 13 october 2020

    s_signif <- 0.05 ## threshold when pvalue is considered significant
    distrib <- as.character(glmtable[1, "distribution"]) ## extract GLM distribution

    col <- c("observation.unit", "location", metric) ## names of needed columns in metrics table to construct the 2nd panel of the graph

    if (colnames(glmtable)[length(glmtable)] == "separation") { ## if GLM is a community analysis
        cut <- as.character(glmtable[1, "separation"])
        if (cut != "None") { ## if there is plural GLM analysis performed
            if (! is.element(as.character(cut), colnames(unitobs))) {
                stop("The input unitobs dataset doesn't have the right format. If plural GLM analysis have been performed it needs to have the field used as separation factor.")
            }
            datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), c("year", cut)]) ## extracting 'year' and analysis separation factor columns from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
            colnames(datatable) <- c(col, "year", cut)
        }else{
            datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), "year"]) ## extracting 'year' column from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
            colnames(datatable) <- c(col, "year")
        }

    }else{ ## GLM is a population analysis
        cut <- "species.code"
        if (! is.element("species.code", colnames(datatable))) {
            stop("The input metrics dataset or the GLM results dataset doesn't have the right format. If the GLM is a population analysis, the field species.code needs to be informed in the metrics dataset. If the GLM is a community analysis the field separation (None if only one analysis and name of the separation factor if several analysis) needs to be informed in the last column of the GLM results dataset.")
        }
        col <- c(col, cut)
        datatable <- cbind(datatable[, col], unitobs[match(datatable[, "observation.unit"], unitobs[, "observation.unit"]), "year"]) ## extracting 'year' column from unitobs table to merge with metrics table /// Matching lines with 'observation.unit' column
        colnames(datatable) <- c(col, "year")
    }

    ##vpan vector of names of the two panels in the ggplot

    switch(as.character(metric),
           "number" = vpan <- c("Abundance variation", "Raw abundance"),
           "presence_absence" = vpan <- c("Presence-absence variation", "% presence in location"),
           vpan <- c(paste(metric, " variation"), paste("Mean ", metric)))

    ##Cut table for 1 analysis
    glmtab <- glmtable[glmtable[, "analysis"] == sp, ]

    glmtab <- glmtab[, grep("FALSE", is.na(glmtab[1, ]))] ## Supress columns with NA only

    ## specification of temporal variable necessary for the analyses
    an <- as.numeric(unlist(strsplit(gsub("X", "", paste(colnames(glmtab)[grep("[0-2][0|9][0-9][0-9].Estimate", colnames(glmtab))], collapse = " ")), split = ".Estimate"))) ## Extract list of years studied in the GLM

    year <- sort(c(min(an) - 1, an)) ## Add the first level, "reference" in the GLM
    nbans <- length(year)
    pasdetemps <- nbans - 1

    ##### Table 1

    coefan <- unlist(lapply(an, FUN = function(x) {
                                          if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Estimate"), colnames(glmtab))]) > 0) {
                                              glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Estimate"), colnames(glmtab))]
                                          }else{
                                              NA
                                          }
                                                 }))  ## extract estimates for each years to contruct graph 1

    ic_inf <- unlist(lapply(an, FUN = function(x) {
                                          if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_inf"), colnames(glmtab))]) > 0) {
                                              glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_inf"), colnames(glmtab))]
                                          }else{
                                               NA
                                          }
                                                  }))

    ic_up <- unlist(lapply(an, FUN = function(x) {
                                         if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_up"), colnames(glmtab))]) > 0) {
                                             glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".IC_up"), colnames(glmtab))]
                                         }else{
                                             NA
                                         }
                                                }))

    switch(distrib,  ## Applying the reciprocal of the link function to coefficients and confidence intervals depending on distribution law
           "poisson" = {
                            coefyear <- c(1, exp(as.numeric(coefan))) ## link function : log
                            if (assess_ic) {
                                ic_inf_sim <- c(1, exp(as.numeric(ic_inf)))
                                ic_sup_sim <- c(1, exp(as.numeric(ic_up)))
                            } else {
                                ic_inf_sim <- NA
                                ic_sup_sim <- NA
                      }},
           "quasipoisson" = {
                                 coefyear <- c(1, exp(as.numeric(coefan))) ## link function : log
                                 if (assess_ic) {
                                     ic_inf_sim <- c(1, exp(as.numeric(ic_inf)))
                                     ic_sup_sim <- c(1, exp(as.numeric(ic_up)))
                                 } else {
                                     ic_inf_sim <- NA
                                     ic_sup_sim <- NA
                           }},
           "inverse.gaussian" = {
                                     coefyear <- c(0, as.numeric(coefan) ^ (- 1 / 2)) ## link function : x^ - 2
                                     if (assess_ic) {
                                         ic_inf_sim <- c(0, as.numeric(ic_inf) ^ (- 1 / 2))
                                         ic_sup_sim <- c(0, as.numeric(ic_up) ^ (- 1 / 2))
                                     } else {
                                         ic_inf_sim <- NA
                                         ic_sup_sim <- NA
                               }},
           "binomial" = {
                             coefyear <- c(1, boot::inv.logit(as.numeric(coefan))) ## link function : logit
                             if (assess_ic) {
                                 ic_inf_sim <- c(1, boot::inv.logit(as.numeric(ic_inf)))
                                 ic_sup_sim <- c(1, boot::inv.logit(as.numeric(ic_up)))
                             } else {
                                 ic_inf_sim <- NA
                                 ic_sup_sim <- NA
                       }},
           "quasibinomial" = {
                                  coefyear <- c(1, boot::inv.logit(as.numeric(coefan))) ## link function : logit
                                  if (assess_ic) {
                                      ic_inf_sim <- c(1, boot::inv.logit(as.numeric(ic_inf)))
                                      ic_sup_sim <- c(1, boot::inv.logit(as.numeric(ic_up)))
                                  } else {
                                      ic_inf_sim <- NA
                                      ic_sup_sim <- NA
                            }},
           "Gamma" = {
                          coefyear <- c(1, as.numeric(coefan) ^ (- 1)) ## link function : -x^ - 1
                          if (assess_ic) {
                              ic_inf_sim <- c(1, as.numeric(ic_inf) ^ (- 1))
                              ic_sup_sim <- c(1, as.numeric(ic_up) ^ (- 1))
                          } else {
                              ic_inf_sim <- NA
                              ic_sup_sim <- NA
                    }}
         , {
                coefyear <- c(0, as.numeric(coefan))
                if (assess_ic) {
                    ic_inf_sim <- c(0, as.numeric(ic_inf))
                    ic_sup_sim <- c(0, as.numeric(ic_up))
                } else {
                    ic_inf_sim <- NA
                    ic_sup_sim <- NA
            }})

    pval <- c(1, unlist(lapply(an, FUN = function(x) {
                                             if (length(glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Pvalue"), colnames(glmtab))]) > 0) {
                                                 glmtab[glmtab[, 1] == sp, grep(paste0("X", x, ".Pvalue"), colnames(glmtab))]
                                              }else{
                                                  NA
                                              }
                                                    }))) ## extract p value for each year


    tab1 <- data.frame(year, val = coefyear,  ## table for the graphical output 1
                       ll = unlist(ic_inf_sim), ul = unlist(ic_sup_sim),
                       catPoint = ifelse(pval < s_signif, "significatif", NA), pval,
                       courbe = vpan[1],
                       panel = vpan[1])
    ## cleaning of wrong or biaised measures of the confidence interval
    if (assess_ic) {
        tab1$ul <-  ifelse(tab1$ul == Inf, NA, tab1$ul)
        tab1$ul <-  ifelse(tab1$ul > 1.000000e+20, NA, tab1$ul)
        tab1$val <-  ifelse(tab1$val > 1.000000e+20, 1.000000e+20, tab1$val)
    }

    coefancontinu <- as.numeric(as.character(glmtab[glmtab[, 1] == sp, grep("year.Estimate", colnames(glmtab))])) ##  tendency of population evolution on the studied period = regression coefficient of the variable year as a continuous variable in the GLM
    switch(distrib, ## Applying the reciprocal of the link function to coefficients and confidence intervals depending on distribution law
           "poisson" = {
                           trend <- round(exp(as.numeric(coefancontinu)), 3) ## link function : log
                            pourcentage <- round((exp(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
                       },
           "quasipoisson" = {
                                 trend <- round(exp(as.numeric(coefancontinu)), 3) ## link function : log
                                 pourcentage <- round((exp(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
                            },
           "inverse.gaussian" = {
                                     trend <- round(as.numeric(coefancontinu) ^ (- 1 / 2), 3) ## link function : x^ - 2
                                     pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps)) ^ (- 1 / 2)) - 1) * 100, 2)
                                },
           "binomial" = {
                             trend <- round(boot::inv.logit(as.numeric(coefancontinu)), 3) ## link function : logit
                             pourcentage <- round((boot::inv.logit(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
                        },
           "quasibinomial" = {
                                  trend <- round(boot::inv.logit(as.numeric(coefancontinu)), 3) ## link function : logit
                                  pourcentage <- round((boot::inv.logit(as.numeric(coefancontinu) * as.numeric(pasdetemps)) - 1) * 100, 2)
                             },
           "Gamma" = {
                          trend <- round(as.numeric(coefancontinu) ^ (- 1), 3) ## link function : -x^ - 1
                          pourcentage <- round((((as.numeric(coefancontinu) * as.numeric(pasdetemps)) ^ (- 1)) - 1) * 100, 2)
                     }
         , {
                trend <- round(as.numeric(coefancontinu), 3)
                pourcentage <-  round((((as.numeric(coefancontinu) * as.numeric(pasdetemps))) - 1) * 100, 2)
           })

    pval <- as.numeric(as.character(glmtab[glmtab[, 1] == sp, grep("year.Pvalue", colnames(glmtab))])) ## Extract p value

    ## table for global trend on the whole time series
    tab1t <- NULL
    if (length(pval) > 0) {
        tab1t <- data.frame(Est = trend, pourcent = pourcentage, signif = pval < s_signif, pval)
    }

    ##### Table 2

    if (sp == "global") { ## prepare metrics table to extract variables used in graph 2
        datatablecut <- datatable[grep("FALSE", is.na(datatable[, as.character(metric)])), ]

    }else{

        datatablecut <- datatable[datatable[, as.character(cut)] == sp, ]
        datatablecut <- datatablecut[grep("FALSE", is.na(datatablecut[, as.character(metric)])), ]
    }

    switch(as.character(metric), ## Different value represented graph 2 depending on the nature of the metric
           "number" = {
                           valplot <- lapply(sort(year), FUN = function(x) {
                                                                   sum(na.omit(as.numeric(subset(datatablecut, year == x)[, as.character(metric)])))
                                                                           })
                      }, ## sum if abundance
           "presence_absence" = {
                                     nb_loc <- lapply(sort(year), FUN = function(x) {
                                                                            length(unique(subset(datatablecut, year == x)[, "location"]))
                                                                                    }) ## number of plots per year
                                     nb_loc_presence <- lapply(sort(year), FUN = function(x) {
                                                                                     length(unique(subset(datatablecut[datatablecut[, metric] > 0, ], year == x)[, "location"]))
                                                                                             }) ##  number of plots where the species were observed
                                     valplot <- (na.omit(as.numeric(nb_loc_presence)) / na.omit(as.numeric(nb_loc))) * 100
                                } ## % of presence in observered plots if presence / absence
         , {
                valplot <- lapply(sort(year), FUN = function(x) {
                                                        mean(na.omit(as.numeric(subset(datatablecut, year == x)[, as.character(metric)])))
                                                                })
           } ## mean if any other metric
          )

    tab2 <- data.frame(year, val = round(as.numeric(valplot), 2), ll = NA, ul = NA, catPoint = NA, pval = NA,
                       courbe = vpan[2], panel = vpan[2])

    ## Creating ggplots

    dgg <- tab1

    figname <- paste(sp, ".png", sep = "")

    ## coord for horizontal lines in graphs
    hline_data1 <- data.frame(z = tab1$val[1], panel = c(vpan[1]), couleur = "var estimates", type = "var estimates")
    hline_data3 <- data.frame(z = 0, panel = vpan[2], couleur = "seuil", type = "seuil")
    hline_data <- rbind(hline_data1, hline_data3)
    titre <- paste(sp)

    ## text for the population evolution trend

    pasdetemps <- max(dgg$year) - min(dgg$year) + 1
    if (! is.null(tab1t)) {
        if (assess_ic) {
            txt_pente1 <- paste("Global trend : ", tab1t$Est,
                               ifelse(tab1t$signif, " *", ""),
                               ifelse(tab1t$signif, paste("\n", ifelse(tab1t$pourcent > 0, "+ ", "- "),
                                                         abs(tab1t$pourcent), " % in ", pasdetemps, " years", sep = ""), ""), sep = "")
        }else{
            txt_pente1 <- ifelse(tab1t$signif, paste("\n", ifelse(tab1t$pourcent > 0, "+ ", "- "),
                                                   abs(tab1t$pourcent), " % in ", pasdetemps, " years", sep = ""), "")
        }
    }else{
        trend_on_graph <- FALSE
    }

    ## table of the text for the population evolution trend
    tab_text_pent <- data.frame(y = c(max(c(dgg$val, dgg$ul), na.rm = TRUE) * .9),
                              x = median(dgg$year),
                              txt = ifelse(trend_on_graph, c(txt_pente1), ""),
                              courbe = c(vpan[1]), panel = c(vpan[1]))

    dgg <- rbind(tab1, tab2)

    ## colors for plots
    vec_col_point <- c("#ffffff", "#eeb40f", "#ee0f59")
    names(vec_col_point) <- c("significatif", "infSeuil", "0")
    vec_col_courbe <- c("#3c47e0", "#5b754d", "#55bb1d", "#973ce0")
    names(vec_col_courbe) <- c(vpan[1], "loc", "presence", vpan[2])
    vec_col_hline <- c("#ffffff", "#e76060")
    names(vec_col_hline) <- c("var estimates", "seuil")

    col <- c(vec_col_point, vec_col_courbe, vec_col_hline)
    names(col) <- c(names(vec_col_point), names(vec_col_courbe), names(vec_col_hline))

    p <- ggplot2::ggplot(data = dgg, mapping = ggplot2::aes_string(x = "year", y = "val"))
    ## Titles and scales
    p <- p + facet_grid(panel ~ ., scale = "free") +
    theme(legend.position = "none",
          panel.grid.minor = element_blank(),
          panel.grid.major.y = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))  +
    ylab("") + xlab("year") + ggtitle(titre) +
    scale_colour_manual(values = col, name = "",
                             breaks = names(col)) +
    scale_x_continuous(breaks = min(dgg$year):max(dgg$year))
    p <- p + ggplot2::geom_hline(data = hline_data, mapping = ggplot2::aes_string(yintercept = "z", colour = "couleur", linetype = "type"),
                    alpha = 1, size = 1.2)
    if (assess_ic) { ############# ONLY FOR THE CONFIDENCE INTERVAL
        p <- p + ggplot2::geom_ribbon(mapping = ggplot2::aes_string(ymin = "ll", ymax = "ul"), fill = col[vpan[1]], alpha = .2)
        p <- p + ggplot2::geom_pointrange(mapping = ggplot2::aes_string(y = "val", ymin = "ll", ymax = "ul"), fill = col[vpan[1]], alpha = .2)
    }

    p <- p + ggplot2::geom_line(mapping = ggplot2::aes_string(colour = "courbe"), size = 1.5)
    p <- p + ggplot2::geom_point(mapping = ggplot2::aes_string(colour = "courbe"), size = 3)
    alph <- ifelse(!is.na(dgg$catPoint), 1, 0)
    p <- p + ggplot2::geom_point(mapping = ggplot2::aes_string(colour = "catPoint", alpha = alph), size = 2)
    p <- p + ggplot2::geom_text(data = tab_text_pent, mapping = ggplot2::aes_string("x", "y", label = "txt"), parse = FALSE, color = col[vpan[1]], fontface = 2, size = 4)
    ggplot2::ggsave(figname, p, width = 16, height = 15, units = "cm")
}
############################################################################################################ fin fonction graphique / end of function for graphical output

################# Analysis

for (sp in glmtable[, 1]) {

    if (!all(is.na(glmtable[glmtable[, 1] == sp, 4:(length(glmtable) - 1)]))) { ##ignore lines with only NA
        ggplot_glm(glmtable = glmtable, datatable = datatable, unitobs = unitobs, metric = metric, sp = sp, description = TRUE, trend_on_graph = TRUE, assess_ic = assess_ic)
    }
}

sink("stdout.txt", split = TRUE)