Mercurial > repos > ecology > regionalgam_gls_adjusted
view dennis-gam-initial-functions.R @ 0:e2bff90af491 draft default tip
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/regionalgam commit ffe42225fff8992501b743ebe2c78e50fddc4a4e
author | ecology |
---|---|
date | Thu, 20 Jun 2019 04:04:19 -0400 |
parents | |
children |
line wrap: on
line source
### R-Script Adapted from script provided by the CEH, UK BY: Reto Schmucki [ reto.schmucki@mail.mcgill.ca] ### DATE: 14 July 2014 function to run two stage model in DENNIS et al. 2013 .onAttach <- function(libname, pkgname) { packageStartupMessage(" The regionalGAM package that is no longer maintained, \n use the new rbms package instead. \n devtools::install_github(\"RetoSchmucki/rbms\", force=TRUE)") } #' year_day_func Function #' This function generate the full sequence of days, months and include the observation to that file. #' @param sp_data A data.frame with your observation. #' @keywords year days #' @export #' @author Reto Schmucki #' @examples #' year_day_func() # FUNCTIONS year_day_func = function(sp_data) { year <- unique(sp_data$YEAR) origin.d <- paste(year, "01-01", sep = "-") if ((year%%4 == 0) & ((year%%100 != 0) | (year%%400 == 0))) { nday <- 366 } else { nday <- 365 } date.serie <- as.POSIXlt(seq(as.Date(origin.d), length = nday, by = "day"), format = "%Y-%m-%d") dayno <- as.numeric(julian(date.serie, origin = as.Date(origin.d)) + 1) month <- as.numeric(strftime(date.serie, format = "%m")) week <- as.numeric(strftime(date.serie, format = "%W")) week_day <- as.numeric(strftime(date.serie, format = "%u")) day <- as.numeric(strftime(date.serie, format = "%d")) site_list <- sp_data[!duplicated(sp_data$SITE), c("SITE")] all_day_site <- data.frame(SPECIES = sp_data$SPECIES[1], SITE = rep(site_list, rep(nday, length(site_list))), YEAR = sp_data$YEAR[1], MONTH = month, WEEK = week, DAY = day, DAY_WEEK = week_day, DAYNO = dayno, COUNT = NA) count_index <- match(paste(sp_data$SITE, sp_data$DAYNO, sep = "_"), paste(all_day_site$SITE, all_day_site$DAYNO, sep = "_")) all_day_site$COUNT[count_index] <- sp_data$COUNT site_count_length <- aggregate(sp_data$COUNT, by = list(sp_data$SITE), function(x) list(1:length(x))) names(site_count_length$x) <- as.character(site_count_length$Group.1) site_countno <- utils::stack(site_count_length$x) all_day_site$COUNTNO <- NA all_day_site$COUNTNO[count_index] <- site_countno$values # add count number to ease extraction of single count # Add zero to close observation season two weeks before and after the first and last first_obs <- min(all_day_site$DAYNO[!is.na(all_day_site$COUNT)]) last_obs <- max(all_day_site$DAYNO[!is.na(all_day_site$COUNT)]) closing_season <- c((first_obs - 11):(first_obs - 7), (last_obs + 7):(last_obs + 11)) # If closing season is before day 1 or day 365, simply set the first and last 5 days to 0 if (min(closing_season) < 1) closing_season[1:5] <- c(1:5) if (max(closing_season) > nday) closing_season[6:10] <- c((nday - 4):nday) all_day_site$COUNT[all_day_site$DAYNO %in% closing_season] <- 0 all_day_site$ANCHOR <- 0 all_day_site$ANCHOR[all_day_site$DAYNO %in% closing_season] <- 1 all_day_site <- all_day_site[order(all_day_site$SITE, all_day_site$DAYNO), ] return(all_day_site) } #' trap_area Function #' #' This function compute the area under the curve using the trapezoid method. #' @param x A vector or a two columns matrix. #' @param y A vector, Default is NULL #' @keywords trapezoid #' @export #' @examples #' trap_area() trap_area = function(x, y = NULL) { # If y is null and x has multiple columns then set y to x[,2] and x to x[,1] if (is.null(y)) { if (length(dim(x)) == 2) { y = x[, 2] x = x[, 1] } else { stop("ERROR: need to either specifiy both x and y or supply a two column data.frame/matrix to x") } } # Check x and y are same length if (length(x) != length(y)) { stop("ERROR: x and y need to be the same length") } # Need to exclude any pairs that are NA for either x or y rm_inds = which(is.na(x) | is.na(y)) if (length(rm_inds) > 0) { x = x[-rm_inds] y = y[-rm_inds] } # Determine values of trapezoids under curve Get inds inds = 1:(length(x) - 1) # Determine area using trapezoidal rule Area = ( (b1 + b2)/2 ) * h where b1 and b2 are lengths of bases # (the parallel sides) and h is the height (the perpendicular distance between two bases) areas = ((y[inds] + y[inds + 1])/2) * diff(x) # total area is sum of all trapezoid areas tot_area = sum(areas) # Return total area return(tot_area) } #' trap_index Function #' #' This function compute the area under the curve (Abundance Index) across species, sites and years #' @param sp_data A data.frame containing species count data generated from the year_day_func() #' @param y A vector, Default is NULL #' @keywords Abundance index #' @export #' @examples #' trap_index() trap_index = function(sp_data, data_col = "IMP", time_col = "DAYNO", by_col = c("SPECIES", "SITE", "YEAR")) { # Build output data.frame out_obj = unique(sp_data[, by_col]) # Set row.names to be equal to collapsing of output rows (will be unique, you need them to make uploading # values back to data.frame will be easier) row.names(out_obj) = apply(out_obj, 1, paste, collapse = "_") # Using this row.names from out_obj above as index in by function to loop through values all unique combs # of by_cols and fit trap_area to data ind_dat = by(sp_data[, c(time_col, data_col)], apply(sp_data[, by_col], 1, paste, collapse = "_"), trap_area) # Add this data to output object out_obj[names(ind_dat), "SINDEX"] = round(ind_dat/7, 1) # Set row.names to defaults row.names(out_obj) = NULL # Return output object return(out_obj) } #' flight_curve Function #' This function compute the flight curve across and years #' @param your_dataset A data.frame containing original species count data. The data format is a csv (comma "," separated) with 6 columns with headers, namely "species","transect_id","visit_year","visit_month","visit_day","sp_count" #' @keywords standardize flight curve #' @export #' @examples #' flight_curve() flight_curve <- function(your_dataset, GamFamily = 'nb', MinVisit = 2, MinOccur = 1) { if("mgcv" %in% installed.packages() == "FALSE") { print("mgcv package is not installed.") x <- readline("Do you want to install it? Y/N") if (x == 'Y') { install.packages("mgcv") } if (x == 'N') { stop("flight curve can not be computed without the mgcv package, sorry") } } flight_pheno <- data.frame() your_dataset$DAYNO <- strptime(paste(your_dataset$DAY, your_dataset$MONTH, your_dataset$YEAR, sep = "/"), "%d/%m/%Y")$yday + 1 dataset <- your_dataset[, c("SPECIES", "SITE", "YEAR", "MONTH", "DAY", "DAYNO", "COUNT")] sample_year <- unique(dataset$YEAR) sample_year <- sample_year[order(sample_year)] if (length(sample_year) >1 ) { for (y in sample_year) { dataset_y <- dataset[dataset$YEAR == y, ] # subset sites with enough visit and occurence occ <- aggregate(dataset_y$COUNT, by = list(SITE = dataset_y$SITE), function(x) sum(x > 0)) vis <- aggregate(dataset_y$COUNT, by = list(SITE = dataset_y$SITE), function(x) length(x)) dataset_y <- dataset_y[dataset_y$SITE %in% occ$SITE[occ$x >= MinOccur], ] dataset_y <- dataset_y[dataset_y$SITE %in% vis$SITE[vis$x >= MinVisit], ] nsite <- length(unique(dataset_y$SITE)) if (nsite < 1) { print(paste("No sites with sufficient visits and occurence, MinOccur:", MinOccur, " MinVisit: ", MinVisit, " for " , dataset$SPECIES[1],"at year", y)) next } # Determine missing days and add to dataset sp_data_all <- year_day_func(dataset_y) if (nsite > 200) { sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, 200, replace = F)]), ] sp_data_all <- sp_data_all } sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 print(paste("Fitting the GAM for",as.character(sp_data_all$SPECIES[1]),"and year",y,"with",length(unique(sp_data_all$SITE)),"sites :",Sys.time())) if(length(unique(sp_data_all$SITE))>1){ gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) -1, data = sp_data_all, family = GamFamily), silent = TRUE) } else { gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, data = sp_data_all, family = GamFamily), silent = TRUE) } # Give a second try if the GAM does not converge the first time if (class(gam_obj_site)[1] == "try-error") { # Determine missing days and add to dataset sp_data_all <- year_day_func(dataset_y) if (nsite > 200) { sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, 200, replace = F)]), ] } else { sp_data_all <- sp_data_all } sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 print(paste("Fitting the GAM for",sp_data_all$SPECIES[1],"at year", y,"with",length(unique(sp_data_all$SITE)),"sites :",Sys.time(),"second try")) if(length(unique(sp_data_all$SITE))>1){ gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) -1, data = sp_data_all, family = GamFamily), silent = TRUE) } else { gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, data = sp_data_all, family = GamFamily), silent = TRUE) } if (class(gam_obj_site)[1] == "try-error") { print(paste("Error in fitting the flight period for",sp_data_all$SPECIES[1],"at year", y,"no convergence after two trial")) sp_data_all[, "FITTED"] <- NA sp_data_all[, "COUNT_IMPUTED"] <- NA sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA sp_data_all[, "NM"] <- NA } else { # Generate a list of values for all days from the additive model and use # these value to fill the missing observations sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, c("trimDAYNO", "SITE")], type = "response") # force zeros at the beginning end end of the year sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 # if infinite number in predict replace with NA. if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) sp_data_all[, "FITTED"] <- NA sp_data_all[, "COUNT_IMPUTED"] <- NA sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA sp_data_all[, "NM"] <- NA } else { sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] # Define the flight curve from the fitted values and append them over # years (this is one flight curve per year for all site) site_sums <- aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), FUN = sum) # Rename sum column names(site_sums)[names(site_sums) == "x"] <- "SITE_YR_FSUM" # Add data to sp_data data.frame (ensure merge does not sort the data!) sp_data_all = merge(sp_data_all, site_sums, by <- c("SITE"), all = TRUE, sort = FALSE) # Calculate normalized values sp_data_all[, "NM"] <- sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM } } } else { # Generate a list of values for all days from the additive model and use # these value to fill the missing observations sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, c("trimDAYNO", "SITE")], type = "response") # force zeros at the beginning end end of the year sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 # if infinite number in predict replace with NA. if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) sp_data_all[, "FITTED"] <- NA sp_data_all[, "COUNT_IMPUTED"] <- NA sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA sp_data_all[, "NM"] <- NA } else { sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] # Define the flight curve from the fitted values and append them over # years (this is one flight curve per year for all site) site_sums = aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), FUN = sum) # Rename sum column names(site_sums)[names(site_sums) == "x"] = "SITE_YR_FSUM" # Add data to sp_data data.frame (ensure merge does not sort the data!) sp_data_all = merge(sp_data_all, site_sums, by = c("SITE"), all = TRUE, sort = FALSE) # Calculate normalized values sp_data_all[, "NM"] = sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM } } sp_data_filled <- sp_data_all flight_curve <- data.frame(species = sp_data_filled$SPECIES, year = sp_data_filled$YEAR, week = sp_data_filled$WEEK, DAYNO = sp_data_filled$DAYNO, DAYNO_adj = sp_data_filled$trimDAYNO, nm = sp_data_filled$NM)[!duplicated(paste(sp_data_filled$YEAR, sp_data_filled$DAYNO, sep = "_")), ] flight_curve <- flight_curve[order(flight_curve$DAYNO), ] # bind if exist else create if (is.na(flight_curve$nm[1])) next() flight_pheno <- rbind(flight_pheno, flight_curve) } # end of year loop } else { y <- unique(dataset$YEAR) dataset_y <- dataset[dataset$YEAR == y, ] # subset sites with enough visit and occurence occ <- aggregate(dataset_y$COUNT, by = list(SITE = dataset_y$SITE), function(x) sum(x > 0)) vis <- aggregate(dataset_y$COUNT, by = list(SITE = dataset_y$SITE), function(x) length(x)) dataset_y <- dataset_y[dataset_y$SITE %in% occ$SITE[occ$x >= MinOccur], ] dataset_y <- dataset_y[dataset_y$SITE %in% vis$SITE[vis$x >= MinVisit], ] nsite <- length(unique(dataset_y$SITE)) if (nsite < 1) { stop(paste("No sites with sufficient visits and occurence, MinOccur:", MinOccur, " MinVisit: ", MinVisit, " for " ,dataset$SPECIES[1],"at year", y)) } # Determine missing days and add to dataset sp_data_all <- year_day_func(dataset_y) if (nsite > 200) { sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, 200, replace = F)]), ] } else { sp_data_all <- sp_data_all } sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 print(paste("Fitting the GAM for",sp_data_all$SPECIES[1],"at year", y,":",Sys.time())) if(length(unique(sp_data_all$SITE))>1){ gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) -1, data = sp_data_all, family = GamFamily), silent = TRUE) } else { gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, data = sp_data_all, family = GamFamily), silent = TRUE) } # Give a second try if the GAM does not converge the first time if (class(gam_obj_site)[1] == "try-error") { # Determine missing days and add to dataset sp_data_all <- year_day_func(dataset_y) if (nsite > 200) { sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, 200, replace = F)]), ] } else { sp_data_all <- sp_data_all } sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 print(paste("Fitting the GAM for",sp_data_all$SPECIES[1],"at year", y,"with",length(unique(sp_data_all$SITE)),"sites :",Sys.time(),"second try")) if(length(unique(sp_data_all$SITE))>1){ gam_obj_site <- try(mgcv::bam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) - 1, data = sp_data_all, family = GamFamily), silent = TRUE) } else { gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, data = sp_data_all, family = GamFamily), silent = TRUE) } if (class(gam_obj_site)[1] == "try-error") { print(paste("Error in fitting the flight period for",sp_data_all$SPECIES[1],"at year", y,"no convergence after two trial")) sp_data_all[, "FITTED"] <- NA sp_data_all[, "COUNT_IMPUTED"] <- NA sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA sp_data_all[, "NM"] <- NA } else { # Generate a list of values for all days from the additive model and use # these value to fill the missing observations sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, c("trimDAYNO", "SITE")], type = "response") # force zeros at the beginning end end of the year sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 # if infinite number in predict replace with NA. if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) sp_data_all[, "FITTED"] <- NA sp_data_all[, "COUNT_IMPUTED"] <- NA sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA sp_data_all[, "NM"] <- NA } else { sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] # Define the flight curve from the fitted values and append them over # years (this is one flight curve per year for all site) site_sums <- aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), FUN = sum) # Rename sum column names(site_sums)[names(site_sums) == "x"] <- "SITE_YR_FSUM" # Add data to sp_data data.frame (ensure merge does not sort the data!) sp_data_all = merge(sp_data_all, site_sums, by <- c("SITE"), all = TRUE, sort = FALSE) # Calculate normalized values sp_data_all[, "NM"] <- sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM } } } else { # Generate a list of values for all days from the additive model and use # these value to fill the missing observations sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, c("trimDAYNO", "SITE")], type = "response") # force zeros at the beginning end end of the year sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 # if infinite number in predict replace with NA. if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) sp_data_all[, "FITTED"] <- NA sp_data_all[, "COUNT_IMPUTED"] <- NA sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA sp_data_all[, "NM"] <- NA } else { sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] # Define the flight curve from the fitted values and append them over # years (this is one flight curve per year for all site) site_sums = aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), FUN = sum) # Rename sum column names(site_sums)[names(site_sums) == "x"] = "SITE_YR_FSUM" # Add data to sp_data data.frame (ensure merge does not sort the data!) sp_data_all = merge(sp_data_all, site_sums, by = c("SITE"), all = TRUE, sort = FALSE) # Calculate normalized values sp_data_all[, "NM"] = sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM } } sp_data_filled <- sp_data_all flight_curve <- data.frame(species = sp_data_filled$SPECIES, year = sp_data_filled$YEAR, week = sp_data_filled$WEEK, DAYNO = sp_data_filled$DAYNO, DAYNO_adj = sp_data_filled$trimDAYNO, nm = sp_data_filled$NM)[!duplicated(paste(sp_data_filled$YEAR, sp_data_filled$DAYNO, sep = "_")), ] flight_curve <- flight_curve[order(flight_curve$DAYNO), ] flight_pheno <- rbind(flight_pheno, flight_curve) } return(flight_pheno) } #' abundance_index Function #' #' This function compute the Abundance Index across sites and years from your dataset and the regional flight curve #' @param your_dataset A data.frame containing original species count data. The data format is a csv (comma "," separated) with 6 columns with headers, namely "species","transect_id","visit_year","visit_month","visit_day","sp_count" #' @param flight_pheno A data.frame for the regional flight curve computed with the function flight_curve #' @keywords standardize flight curve #' @export #' @examples #' abundance_index() abundance_index <- function(your_dataset,flight_pheno) { your_dataset$DAYNO <- strptime(paste(your_dataset$DAY, your_dataset$MONTH, your_dataset$YEAR, sep = "/"), "%d/%m/%Y")$yday + 1 dataset <- your_dataset[, c("SPECIES", "SITE", "YEAR", "MONTH", "DAY", "DAYNO", "COUNT")] sample_year <- unique(dataset$YEAR) sample_year <- sample_year[order(sample_year)] if (length(sample_year)>1){ for (y in sample_year) { year_pheno <- flight_pheno[flight_pheno$year == y, ] dataset_y <- dataset[dataset$YEAR == y, ] sp_data_site <- year_day_func(dataset_y) sp_data_site$trimDAYNO <- sp_data_site$DAYNO - min(sp_data_site$DAYNO) + 1 sp_data_site <- merge(sp_data_site, year_pheno[, c("DAYNO", "nm")], by = c("DAYNO"), all.x = TRUE, sort = FALSE) # compute proportion of the flight curve sampled due to missing visits pro_missing_count <- data.frame(SITE = sp_data_site$SITE, WEEK = sp_data_site$WEEK, NM = sp_data_site$nm, COUNT = sp_data_site$COUNT, ANCHOR = sp_data_site$ANCHOR) pro_missing_count$site_week <- paste(pro_missing_count$SITE, pro_missing_count$WEEK, sep = "_") siteweeknocount <- aggregate(pro_missing_count$COUNT, by = list(pro_missing_count$site_week), function(x) sum(!is.na(x)) == 0) pro_missing_count <- pro_missing_count[pro_missing_count$site_week %in% siteweeknocount$Group.1[siteweeknocount$x == TRUE], ] pro_count_agg <- aggregate(pro_missing_count$NM, by = list(pro_missing_count$SITE), function(x) 1 - sum(x, na.rm = T)) names(pro_count_agg) <- c("SITE", "PROP_PHENO_SAMPLED") # remove samples outside the monitoring window sp_data_site$COUNT[sp_data_site$nm==0] <- NA # Compute the regional GAM index if(length(unique(sp_data_site$SITE))>1){ glm_obj_site <- glm(COUNT ~ factor(SITE) + offset(log(nm)) - 1, data = sp_data_site, family = quasipoisson(link = "log"), control = list(maxit = 100)) } else { glm_obj_site <- glm(COUNT ~ offset(log(nm)) - 1, data = sp_data_site, family = quasipoisson(link = "log"), control = list(maxit = 100)) } sp_data_site[, "FITTED"] <- predict.glm(glm_obj_site, newdata = sp_data_site, type = "response") sp_data_site[, "COUNT_IMPUTED"] <- sp_data_site$COUNT sp_data_site[is.na(sp_data_site$COUNT), "COUNT_IMPUTED"] <- sp_data_site$FITTED[is.na(sp_data_site$COUNT)] ## add fitted value for missing mid-week data sp_data_site <- sp_data_site[!paste(sp_data_site$DAY_WEEK, sp_data_site$COUNT) %in% c("1 NA", "2 NA", "3 NA", "5 NA", "6 NA", "7 NA"), ] ## remove all added mid-week values for weeks with real count ## (observation) sp_data_site$site_week <- paste(sp_data_site$SITE, sp_data_site$WEEK, sep = "_") siteweekcount <- aggregate(sp_data_site$COUNT, by = list(sp_data_site$site_week), function(x) sum(!is.na(x)) > 0) sp_data_site <- sp_data_site[!(is.na(sp_data_site$COUNT) & (sp_data_site$site_week %in% siteweekcount$Group.1[siteweekcount$x == TRUE])), names(sp_data_site) != "site_week"] ## Compute the regional GAM index print(paste("Compute index for",sp_data_site$SPECIES[1],"at year", y,"for",length(unique(sp_data_site$SITE)),"sites:",Sys.time())) regional_gam_index <- trap_index(sp_data_site, data_col = "COUNT_IMPUTED", time_col = "DAYNO", by_col = c("SPECIES", "SITE", "YEAR")) cumu_index <- merge(regional_gam_index, pro_count_agg, by = c("SITE"), all.x = TRUE, sort = FALSE) names(cumu_index) <- c("SITE", "SPECIES", "YEAR", "regional_gam", "prop_pheno_sampled") cumu_index <- cumu_index[order(cumu_index$SITE), ] # bind if exist else create if ("cumullated_indices" %in% ls()) { cumullated_indices <- rbind(cumullated_indices, cumu_index) } else { cumullated_indices <- cumu_index } } # end of year loop } else { y <- unique(dataset$YEAR) year_pheno <- flight_pheno[flight_pheno$year == y, ] dataset_y <- dataset[dataset$YEAR == y, ] sp_data_site <- year_day_func(dataset_y) sp_data_site$trimDAYNO <- sp_data_site$DAYNO - min(sp_data_site$DAYNO) + 1 sp_data_site <- merge(sp_data_site, year_pheno[, c("DAYNO", "nm")], by = c("DAYNO"), all.x = TRUE, sort = FALSE) # compute proportion of the flight curve sampled due to missing visits pro_missing_count <- data.frame(SITE = sp_data_site$SITE, WEEK = sp_data_site$WEEK, NM = sp_data_site$nm, COUNT = sp_data_site$COUNT, ANCHOR = sp_data_site$ANCHOR) pro_missing_count$site_week <- paste(pro_missing_count$SITE, pro_missing_count$WEEK, sep = "_") siteweeknocount <- aggregate(pro_missing_count$COUNT, by = list(pro_missing_count$site_week), function(x) sum(!is.na(x)) == 0) pro_missing_count <- pro_missing_count[pro_missing_count$site_week %in% siteweeknocount$Group.1[siteweeknocount$x == TRUE], ] pro_count_agg <- aggregate(pro_missing_count$NM, by = list(pro_missing_count$SITE), function(x) 1 - sum(x, na.rm = T)) names(pro_count_agg) <- c("SITE", "PROP_PHENO_SAMPLED") # remove samples outside the monitoring window sp_data_site$COUNT[sp_data_site$nm==0] <- NA # Compute the regional GAM index if(length(unique(sp_data_site$SITE))>1){ glm_obj_site <- glm(COUNT ~ factor(SITE) + offset(log(nm)) - 1, data = sp_data_site, family = quasipoisson(link = "log"), control = list(maxit = 100)) } else { glm_obj_site <- glm(COUNT ~ offset(log(nm)) - 1, data = sp_data_site, family = quasipoisson(link = "log"), control = list(maxit = 100)) } sp_data_site[, "FITTED"] <- predict.glm(glm_obj_site, newdata = sp_data_site, type = "response") sp_data_site[, "COUNT_IMPUTED"] <- sp_data_site$COUNT sp_data_site[is.na(sp_data_site$COUNT), "COUNT_IMPUTED"] <- sp_data_site$FITTED[is.na(sp_data_site$COUNT)] # add fitted value for missing mid-week data sp_data_site <- sp_data_site[!paste(sp_data_site$DAY_WEEK, sp_data_site$COUNT) %in% c("1 NA", "2 NA", "3 NA", "5 NA", "6 NA", "7 NA"), ] # remove all added mid-week values for weeks with real count # (observation) sp_data_site$site_week <- paste(sp_data_site$SITE, sp_data_site$WEEK, sep = "_") siteweekcount <- aggregate(sp_data_site$COUNT, by = list(sp_data_site$site_week), function(x) sum(!is.na(x)) > 0) sp_data_site <- sp_data_site[!(is.na(sp_data_site$COUNT) & (sp_data_site$site_week %in% siteweekcount$Group.1[siteweekcount$x == TRUE])), names(sp_data_site) != "site_week"] # Compute the regional gam index print(paste("Compute index for",sp_data_site$SPECIES[1],"at year", y,"for",length(unique(sp_data_site$SITE)),"sites:",Sys.time())) regional_gam_index <- trap_index(sp_data_site, data_col = "COUNT_IMPUTED", time_col = "DAYNO", by_col = c("SPECIES", "SITE", "YEAR")) cumu_index <- merge(regional_gam_index, pro_count_agg, by = c("SITE"), all.x = TRUE, sort = FALSE) names(cumu_index) <- c("SITE", "SPECIES", "YEAR", "regional_gam", "prop_pheno_sampled") cumu_index <- cumu_index[order(cumu_index$SITE), ] # bind if exist else create if ("cumullated_indices" %in% ls()) { cumullated_indices <- rbind(cumullated_indices, cumu_index) } else { cumullated_indices <- cumu_index } } return(cumullated_indices) }