comparison evaluate_aggregation.R @ 91:b0b172279433 draft

Uploaded

| author | nicolas |
|---|---|
| date | Mon, 31 Oct 2016 04:53:14 -0400 |
| parents | |
| children | |
Comparison between revisions 90:634533b40622 and 91:b0b172279433; the file content at revision 91 follows.
```r
########################################################
#
# creation date : 10/10/16
# last modification : 29/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
library(rrBLUP)
options(warn=-1)
############################ helper functions #######################

##### classifiers
prediction <- function(genotype, model, classifier="unknown") {
  # run prediction according to the classifier
  switch(classifier,
         rrBLUP={
           predictions <- as.matrix(genotype) %*% as.matrix(model$u);
           predictions <- predictions[,1]+model$beta;
         },
         rf={
           predictions <- predict(model, genotype);
         },
         svm={
           predictions <- predict(model, genotype);
         },
         lasso={
           predictions <- predict(model, as.matrix(genotype), type = "response");
         },
         {warning("unknown classifier, please choose among the following: rrBLUP, rf, svm, lasso")})
  return(predictions)
}
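
# Example (not run): a minimal sketch of calling prediction() with an rrBLUP
# model; geno and pheno are hypothetical toy data, not objects defined in
# this script.
if (FALSE) {
  geno <- matrix(sample(c(-1, 0, 1), 50*20, replace=TRUE), nrow=50)
  pheno <- rnorm(50)
  model <- mixed.solve(pheno, Z=geno, K=NULL, SE=FALSE, return.Hinv=FALSE)
  pred <- prediction(geno, model, classifier="rrBLUP")
}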

# extract parameters from a model, excluding rrBLUP which auto-optimizes
extractParameter <- function(model, classifierName) {
  param <- NULL
  switch(classifierName,
         # random forest
         rf={
           param <- model$ntree
           param <- c(param, list(model$mtry))
           names(param) <- c("ntree", "mtry")
         },
         # svm
         svm={
           param <- as.numeric(model$cost)
           param <- c(param, list(model$gamma))
           param <- c(param, list(model$coef0))
           param <- c(param, list(model$degree))
           param <- c(param, list(model$kernel))
           names(param) <- c("c", "g", "coef", "d", "kernel")
           # model$kernel is an integer code: 0=linear, 1=polynomial, 2=radial, 3=sigmoid
           switch((model$kernel+1),
                  param$kernel <- "linear",
                  param$kernel <- "polynomial",
                  param$kernel <- "radial",
                  param$kernel <- "sigmoid"
           )
         },
         # lasso
         lasso={
           param <- as.list(model$lambda)
           names(param) <- "lambda"
         },
         stop("unknown classifier, please choose among rf, svm, lasso")
  )
  return(param)
}
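
# Example (not run): a sketch of recovering tuned hyper-parameters from a
# fitted e1071 svm model; toy data, for illustration only.
if (FALSE) {
  geno <- matrix(rnorm(50*10), nrow=50)
  pheno <- rnorm(50)
  model <- svm(x=geno, y=pheno, kernel="radial", cost=10, gamma=0.01)
  extractParameter(model, "svm")  # named list: c, g, coef, d, kernel
}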

##### Genetic algorithm

# compute R2 with the classic formula:
# compare the sum of squared differences between target and prediction (SSR)
# to the sum of squared differences between target and its mean (SST)
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}
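
# Example (not run): R2 on toy numbers; a perfect prediction gives 1 and
# predicting the mean of the target gives 0.
if (FALSE) {
  obs <- c(1, 2, 3, 4)
  r2(obs, obs)                # 1
  r2(obs, rep(mean(obs), 4))  # 0
}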

optimizeOneIndividual <- function(values, trueValue) {
  # turn the candidate weights into a weighted-mean prediction
  f <- function(w) {sum(values * w/sum(w))}
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
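
# Example (not run): search weights so that the weighted mean of three
# classifier outputs approaches one true value; toy numbers only.
if (FALSE) {
  opt <- optimizeOneIndividual(values=c(4.2, 5.0, 3.8), trueValue=4.5)
  opt@solution  # weights normalized to sum to 1
}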

optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
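
# Example (not run): optimize weights over a matrix of per-sample classifier
# predictions (rows = samples, columns = classifiers); toy data.
if (FALSE) {
  preds <- cbind(rf=rnorm(30, mean=5), svm=rnorm(30, mean=5), lasso=rnorm(30, mean=5))
  truth <- rnorm(30, mean=5)
  opt <- optimizeWeight(values=preds, trueValue=truth, n=100)
  opt@solution
}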

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- NULL
  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
  return(prediction)
}

weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}
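
# Example (not run): the weighted prediction is a weight-normalized dot
# product; with weights (2, 1, 1) the first value counts twice as much.
if (FALSE) {
  weightedPrediction.vec(c(4, 6, 8), c(2, 1, 1))  # 4*0.5 + 6*0.25 + 8*0.25 = 5.5
}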

##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      # keep the cross-validated error and CP of the best pruning of this tree
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # in case of ties, keep the first optimum
  index <- which(xerror==min(xerror), arr.ind = T)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}
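
# Example (not run): tune minsplit/minbucket on a toy regression set, then fit
# the final tree with the returned control list.
if (FALSE) {
  X <- data.frame(a=rnorm(60), b=rnorm(60))
  y <- X$a + rnorm(60, sd=0.1)
  op <- tuneTree(X, y)
  fit <- rpart(target ~., data=data.frame(X, target=y), control=op, method="anova")
}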

###### meta-LASSO
# create folds by assigning each row index to a fold at random
createFolds <- function(nbObs, n) {
  # pick a fold for each observation
  index <- sample(1:n, size=nbObs, replace = T)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}
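
# Example (not run): split 20 observations into 5 random folds; fold sizes
# vary because assignment is sampled, they are not forced to be equal.
if (FALSE) {
  folds <- createFolds(20, 5)
  sapply(folds, length)  # sums to 20
}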

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}
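
# Example (not run): pick the elastic-net mixing parameter alpha by
# cross-validated R2 on toy genotype/phenotype data.
if (FALSE) {
  geno <- matrix(sample(0:2, 70*30, replace=TRUE), nrow=70)
  pheno <- rnorm(70)
  bestAlpha <- searchParamLASSO(geno, pheno, alpha=c(0, 0.5, 1), n=5)
}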

###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  # hold out a third of the data to evaluate each forest size
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}
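
# Example (not run): choose the forest size over a small grid on toy data.
if (FALSE) {
  geno <- matrix(rnorm(60*15), nrow=60)
  pheno <- rnorm(60)
  bestNtree <- searchParamRF(geno, pheno, rangeNtree=c(100, 300, 500))
}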

###### meta-SVM
searchParamSVM <- function(train, target, kernel="radial") {
  # tune the kernel's parameters, then train on the best combination
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear");
           c <- tune$best.parameters[[1]];
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
           d <- tune$best.parameters[[1]];
           g <- tune$best.parameters[[2]];
           coef <- tune$best.parameters[[3]];
           c <- tune$best.parameters[[4]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         # radial, by default
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}
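
# Example (not run): tune and fit a radial-kernel SVM regression on toy data,
# then reuse the returned model for prediction.
if (FALSE) {
  geno <- matrix(rnorm(50*10), nrow=50)
  pheno <- rnorm(50)
  model <- searchParamSVM(geno, pheno, kernel="radial")
  pred <- predict(model, geno)
}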

#################### upper level functions #####################

aggregateDT <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    treeParam <- tuneTree(train[[i]], target[trainIndex])
    data <- data.frame(train[[i]], target=target[trainIndex])
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    model <- prune(model, cp=treeParam$cp)
    pred <- predict(model, data.frame(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateGeneticMean <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
    pred <- weightedPrediction(test[[i]], opt@solution)
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateLASSO <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    alpha <- searchParamLASSO(train[[i]], target[trainIndex])
    cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
    model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateRF <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
                          ntree = ntree, mtry = ncol(train[[i]]))
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateSVM <- function(train, test, target, folds, kernel="linear") {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}
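
# Example (not run): every aggregate* function takes per-fold matrices of
# classifier predictions (train/test), the full phenotype vector and the fold
# index list, and returns one R2 per fold; shapes sketched on toy data.
if (FALSE) {
  folds <- createFolds(40, 4)
  trainPreds <- lapply(1:4, function(i) matrix(rnorm(3*length(unlist(folds[-i]))), ncol=3))
  testPreds <- lapply(1:4, function(i) matrix(rnorm(3*length(folds[[i]])), ncol=3))
  pheno <- rnorm(40)
  aggregateSVM(trainPreds, testPreds, target=pheno, folds=folds)
}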

################################### main #############################
# load arguments
cmd <- commandArgs(T)
source(cmd[1])
# the folds, genotype and *Model arguments are text files whose first line
# gives the path to the actual data (RDS object or table)
# load folds
con = file(folds)
folds <- readLines(con = con, n = 1, ok=T)
close(con)
folds <- readRDS(folds)
# phenotype is written as a table (in columns) but it must be passed as a vector to mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# load genotype
con = file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
# find which classifiers will be used for aggregation
classifNames <- NULL
if(lassoModel !="None"){
  classifNames <- c(classifNames, "lasso")
  con = file(lassoModel)
  lassoModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  lassoModel <- readRDS(lassoModel)
}
if(rrBLUPModel !="None"){
  classifNames <- c(classifNames, "rrBLUP")
  con = file(rrBLUPModel)
  rrBLUPModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rrBLUPModel <- readRDS(rrBLUPModel)
}
if(rfModel !="None"){
  classifNames <- c(classifNames, "rf")
  con = file(rfModel)
  rfModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rfModel <- readRDS(rfModel)
}
if(svmModel !="None"){
  classifNames <- c(classifNames, "svm")
  con = file(svmModel)
  svmModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  svmModel <- readRDS(svmModel)
}
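
# Example (not run): a sketch of the configuration file passed as first
# command-line argument and loaded by source(cmd[1]); these variable names are
# the ones the script reads, the paths are hypothetical.
if (FALSE) {
  folds <- "folds.txt"          # first line: path to an RDS list of fold indexes
  phenotype <- "phenotype.tsv"  # tab-separated table, first column is used
  genotype <- "genotype.txt"    # first line: path to a tab-separated table
  lassoModel <- "None"
  rrBLUPModel <- "rrblup.txt"   # first line: path to an RDS model
  rfModel <- "None"
  svmModel <- "None"
  method <- "geneticMean"       # or dt, lasso, rf; anything else falls back to svm
  kernel <- "linear"            # used only by the svm aggregation
}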

# compute predictions on the training set and test set of each fold, for each classifier
# train predictions and test predictions are stored in separate lists
# where each element of the list represents a fold
predictionTrain.list <- NULL
predictionTest.list <- NULL
r2Classif.list <- NULL
for(i in 1:length(folds)) {
  # for the current fold, create training set and test set
  trainIndex <- unlist(folds[-i])
  testIndex <- folds[[i]]
  train <- genotype[trainIndex,]
  phenoTrain <- phenotype[trainIndex]
  test <- genotype[testIndex,]
  phenoTest <- phenotype[testIndex]
  # initialize the matrices containing the predictions
  predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)),
                            ncol=length(classifNames))
  colnames(predictionTrain) <- classifNames
  predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)),
                           ncol=length(classifNames))
  colnames(predictionTest) <- classifNames
  r2Classif <- NULL
  # for each classifier, compute predictions on both sets
  # and evaluate r2 to find the best classifier
  for(j in 1:length(classifNames)) {
    switch(classifNames[j],
           # random forest
           rf={
             # predict train and test
             param <- extractParameter(rfModel, "rf")
             model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry,
                                   ntree = param$ntree);
             predictionTrain[,j] <- prediction(train, model, classifier = "rf");
             predictionTest[,j] <- prediction(test, model, classifier = "rf");
             r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
           # svm
           svm={
             # predict train and test
             param <- extractParameter(svmModel, "svm");
             model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
                          gamma=param$g, degree = param$d, coef0 = param$coef, scale = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "svm");
             predictionTest[,j] <- prediction(test, model, classifier = "svm");
             r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
           # lasso
           lasso={
             # predict train and test
             param <- extractParameter(lassoModel, "lasso");
             model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda);
             predictionTrain[,j] <- prediction(train, model, classifier = "lasso");
             predictionTest[,j] <- prediction(test, model, classifier = "lasso");
             r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
           # rrBLUP
           rrBLUP={
             # predict train and test
             model <- mixed.solve(phenoTrain, Z=train, K=NULL, SE=F, return.Hinv = F);
             predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP");
             predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP");
             r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
           stop("unknown classifier, please choose among rf, svm, lasso, rrBLUP")
    )
  }
  predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
  predictionTest.list <- c(predictionTest.list, list(predictionTest))
  r2Classif.list <- c(r2Classif.list, list(r2Classif))
}
# aggregate !
switch(method,
       geneticMean={
         aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
                                        target = phenotype, folds=folds)
       },
       dt={
         aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       lasso={
         aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
                                  target = phenotype, folds=folds)
       },
       rf={
         aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       # svm, by default
       {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds, kernel=kernel)}
)
# determine the best single classifier
# first, transform the list into a matrix (one row per fold)
r2Classif.list <- t(data.frame(r2Classif.list))
# then, compute the mean r2 for each classifier
meanR2Classif <- apply(r2Classif.list, 2, mean)
# choose the best one
bestClassif <- which.max(meanR2Classif)
# compare aggregation against the best single classifier
finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
                  diff=(aggreg-r2Classif.list[,bestClassif]))
print(apply(finalRes, 2, mean))
```
