comparison evaluate_aggregation.R @ 91:b0b172279433 draft

Uploaded

| author | nicolas |
|---|---|
| date | Mon, 31 Oct 2016 04:53:14 -0400 |
| parents | |
| children | |
Comparison between revisions 90:634533b40622 and 91:b0b172279433; the file content at revision 91 follows.
```r
########################################################
#
# creation date : 10/10/16
# last modification : 29/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library("miscTools")
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
library(rrBLUP)
options(warn=-1)
############################ helper functions #######################

##### classifiers
prediction <- function(genotype, model, classifier="unknown") {
  # run prediction according to the classifier
  switch(classifier,
         rrBLUP={
           predictions <- as.matrix(genotype) %*% as.matrix(model$u);
           predictions <- predictions[,1]+model$beta;
         },
         rf={
           predictions <- predict(model, genotype);
         },
         svm={
           predictions <- predict(model, genotype);
         },
         lasso={
           predictions <- predict(model, as.matrix(genotype), type = "response");
         },
         {warning("unknown classifier, please choose among the following: rrBLUP, rf, svm, lasso")})
  return(predictions)
}
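
# Example (not run): a minimal sketch of calling prediction() with an rrBLUP
# model; geno and pheno are hypothetical toy data, not objects defined in
# this script.
if (FALSE) {
  geno <- matrix(sample(c(-1, 0, 1), 50*20, replace=TRUE), nrow=50)
  pheno <- rnorm(50)
  model <- mixed.solve(pheno, Z=geno, K=NULL, SE=FALSE, return.Hinv=FALSE)
  pred <- prediction(geno, model, classifier="rrBLUP")
}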

# extract parameters from a model, excluding rrBLUP which auto-optimizes
extractParameter <- function(model, classifierName) {
  param <- NULL
  switch(classifierName,
         # random forest
         rf={
           param <- model$ntree
           param <- c(param, list(model$mtry))
           names(param) <- c("ntree", "mtry")
         },
         # svm
         svm={
           param <- as.numeric(model$cost)
           param <- c(param, list(model$gamma))
           param <- c(param, list(model$coef0))
           param <- c(param, list(model$degree))
           param <- c(param, list(model$kernel))
           names(param) <- c("c", "g", "coef", "d", "kernel")
           # model$kernel is an integer code: 0=linear, 1=polynomial, 2=radial, 3=sigmoid
           switch((model$kernel+1),
                  param$kernel <- "linear",
                  param$kernel <- "polynomial",
                  param$kernel <- "radial",
                  param$kernel <- "sigmoid"
           )
         },
         # lasso
         lasso={
           param <- as.list(model$lambda)
           names(param) <- "lambda"
         },
         stop("unknown classifier, please choose among rf, svm, lasso")
  )
  return(param)
}
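
# Example (not run): a sketch of recovering tuned hyper-parameters from a
# fitted e1071 svm model; toy data, for illustration only.
if (FALSE) {
  geno <- matrix(rnorm(50*10), nrow=50)
  pheno <- rnorm(50)
  model <- svm(x=geno, y=pheno, kernel="radial", cost=10, gamma=0.01)
  extractParameter(model, "svm")  # named list: c, g, coef, d, kernel
}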

##### Genetic algorithm

# compute R2 with the classic formula:
# compare the sum of squared differences between target and prediction (SSR)
# to the sum of squared differences between target and its mean (SST)
r2 <- function(target, prediction) {
  sst <- sum((target-mean(target))^2)
  ssr <- sum((target-prediction)^2)
  return(1-ssr/sst)
}
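
# Example (not run): R2 on toy numbers; a perfect prediction gives 1 and
# predicting the mean of the target gives 0.
if (FALSE) {
  obs <- c(1, 2, 3, 4)
  r2(obs, obs)                # 1
  r2(obs, rep(mean(obs), 4))  # 0
}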

optimizeOneIndividual <- function(values, trueValue) {
  # turn the candidate weights into a weighted-mean prediction
  f <- function(w) {sum(values * w/sum(w))}
  fitness <- function(x) {1/abs(trueValue-f(x))}
  resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)),
             maxiter = 1000, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
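
# Example (not run): search weights so that the weighted mean of three
# classifier outputs approaches one true value; toy numbers only.
if (FALSE) {
  opt <- optimizeOneIndividual(values=c(4.2, 5.0, 3.8), trueValue=4.5)
  opt@solution  # weights normalized to sum to 1
}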

optimizeWeight <- function(values, trueValue, n=1000) {
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)),
             maxiter = n, monitor = NULL, keepBest = T)
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}
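
# Example (not run): optimize weights over a matrix of per-sample classifier
# predictions (rows = samples, columns = classifiers); toy data.
if (FALSE) {
  preds <- cbind(rf=rnorm(30, mean=5), svm=rnorm(30, mean=5), lasso=rnorm(30, mean=5))
  truth <- rnorm(30, mean=5)
  opt <- optimizeWeight(values=preds, trueValue=truth, n=100)
  opt@solution
}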

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- NULL
  prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))
  return(prediction)
}

weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}
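
# Example (not run): the weighted prediction is a weight-normalized dot
# product; with weights (2, 1, 1) the first value counts twice as much.
if (FALSE) {
  weightedPrediction.vec(c(4, 6, 8), c(2, 1, 1))  # 4*0.5 + 6*0.25 + 8*0.25 = 5.5
}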

##### meta-decision tree

tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      # keep the cross-validated error and CP of the best pruning of this tree
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # in case of ties, keep the first optimum
  index <- which(xerror==min(xerror), arr.ind = T)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}
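
# Example (not run): tune minsplit/minbucket on a toy regression set, then fit
# the final tree with the returned control list.
if (FALSE) {
  X <- data.frame(a=rnorm(60), b=rnorm(60))
  y <- X$a + rnorm(60, sd=0.1)
  op <- tuneTree(X, y)
  fit <- rpart(target ~., data=data.frame(X, target=y), control=op, method="anova")
}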

###### meta-LASSO
# create folds by assigning each row index to a fold at random
createFolds <- function(nbObs, n) {
  # pick a fold for each observation
  index <- sample(1:n, size=nbObs, replace = T)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}
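
# Example (not run): split 20 observations into 5 random folds; fold sizes
# vary because assignment is sampled, they are not forced to be equal.
if (FALSE) {
  folds <- createFolds(20, 5)
  sapply(folds, length)  # sums to 20
}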

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      pred <- predict(model, as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, pred))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  return(as.numeric(names(acc)[which.max(acc)]))
}
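
# Example (not run): pick the elastic-net mixing parameter alpha by
# cross-validated R2 on toy genotype/phenotype data.
if (FALSE) {
  geno <- matrix(sample(0:2, 70*30, replace=TRUE), nrow=70)
  pheno <- rnorm(70)
  bestAlpha <- searchParamLASSO(geno, pheno, alpha=c(0, 0.5, 1), n=5)
}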

###### meta-random forest

searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  # hold out a third of the data to evaluate each forest size
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}
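
# Example (not run): choose the forest size over a small grid on toy data.
if (FALSE) {
  geno <- matrix(rnorm(60*15), nrow=60)
  pheno <- rnorm(60)
  bestNtree <- searchParamRF(geno, pheno, rangeNtree=c(100, 300, 500))
}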

###### meta-SVM
searchParamSVM <- function(train, target, kernel="radial") {
  # tune the kernel's parameters, then train on the best combination
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear");
           c <- tune$best.parameters[[1]];
           model <- svm(x=train, y=target, cost = c, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial");
           d <- tune$best.parameters[[1]];
           g <- tune$best.parameters[[2]];
           coef <- tune$best.parameters[[3]];
           c <- tune$best.parameters[[4]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)},
         # radial, by default
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial");
           g <- tune$best.parameters[[1]];
           c <- tune$best.parameters[[2]];
           model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")}
  )
  return(model)
}
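
# Example (not run): tune and fit a radial-kernel SVM regression on toy data,
# then reuse the returned model for prediction.
if (FALSE) {
  geno <- matrix(rnorm(50*10), nrow=50)
  pheno <- rnorm(50)
  model <- searchParamSVM(geno, pheno, kernel="radial")
  pred <- predict(model, geno)
}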

#################### upper level functions #####################

aggregateDT <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    treeParam <- tuneTree(train[[i]], target[trainIndex])
    data <- data.frame(train[[i]], target=target[trainIndex])
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    model <- prune(model, cp=treeParam$cp)
    pred <- predict(model, data.frame(test[[i]]))
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateGeneticMean <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    opt <- optimizeWeight(values = train[[i]], trueValue = target[trainIndex])
    pred <- weightedPrediction(test[[i]], opt@solution)
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateLASSO <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    alpha <- searchParamLASSO(train[[i]], target[trainIndex])
    cv <- cv.glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha)
    model <- glmnet(x=as.matrix(train[[i]]), y=target[trainIndex], alpha=alpha, lambda = cv$lambda.1se)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateRF <- function(train, test, target, folds) {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    ntree <- searchParamRF(genotype = train[[i]], phenotype = target[trainIndex],
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=as.matrix(train[[i]]), y=target[trainIndex],
                          ntree = ntree, mtry = ncol(train[[i]]))
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}

aggregateSVM <- function(train, test, target, folds, kernel="linear") {
  r2Aggreg <- NULL
  for (i in 1:length(folds)) {
    trainIndex <- unlist(folds[-i])
    testIndex <- folds[[i]]
    model <- searchParamSVM(train = train[[i]], target = target[trainIndex], kernel = kernel)
    pred <- predict(model, test[[i]])
    r2Aggreg <- c(r2Aggreg, r2(target[testIndex], pred))
  }
  return(r2Aggreg)
}
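
# Example (not run): every aggregate* function takes per-fold matrices of
# classifier predictions (train/test), the full phenotype vector and the fold
# index list, and returns one R2 per fold; shapes sketched on toy data.
if (FALSE) {
  folds <- createFolds(40, 4)
  trainPreds <- lapply(1:4, function(i) matrix(rnorm(3*length(unlist(folds[-i]))), ncol=3))
  testPreds <- lapply(1:4, function(i) matrix(rnorm(3*length(folds[[i]])), ncol=3))
  pheno <- rnorm(40)
  aggregateSVM(trainPreds, testPreds, target=pheno, folds=folds)
}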

################################### main #############################
# load arguments
cmd <- commandArgs(T)
source(cmd[1])
# the folds, genotype and *Model arguments are text files whose first line
# gives the path to the actual data (RDS object or table)
# load folds
con = file(folds)
folds <- readLines(con = con, n = 1, ok=T)
close(con)
folds <- readRDS(folds)
# phenotype is written as a table (in columns) but it must be passed as a vector to mixed.solve
phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
# load genotype
con = file(genotype)
genotype <- readLines(con = con, n = 1, ok=T)
close(con)
genotype <- read.table(genotype, sep="\t", h=T)
# find which classifiers will be used for aggregation
classifNames <- NULL
if(lassoModel !="None"){
  classifNames <- c(classifNames, "lasso")
  con = file(lassoModel)
  lassoModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  lassoModel <- readRDS(lassoModel)
}
if(rrBLUPModel !="None"){
  classifNames <- c(classifNames, "rrBLUP")
  con = file(rrBLUPModel)
  rrBLUPModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rrBLUPModel <- readRDS(rrBLUPModel)
}
if(rfModel !="None"){
  classifNames <- c(classifNames, "rf")
  con = file(rfModel)
  rfModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  rfModel <- readRDS(rfModel)
}
if(svmModel !="None"){
  classifNames <- c(classifNames, "svm")
  con = file(svmModel)
  svmModel <- readLines(con = con, n = 1, ok=T)
  close(con)
  svmModel <- readRDS(svmModel)
}
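
# Example (not run): a sketch of the configuration file passed as first
# command-line argument and loaded by source(cmd[1]); these variable names are
# the ones the script reads, the paths are hypothetical.
if (FALSE) {
  folds <- "folds.txt"          # first line: path to an RDS list of fold indexes
  phenotype <- "phenotype.tsv"  # tab-separated table, first column is used
  genotype <- "genotype.txt"    # first line: path to a tab-separated table
  lassoModel <- "None"
  rrBLUPModel <- "rrblup.txt"   # first line: path to an RDS model
  rfModel <- "None"
  svmModel <- "None"
  method <- "geneticMean"       # or dt, lasso, rf; anything else falls back to svm
  kernel <- "linear"            # used only by the svm aggregation
}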

# compute predictions on the training set and test set of each fold, for each classifier
# train predictions and test predictions are stored in separate lists
# where each element of the list represents a fold
predictionTrain.list <- NULL
predictionTest.list <- NULL
r2Classif.list <- NULL
for(i in 1:length(folds)) {
  # for the current fold, create training set and test set
  trainIndex <- unlist(folds[-i])
  testIndex <- folds[[i]]
  train <- genotype[trainIndex,]
  phenoTrain <- phenotype[trainIndex]
  test <- genotype[testIndex,]
  phenoTest <- phenotype[testIndex]
  # initialize the matrices containing the predictions
  predictionTrain <- matrix(rep(-1, length(classifNames)*length(trainIndex)),
                            ncol=length(classifNames))
  colnames(predictionTrain) <- classifNames
  predictionTest <- matrix(rep(-1, length(classifNames)*length(testIndex)),
                           ncol=length(classifNames))
  colnames(predictionTest) <- classifNames
  r2Classif <- NULL
  # for each classifier, compute predictions on both sets
  # and evaluate r2 to find the best classifier
  for(j in 1:length(classifNames)) {
    switch(classifNames[j],
           # random forest
           rf={
             # predict train and test
             param <- extractParameter(rfModel, "rf")
             model <- randomForest(x=train, y=phenoTrain, mtry = param$mtry,
                                   ntree = param$ntree);
             predictionTrain[,j] <- prediction(train, model, classifier = "rf");
             predictionTest[,j] <- prediction(test, model, classifier = "rf");
             r2Classif <- c(r2Classif, rf=r2(phenoTest, predictionTest[,"rf"]))},
           # svm
           svm={
             # predict train and test
             param <- extractParameter(svmModel, "svm");
             model <- svm(train, phenoTrain, kernel = param$kernel, cost = param$c,
                          gamma=param$g, degree = param$d, coef0 = param$coef, scale = F)
             predictionTrain[,j] <- prediction(train, model, classifier = "svm");
             predictionTest[,j] <- prediction(test, model, classifier = "svm");
             r2Classif <- c(r2Classif, svm=r2(phenoTest, predictionTest[,"svm"]))},
           # lasso
           lasso={
             # predict train and test
             param <- extractParameter(lassoModel, "lasso");
             model <- glmnet(x=as.matrix(train), y=phenoTrain, lambda = param$lambda);
             predictionTrain[,j] <- prediction(train, model, classifier = "lasso");
             predictionTest[,j] <- prediction(test, model, classifier = "lasso");
             r2Classif <- c(r2Classif, lasso=r2(phenoTest, predictionTest[,"lasso"]))},
           # rrBLUP
           rrBLUP={
             # predict train and test
             model <- mixed.solve(phenoTrain, Z=train, K=NULL, SE=F, return.Hinv = F);
             predictionTrain[,j] <- prediction(train, model, classifier = "rrBLUP");
             predictionTest[,j] <- prediction(test, model, classifier = "rrBLUP");
             r2Classif <- c(r2Classif, rrBLUP=r2(phenoTest, predictionTest[,"rrBLUP"]))},
           stop("unknown classifier, please choose among rf, svm, lasso, rrBLUP")
    )
  }
  predictionTrain.list <- c(predictionTrain.list, list(predictionTrain))
  predictionTest.list <- c(predictionTest.list, list(predictionTest))
  r2Classif.list <- c(r2Classif.list, list(r2Classif))
}
# aggregate !
switch(method,
       geneticMean={
         aggreg <- aggregateGeneticMean(train=predictionTrain.list, test=predictionTest.list,
                                        target = phenotype, folds=folds)
       },
       dt={
         aggreg <- aggregateDT(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       lasso={
         aggreg <- aggregateLASSO(train=predictionTrain.list, test=predictionTest.list,
                                  target = phenotype, folds=folds)
       },
       rf={
         aggreg <- aggregateRF(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds)
       },
       # svm, by default
       {aggreg <- aggregateSVM(train=predictionTrain.list, test=predictionTest.list,
                               target = phenotype, folds=folds, kernel=kernel)}
)
# determine the best single classifier
# first, transform the list into a matrix (one row per fold)
r2Classif.list <- t(data.frame(r2Classif.list))
# then, compute the mean r2 for each classifier
meanR2Classif <- apply(r2Classif.list, 2, mean)
# choose the best one
bestClassif <- which.max(meanR2Classif)
# compare aggregation against the best single classifier
finalRes <- cbind(bestClassif=r2Classif.list[,bestClassif], aggreg=aggreg,
                  diff=(aggreg-r2Classif.list[,bestClassif]))
print(apply(finalRes, 2, mean))
```
