Mercurial > repos > nicolas > oghma
changeset 59:3eaf7510a384 draft
Uploaded
| author | nicolas | 
|---|---|
| date | Wed, 26 Oct 2016 18:21:49 -0400 | 
| parents | 8d8326d74f04 | 
| children | 50894fc3c231 | 
| files | aggregation.R | 
| diffstat | 1 files changed, 320 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aggregation.R Wed Oct 26 18:21:49 2016 -0400 @@ -0,0 +1,320 @@ +######################################################## +# +# creation date : 25/10/16 +# last modification : 25/10/16 +# author : Dr Nicolas Beaume +# +######################################################## + +suppressWarnings(suppressMessages(library(GA))) +library("miscTools") +library(rpart) +suppressWarnings(suppressMessages(library(randomForest))) +library(e1071) +suppressWarnings(suppressMessages(library(glmnet))) +options(warn=-1) +############################ helper functions ####################### + +##### Genetic algorithm + +# compute r2 by computing the classic formula +# compare the sum of square difference from target to prediciton +# to the sum of square difference from target to the mean of the target +r2 <- function(target, prediction) { + sst <- sum((target-mean(target))^2) + ssr <- sum((target-prediction)^2) + return(1-ssr/sst) +} + +optimizeOneIndividual <- function(values, trueValue) { + # change the value into a function + f <- function(w) {sum(values * w/sum(w))} + fitness <- function(x) {1/abs(trueValue-f(x))} + resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), + maxiter = 1000, monitor = NULL, keepBest = T) + resp@solution <- resp@solution/sum(resp@solution) + return(resp) +} + +optimizeWeight <- function(values, trueValue, n=1000) { + fitnessAll <- function(w) { + predicted <- apply(values, 1, weightedPrediction.vec, w) + return(mean(r2(trueValue, predicted))) + #return(mean(1/abs(trueValue-predicted))) + } + resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), + maxiter = n, monitor = NULL, keepBest = T) + resp@solution <- resp@solution/sum(resp@solution) + return(resp) +} + +weightedPrediction <- function(classifiers, w) { + if(length(w) > ncol(classifiers)) { + warning("more weights than classifiers, extra weigths are ignored") + w <- w[1:ncol(classifiers)] + } else if(length(w) < ncol(classifiers)) { + warning("less weights than classifiers, extra classifiers are ignored") + classifiers <- classifiers[,1:length(w)] + } + prediction <- NULL + prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) + return(prediction) +} + +weightedPrediction.vec <- function(values, w) { + return(sum(values * w/sum(w))) +} + +##### meta-decision tree + +tuneTree <- function(data, target) { + data <- data.frame(data, target=target) + size <- nrow(data) + xerror <- NULL + split <- 1:ceiling(size/5) + leafSize <- 1:ceiling(size/10) + xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) + cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) + for(i in 1:length(split)) { + for(j in 1:length(leafSize)) { + op <- list(minsplit=split[i], minbucket=leafSize[j]) + tree <- rpart(target ~., data=data, control=op, method="anova") + xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] + cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] + } + } + index <- which(xerror==min(xerror), arr.ind = T) + op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) + return(op) +} + +###### meta-LASSO +# create fold by picking at random row indexes +createFolds <- function(nbObs, n) { + # pick indexes + index <- sample(1:n, size=nbObs, replace = T) + # populate folds + folds <- NULL + for(i in 1:n) { + folds <- c(folds, list(which(index==i))) + } + return(folds) +} + +searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { + folds <- createFolds(nrow(genotype), n = n) + acc <- NULL + indexAlpha <- 1 + for(a in alpha) { + curAcc <- NULL + for(i in 1:n) { + train <- genotype[-folds[[i]],] + test <- genotype[folds[[i]],] + phenoTrain <- phenotype[-folds[[i]]] + phenoTest <- phenotype[folds[[i]]] + cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) + model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) + pred <- predict(model, test, type = "response") + curAcc <- c(curAcc, r2(phenoTest, pred)) + } + acc <- c(acc, mean(curAcc)) + } + names(acc) <- alpha + return(as.numeric(names(acc)[which.max(acc)])) +} + +###### meta-random forest + +searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { + n <- ceiling(nrow(genotype)/3) + indexTest <- sample(1:nrow(genotype), size=n) + train <- genotype[-indexTest,] + test <- genotype[indexTest,] + phenoTrain <- phenotype[-indexTest] + phenoTest <- phenotype[indexTest] + acc <- NULL + indexNtree <- 1 + for(ntree in rangeNtree) { + model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) + pred <- predict(model, test) + acc <- c(acc, r2(phenoTest, pred)) + } + names(acc) <- rangeNtree + best <- which.max(acc) + return(as.numeric(names(acc)[best])) +} + +###### meta-SVM +searchParamSVM <- function(train, target, kernel="radial") { + # tuning parameters then train + model <- NULL + switch(kernel, + sigmoid={ + tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); + g <- tune$best.parameters[[1]]; + c <- tune$best.parameters[[2]]; + model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, + linear={ + tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); + c <- tune$best.parameters[[1]]; + model <- svm(x=train, y=target, cost = c, kernel = "linear")}, + polynomial={ + tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); + d <- tune$best.parameters[[1]]; + g <- tune$best.parameters[[2]]; + coef <- tune$best.parameters[[3]]; + c <- tune$best.parameters[[4]]; + model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, + { + tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); + g <- tune$best.parameters[[1]]; + c <- tune$best.parameters[[2]]; + model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} + ) + return(model) +} + +#################### upper level functions ##################### + +aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) { + if(!prediction) { + treeParam <- tuneTree(classifiers, target) + data <- data.frame(classifiers, target) + model <- rpart(target ~., data=data, method = "anova", control = treeParam) + model <- prune(model, cp=treeParam["cp"]) + out <- paste(out, ".rds", sep = "") + saveRDS(model, out) + return(out) + } else { + return(predict(model, classifiers), out) + } +} + +aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){ + if(!prediction) { + opt <- optimizeWeight(values = classifiers, trueValue = target) + out <- paste(out, ".rds", sep = "") + saveRDS(opt@solution, out) + return(out) + } else { + return(weightedPrediction.vec(classifiers, model)) + } +} + +aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) { + if(!prediction) { + alpha <- searchParamLASSO(classifiers, target) + cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha) + model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se) + out <- paste(out, ".rds", sep = "") + saveRDS(model, out) + return(out) + } else { + return(predict(model, classifiers), out) + } +} + +aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) { + if(!prediction) { + ntree <- searchParamRF(genotype = classifiers, phenotype = target, + rangeNtree = seq(100, 1000, 100)) + model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers)) + out <- paste(out, ".rds", sep = "") + saveRDS(model, out) + return(out) + } else { + return(predict(model, classifiers), out) + } +} + +aggregateSVM <- function(classifiers, target=NULL, prediction=F, + model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) { + if(!prediction) { + model <- searchParamSVM(train = classifiers, target = target, kernel = kernel) + out <- paste(out, ".rds", sep = "") + saveRDS(model, out) + return(out) + } else { + return(predict(model, classifiers), out) + } +} + +################################### main ############################# +# # load argument +cmd <- commandArgs(T) +source(cmd[1]) +# check if evaluation is required +evaluation <- F +if(as.integer(doEvaluation) == 1) { + evaluation <- T + con = file(folds) + folds <- readLines(con = con, n = 1, ok=T) + close(con) + folds <- readRDS(folds) +} +# check for model +if(model == "None") { + model <- NULL + prediction <- F +} else { + prediction <- T + con = file(model) + model <- readLines(con = con, n = 1, ok=T) + close(con) + model <- readRDS(model) +} +# load classifiers and phenotype +classifiers <- NULL +classifNames <- NULL +if(lassoPred !="None"){ + classifiers <- c(classifiers, lassoPred) + classifNames <- c(classifNames, "lasso") +} +if(rrBLUPPred !="None"){ + classifiers <- c(classifiers, rrBLUPPred) + classifNames <- c(classifNames, "rrBLUP") +} +if(rfPred !="None"){ + classifiers <- c(classifiers, rfPred) + classifNames <- c(classifNames, "rf") +} +if(svmPred !="None"){ + classifiers <- c(classifiers, svmPred) + classifNames <- c(classifNames, "svm") +} +classifPrediction <- NULL +for(classif in classifiers) { + classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T))) +} +classifPrediction <- data.frame(classifPrediction) +colnames(classifPrediction) <- classifNames +# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve +phenotype <- read.table(phenotype, sep="\t", h=T)[,1] +# aggregate ! +switch(method, + geneticMean={ + res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype, + out = out, prediction = prediction, model=model) + }, + dt={ + res <- aggregateDT(classifiers = classifPrediction, target = phenotype, + out = out, prediction = prediction, model=model) + }, + lasso={ + res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype, + out = out, prediction = prediction, model=model) + }, + rf={ + res <- aggregateRF(classifiers = classifPrediction, target = phenotype, + out = out, prediction = prediction, model=model) + }, + # svm + {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel, + out = out, prediction = prediction, model = model)} + ) +if(prediction) { + write.table(data.frame(lines=rownames(classifPrediction), res), paste(out,".csv", sep=""), + sep="\t", row.names = F) +} else { + cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) +} \ No newline at end of file
