Galaxy |

Changeset 53:ed9ad79143a7 (2016-10-26)

Previous changeset 52:b2dcac8ab1d7 (2016-10-26) Next changeset 54:100b17358190 (2016-10-26)

Commit message:
Uploaded

added:
aggregation.R

diff -r b2dcac8ab1d7 -r ed9ad79143a7 aggregation.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aggregation.R Wed Oct 26 18:05:28 2016 -0400

[

b'@@ -0,0 +1,308 @@\n+########################################################\n+#\n+# creation date : 25/10/16\n+# last modification : 25/10/16\n+# author : Dr Nicolas Beaume\n+#\n+########################################################\n+\n+suppressWarnings(suppressMessages(library(GA)))\n+library("miscTools")\n+library(rpart)\n+suppressWarnings(suppressMessages(library(randomForest)))\n+library(e1071)\n+suppressWarnings(suppressMessages(library(glmnet)))\n+options(warn=-1)\n+############################ helper functions #######################\n+\n+##### Genetic algorithm\n+\n+# compute r2 by computing the classic formula\n+# compare the sum of square difference from target to prediciton\n+# to the sum of square difference from target to the mean of the target\n+r2 <- function(target, prediction) {\n+ sst <- sum((target-mean(target))^2)\n+ ssr <- sum((target-prediction)^2)\n+ return(1-ssr/sst)\n+}\n+\n+optimizeOneIndividual <- function(values, trueValue) {\n+ # change the value into a function\n+ f <- function(w) {sum(values * w/sum(w))}\n+ fitness <- function(x) {1/abs(trueValue-f(x))}\n+ resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), \n+ maxiter = 1000, monitor = NULL, keepBest = T)\n+ resp@solution <- resp@solution/sum(resp@solution)\n+ return(resp)\n+}\n+\n+optimizeWeight <- function(values, trueValue, n=1000) {\n+ fitnessAll <- function(w) {\n+ predicted <- apply(values, 1, weightedPrediction.vec, w)\n+ return(mean(r2(trueValue, predicted)))\n+ #return(mean(1/abs(trueValue-predicted)))\n+ }\n+ resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), \n+ maxiter = n, monitor = NULL, keepBest = T)\n+ resp@solution <- resp@solution/sum(resp@solution)\n+ return(resp)\n+}\n+\n+weightedPrediction <- function(classifiers, w) {\n+ if(length(w) > ncol(classifiers)) {\n+ warning("more weights than classifiers, extra weigths are ignored")\n+ w <- w[1:ncol(classifiers)]\n+ } else if(length(w) < ncol(classifiers)) {\n+ warning("less weights than classifiers, extra classifiers are ignored")\n+ classifiers <- classifiers[,1:length(w)]\n+ }\n+ prediction <- NULL\n+ prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w))\n+ return(prediction)\n+}\n+\n+weightedPrediction.vec <- function(values, w) {\n+ return(sum(values * w/sum(w)))\n+}\n+\n+##### meta-decision tree\n+\n+tuneTree <- function(data, target) {\n+ data <- data.frame(data, target=target)\n+ size <- nrow(data)\n+ xerror <- NULL\n+ split <- 1:ceiling(size/5)\n+ leafSize <- 1:ceiling(size/10)\n+ xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))\n+ cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))\n+ for(i in 1:length(split)) {\n+ for(j in 1:length(leafSize)) {\n+ op <- list(minsplit=split[i], minbucket=leafSize[j])\n+ tree <- rpart(target ~., data=data, control=op, method="anova")\n+ xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]\n+ cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]\n+ }\n+ }\n+ index <- which(xerror==min(xerror), arr.ind = T)\n+ op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])\n+ return(op)\n+}\n+\n+###### meta-LASSO\n+# create fold by picking at random row indexes\n+createFolds <- function(nbObs, n) {\n+ # pick indexes\n+ index <- sample(1:n, size=nbObs, replace = T)\n+ # populate folds\n+ folds <- NULL\n+ for(i in 1:n) {\n+ folds <- c(folds, list(which(index==i)))\n+ }\n+ return(folds)\n+}\n+\n+searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {\n+ folds <- createFolds(nrow(genotype), n = n)\n+ acc <- NULL\n+ indexAlpha <- 1\n+ for(a in alpha) {\n+ curAcc <- NULL\n+ for(i in 1:n) {\n+ train <- genotype[-folds[[i]],]\n+ test <- genotype[folds[[i]],]\n+ phenoTrain <- phenotype[-folds[[i]]]\n+ phenoTest <- phenotype[folds[[i'..b'# evaluation of the method\n+ } else {\n+ write.table(weightedPrediction.vec(classifiers, model), out, sep="\\t")\n+ }\n+}\n+\n+aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) {\n+ if(!prediction) {\n+ alpha <- searchParamLASSO(classifiers, target)\n+ cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)\n+ model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)\n+ saveRDS(model, out)\n+ } else {\n+ write.table(predict(model, classifiers), out, sep="\\t")\n+ }\n+}\n+\n+aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) {\n+ if(!prediction) {\n+ ntree <- searchParamRF(genotype = classifiers, phenotype = target,\n+ rangeNtree = seq(100, 1000, 100))\n+ model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))\n+ saveRDS(model, out)\n+ } else {\n+ write.table(predict(model, classifiers), out, sep="\\t")\n+ }\n+}\n+\n+aggregateSVM <- function(classifiers, target=NULL, prediction=F, \n+ model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {\n+ if(!prediction) {\n+ model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)\n+ saveRDS(model, out)\n+ } else {\n+ write.table(predict(model, classifiers), out, sep="\\t")\n+ }\n+}\n+\n+################################### main #############################\n+# # load argument\n+cmd <- commandArgs(T)\n+source(cmd[1])\n+# check if evaluation is required\n+evaluation <- F\n+if(as.integer(doEvaluation) == 1) {\n+ evaluation <- T\n+ con = file(folds)\n+ folds <- readLines(con = con, n = 1, ok=T)\n+ close(con)\n+ folds <- readRDS(folds)\n+}\n+# check for model\n+if(model == "None") {\n+ model <- NULL\n+ prediction <- F\n+} else {\n+ prediction <- T\n+ con = file(model)\n+ model <- readLines(con = con, n = 1, ok=T)\n+ close(con)\n+ model <- readRDS(model)\n+}\n+# load classifiers and phenotype\n+classifiers <- NULL\n+classifNames <- NULL\n+if(lassoPred !="None"){\n+ classifiers <- c(classifiers, lassoPred)\n+ classifNames <- c(classifNames, "lasso")\n+}\n+if(rrBLUPPred !="None"){\n+ classifiers <- c(classifiers, rrBLUPPred)\n+ classifNames <- c(classifNames, "rrBLUP")\n+}\n+if(rfPred !="None"){\n+ classifiers <- c(classifiers, rfPred)\n+ classifNames <- c(classifNames, "rf")\n+}\n+if(svmPred !="None"){\n+ classifiers <- c(classifiers, svmPred)\n+ classifNames <- c(classifNames, "svm")\n+}\n+classifPrediction <- NULL\n+for(classif in classifiers) {\n+ classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\\t", h=T)))\n+}\n+classifPrediction <- data.frame(classifPrediction)\n+colnames(classifPrediction) <- classifNames\n+# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve\n+phenotype <- read.table(phenotype, sep="\\t", h=T)[,1]\n+out <- paste(out, ".rds", sep = "")\n+# aggregate !\n+switch(method,\n+ geneticMean={\n+ aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,\n+ out = out, prediction = prediction, model=model)\n+ },\n+ dt={\n+ aggregateDT(classifiers = classifPrediction, target = phenotype,\n+ out = out, prediction = prediction, model=model)\n+ },\n+ lasso={\n+ aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype,\n+ out = out, prediction = prediction, model=model)\n+ },\n+ rf={\n+ aggregateRF(classifiers = classifPrediction, target = phenotype,\n+ out = out, prediction = prediction, model=model)\n+ },\n+ # svm\n+ {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,\n+ out = out, prediction = prediction, model = model)}\n+ )\n+# return path of the result file to galaxy\n+cat(paste(out, "\\n", sep=""))\n\\ No newline at end of file\n'