Mercurial > repos > nicolas > oghma
changeset 61:339ae95a3d46 draft
Deleted selected files
| author | nicolas | 
|---|---|
| date | Wed, 26 Oct 2016 18:25:03 -0400 | 
| parents | 50894fc3c231 | 
| children | 613c81a4f7c0 | 
| files | aggregation.R aggregation.xml | 
| diffstat | 2 files changed, 0 insertions(+), 398 deletions(-) [+] | 
line wrap: on
 line diff
--- a/aggregation.R Wed Oct 26 18:22:03 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ -######################################################## -# -# creation date : 25/10/16 -# last modification : 25/10/16 -# author : Dr Nicolas Beaume -# -######################################################## - -suppressWarnings(suppressMessages(library(GA))) -library("miscTools") -library(rpart) -suppressWarnings(suppressMessages(library(randomForest))) -library(e1071) -suppressWarnings(suppressMessages(library(glmnet))) -options(warn=-1) -############################ helper functions ####################### - -##### Genetic algorithm - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} - -optimizeOneIndividual <- function(values, trueValue) { - # change the value into a function - f <- function(w) {sum(values * w/sum(w))} - fitness <- function(x) {1/abs(trueValue-f(x))} - resp <- ga(type = "real-valued", fitness = fitness, min = rep(0, length(values)), max = rep(1, length(values)), - maxiter = 1000, monitor = NULL, keepBest = T) - resp@solution <- resp@solution/sum(resp@solution) - return(resp) -} - -optimizeWeight <- function(values, trueValue, n=1000) { - fitnessAll <- function(w) { - predicted <- apply(values, 1, weightedPrediction.vec, w) - return(mean(r2(trueValue, predicted))) - #return(mean(1/abs(trueValue-predicted))) - } - resp <- ga(type = "real-valued", fitness = fitnessAll, min = rep(0, ncol(values)), max = rep(1, ncol(values)), - maxiter = n, monitor = NULL, keepBest = T) - resp@solution <- resp@solution/sum(resp@solution) - return(resp) -} - -weightedPrediction <- function(classifiers, w) { - if(length(w) > ncol(classifiers)) { - warning("more weights than classifiers, extra weigths are ignored") - w <- w[1:ncol(classifiers)] - } else if(length(w) < ncol(classifiers)) { - warning("less weights than classifiers, extra classifiers are ignored") - classifiers <- classifiers[,1:length(w)] - } - prediction <- NULL - prediction <- c(prediction, apply(classifiers, 1, weightedPrediction.vec, w)) - return(prediction) -} - -weightedPrediction.vec <- function(values, w) { - return(sum(values * w/sum(w))) -} - -##### meta-decision tree - -tuneTree <- function(data, target) { - data <- data.frame(data, target=target) - size <- nrow(data) - xerror <- NULL - split <- 1:ceiling(size/5) - leafSize <- 1:ceiling(size/10) - xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) - cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize)) - for(i in 1:length(split)) { - for(j in 1:length(leafSize)) { - op <- list(minsplit=split[i], minbucket=leafSize[j]) - tree <- rpart(target ~., data=data, control=op, method="anova") - xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"] - cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"] - } - } - index <- which(xerror==min(xerror), arr.ind = T) - op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]]) - return(op) -} - -###### meta-LASSO -# create fold by picking at random row indexes -createFolds <- function(nbObs, n) { - # pick indexes - index <- sample(1:n, size=nbObs, replace = T) - # populate folds - folds <- NULL - for(i in 1:n) { - folds <- c(folds, list(which(index==i))) - } - return(folds) -} - -searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) { - folds <- createFolds(nrow(genotype), n = n) - acc <- NULL - indexAlpha <- 1 - for(a in alpha) { - curAcc <- NULL - for(i in 1:n) { - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - phenoTest <- phenotype[folds[[i]]] - cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) - model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) - pred <- predict(model, test, type = "response") - curAcc <- c(curAcc, r2(phenoTest, pred)) - } - acc <- c(acc, mean(curAcc)) - } - names(acc) <- alpha - return(as.numeric(names(acc)[which.max(acc)])) -} - -###### meta-random forest - -searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) { - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - acc <- NULL - indexNtree <- 1 - for(ntree in rangeNtree) { - model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry) - pred <- predict(model, test) - acc <- c(acc, r2(phenoTest, pred)) - } - names(acc) <- rangeNtree - best <- which.max(acc) - return(as.numeric(names(acc)[best])) -} - -###### meta-SVM -searchParamSVM <- function(train, target, kernel="radial") { - # tuning parameters then train - model <- NULL - switch(kernel, - sigmoid={ - tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid"); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[2]]; - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, - linear={ - tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear"); - c <- tune$best.parameters[[1]]; - model <- svm(x=train, y=target, cost = c, kernel = "linear")}, - polynomial={ - tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial"); - d <- tune$best.parameters[[1]]; - g <- tune$best.parameters[[2]]; - coef <- tune$best.parameters[[3]]; - c <- tune$best.parameters[[4]]; - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, - { - tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial"); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[2]]; - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} - ) - return(model) -} - -#################### upper level functions ##################### - -aggregateDT <- function(classifiers, target=NULL, prediction=F, model=NULL, out) { - if(!prediction) { - treeParam <- tuneTree(classifiers, target) - data <- data.frame(classifiers, target) - model <- rpart(target ~., data=data, method = "anova", control = treeParam) - model <- prune(model, cp=treeParam["cp"]) - out <- paste(out, ".rds", sep = "") - saveRDS(model, out) - return(out) - } else { - return(predict(model, classifiers), out) - } -} - -aggregateGeneticMean <- function(classifiers, target=NULL, prediction=F, model=NULL, out){ - if(!prediction) { - opt <- optimizeWeight(values = classifiers, trueValue = target) - out <- paste(out, ".rds", sep = "") - saveRDS(opt@solution, out) - return(out) - } else { - return(weightedPrediction.vec(classifiers, model)) - } -} - -aggregateLASSO <- function(classifiers, target=NULL, prediction=F, model=NULL, alpha=NULL, out) { - if(!prediction) { - alpha <- searchParamLASSO(classifiers, target) - cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha) - model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se) - out <- paste(out, ".rds", sep = "") - saveRDS(model, out) - return(out) - } else { - return(predict(model, classifiers), out) - } -} - -aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=F, out) { - if(!prediction) { - ntree <- searchParamRF(genotype = classifiers, phenotype = target, - rangeNtree = seq(100, 1000, 100)) - model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers)) - out <- paste(out, ".rds", sep = "") - saveRDS(model, out) - return(out) - } else { - return(predict(model, classifiers), out) - } -} - -aggregateSVM <- function(classifiers, target=NULL, prediction=F, - model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) { - if(!prediction) { - model <- searchParamSVM(train = classifiers, target = target, kernel = kernel) - out <- paste(out, ".rds", sep = "") - saveRDS(model, out) - return(out) - } else { - return(predict(model, classifiers), out) - } -} - -################################### main ############################# -# # load argument -cmd <- commandArgs(T) -source(cmd[1]) -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# check for model -if(model == "None") { - model <- NULL - prediction <- F -} else { - prediction <- T - con = file(model) - model <- readLines(con = con, n = 1, ok=T) - close(con) - model <- readRDS(model) -} -# load classifiers and phenotype -classifiers <- NULL -classifNames <- NULL -if(lassoPred !="None"){ - classifiers <- c(classifiers, lassoPred) - classifNames <- c(classifNames, "lasso") -} -if(rrBLUPPred !="None"){ - classifiers <- c(classifiers, rrBLUPPred) - classifNames <- c(classifNames, "rrBLUP") -} -if(rfPred !="None"){ - classifiers <- c(classifiers, rfPred) - classifNames <- c(classifNames, "rf") -} -if(svmPred !="None"){ - classifiers <- c(classifiers, svmPred) - classifNames <- c(classifNames, "svm") -} -classifPrediction <- NULL -for(classif in classifiers) { - classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", h=T))) -} -classifPrediction <- data.frame(classifPrediction) -colnames(classifPrediction) <- classifNames -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# aggregate ! -switch(method, - geneticMean={ - res <- aggregateGeneticMean(classifiers = classifPrediction, target = phenotype, - out = out, prediction = prediction, model=model) - }, - dt={ - res <- aggregateDT(classifiers = classifPrediction, target = phenotype, - out = out, prediction = prediction, model=model) - }, - lasso={ - res <- aggregateLASSO(classifiers = data.matrix(classifPrediction), target = phenotype, - out = out, prediction = prediction, model=model) - }, - rf={ - res <- aggregateRF(classifiers = classifPrediction, target = phenotype, - out = out, prediction = prediction, model=model) - }, - # svm - {res <- aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel, - out = out, prediction = prediction, model = model)} - ) -if(prediction) { - write.table(data.frame(lines=rownames(classifPrediction), res), paste(out,".csv", sep=""), - sep="\t", row.names = F) -} else { - cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) -} \ No newline at end of file
--- a/aggregation.xml Wed Oct 26 18:22:03 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -<tool id="aggreg" name="aggregation" version="1.0.0"> - <description>predict phenotype by combining multiple classifiers</description> - <command interpreter="Rscript"> - aggregation.R $config > ${output1} - </command> - - <inputs> - <param name="lassoPred" type="data" optional="true" - label="lasso prediction" help="path to rds containing LASSO prediction" - /> - - <param name="rfPred" type="data" optional="true" - label="rf prediction" help="path to rds containing Random Forest prediction" - /> - - <param name="rrBLUPPred" type="data" optional="true" - label="rrBLUP prediction" help="path to rds containing rrBLUP prediction" - /> - - <param name="svmPred" type="data" optional="true" - label="SVM prediction" help="path to rds containing SVM prediction" - /> - - <param name="phenotype" type="data" - label="phenotype data" help=" a tabular datatype containing the phenotypes " - /> - - <param name="eval" type="integer" value="0" - label="do evaluation" help=" whether to produce a model or to use folds to evaluate the tool. 1 means the tool will be evaluate (and a folds argument is required) any other value produces a model " - /> - - <param name="folds" type="data" optional="true" - label="folds" help=" OPTIONAL ARGUMENT path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. " - /> - - <param name="model" type="data" optional="true" - label="model" help= " a path to a file where the results (depending on the chosen mode) will be writen" - /> - <param name="out" type="text" - label="output path" help= " a path to a rds file" - /> - <param name="method" type="text" value="svm" - label="aggregation method" help= "choose among geneticMean, rrBLUP, lasso, rf or svm" - /> - <param name="kernel" type="text" value="linear" - label="kernel for SVM" help= "choose among linear, polynomial, radial, sigmoid" - /> - - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${lassoPred}" -> lassoPred -"${rfPred}" -> rfPred -"${rrBLUPPred}" -> rrBLUPPred -"${svmPred}" -> svmPred -"${phenotype}" -> phenotype -"${model}" -> model -"${out}" -> out -"${eval}" -> evaluation -"${folds}" -> folds -"${method}" -> method -"${kernel}" -> kernel -"${eval}" -> doEvaluation - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="aggregation output" /> -</outputs> - - <help> - - </help> - </tool> \ No newline at end of file
