########################################################
#
# creation date : 25/10/16
# last modification : 25/10/16
# author : Dr Nicolas Beaume
#
########################################################

suppressWarnings(suppressMessages(library(GA)))
library(miscTools)
library(rpart)
suppressWarnings(suppressMessages(library(randomForest)))
library(e1071)
suppressWarnings(suppressMessages(library(glmnet)))
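
# r2() is called throughout this script but is not exported by any of the
# packages loaded above. A minimal definition, assuming it is meant to wrap
# miscTools::rSquared (R squared computed from observed values and residuals):
r2 <- function(observed, predicted) {
  rSquared(observed, resid = observed - predicted)
}
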
############################ helper functions #######################

##### Genetic algorithm
# optimize the weights combining the predictions for a single individual
optimizeOneIndividual <- function(values, trueValue) {
  # turn the weighted mean of the predictions into a function of the weights
  f <- function(w) {sum(values * w/sum(w))}
  # fitness: the closer the weighted mean is to the true value, the better
  fitness <- function(x) {1/abs(trueValue-f(x))}
  # lower/upper replace the deprecated min/max arguments of ga()
  resp <- ga(type = "real-valued", fitness = fitness, lower = rep(0, length(values)),
             upper = rep(1, length(values)), maxiter = 1000, monitor = NULL, keepBest = TRUE)
  # normalize the weights so they sum to 1
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

optimizeWeight <- function(values, trueValue, n=1000) {
  # fitness: mean R2 between the true values and the weighted predictions
  fitnessAll <- function(w) {
    predicted <- apply(values, 1, weightedPrediction.vec, w)
    return(mean(r2(trueValue, predicted)))
    #return(mean(1/abs(trueValue-predicted)))
  }
  # lower/upper replace the deprecated min/max arguments of ga()
  resp <- ga(type = "real-valued", fitness = fitnessAll, lower = rep(0, ncol(values)),
             upper = rep(1, ncol(values)), maxiter = n, monitor = NULL, keepBest = TRUE)
  # normalize the weights so they sum to 1
  resp@solution <- resp@solution/sum(resp@solution)
  return(resp)
}

weightedPrediction <- function(classifiers, w) {
  if(length(w) > ncol(classifiers)) {
    warning("more weights than classifiers, extra weights are ignored")
    w <- w[1:ncol(classifiers)]
  } else if(length(w) < ncol(classifiers)) {
    warning("fewer weights than classifiers, extra classifiers are ignored")
    classifiers <- classifiers[,1:length(w)]
  }
  prediction <- apply(classifiers, 1, weightedPrediction.vec, w)
  return(prediction)
}

# weighted mean of one row of predictions; weights are normalized to sum to 1
weightedPrediction.vec <- function(values, w) {
  return(sum(values * w/sum(w)))
}
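
# Usage sketch (not executed): combining three hypothetical classifiers with
# GA-optimized weights; `preds` and `truth` are simulated here for illustration.
if (FALSE) {
  preds <- matrix(runif(30), ncol = 3)
  truth <- rowMeans(preds) + rnorm(10, sd = 0.05)
  opt <- optimizeWeight(values = preds, trueValue = truth, n = 100)
  weightedPrediction(preds, as.vector(opt@solution))
}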

##### meta-decision tree

# grid search over minsplit/minbucket; returns the best rpart control options
tuneTree <- function(data, target) {
  data <- data.frame(data, target=target)
  size <- nrow(data)
  split <- 1:ceiling(size/5)
  leafSize <- 1:ceiling(size/10)
  # cross-validation error and complexity parameter for each grid point
  xerror <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  cp <- matrix(rep(-1, length(split)*length(leafSize)), ncol=length(leafSize))
  for(i in 1:length(split)) {
    for(j in 1:length(leafSize)) {
      op <- list(minsplit=split[i], minbucket=leafSize[j])
      tree <- rpart(target ~., data=data, control=op, method="anova")
      xerror[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"xerror"]
      cp[i,j] <- tree$cptable[which.min(tree$cptable[,"xerror"]),"CP"]
    }
  }
  # keep the first grid point that reaches the minimal cross-validation error
  index <- which(xerror==min(xerror), arr.ind = TRUE)[1,]
  op <- list(minsplit=split[index[1]], minbucket=leafSize[index[2]], cp=cp[index[1], index[2]])
  return(op)
}
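
# Usage sketch (not executed): the returned list plugs straight into rpart()
# as its control argument; `X` and `y` are hypothetical training data.
if (FALSE) {
  param <- tuneTree(X, y)
  tree <- rpart(target ~ ., data = data.frame(X, target = y),
                method = "anova", control = param)
}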

###### meta-LASSO
# create folds by assigning each row index to a fold at random; as sampling is
# done with replacement, folds are only approximately equal-sized
createFolds <- function(nbObs, n) {
  # pick a fold for each observation
  index <- sample(1:n, size=nbObs, replace = TRUE)
  # populate folds
  folds <- NULL
  for(i in 1:n) {
    folds <- c(folds, list(which(index==i)))
  }
  return(folds)
}
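
# Usage sketch (not executed): 10 observations split into 3 folds,
# returned as a list of row-index vectors.
if (FALSE) {
  folds <- createFolds(10, 3)
  str(folds)  # list of 3 integer vectors partitioning 1:10
}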

searchParamLASSO <- function(genotype, phenotype, alpha=seq(0,1,0.1), n=7) {
  folds <- createFolds(nrow(genotype), n = n)
  acc <- NULL
  for(a in alpha) {
    curAcc <- NULL
    for(i in 1:n) {
      train <- genotype[-folds[[i]],]
      test <- genotype[folds[[i]],]
      phenoTrain <- phenotype[-folds[[i]]]
      phenoTest <- phenotype[folds[[i]]]
      # choose lambda by cross-validation, then refit at lambda.1se
      cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
      model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
      # predict.glmnet expects the new data as a matrix passed through newx
      pred <- predict(model, newx = as.matrix(test), type = "response")
      curAcc <- c(curAcc, r2(phenoTest, as.vector(pred)))
    }
    acc <- c(acc, mean(curAcc))
  }
  names(acc) <- alpha
  # return the alpha with the best mean R2 across folds
  return(as.numeric(names(acc)[which.max(acc)]))
}

###### meta-random forest

# hold out a third of the data and pick the ntree value with the best R2
searchParamRF <- function(genotype, phenotype, rangeNtree, mtry=ncol(genotype)) {
  n <- ceiling(nrow(genotype)/3)
  indexTest <- sample(1:nrow(genotype), size=n)
  train <- genotype[-indexTest,]
  test <- genotype[indexTest,]
  phenoTrain <- phenotype[-indexTest]
  phenoTest <- phenotype[indexTest]
  acc <- NULL
  for(ntree in rangeNtree) {
    model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry = mtry)
    pred <- predict(model, test)
    acc <- c(acc, r2(phenoTest, pred))
  }
  names(acc) <- rangeNtree
  best <- which.max(acc)
  return(as.numeric(names(acc)[best]))
}

###### meta-SVM
# tune the kernel hyper-parameters by grid search, then train the final model;
# parameters are read from tune$best.parameters by name rather than by position
searchParamSVM <- function(train, target, kernel="radial") {
  model <- NULL
  switch(kernel,
         sigmoid={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), kernel="sigmoid")
           best <- tune$best.parameters
           model <- svm(x=train, y=target, gamma = best$gamma, cost = best$cost, kernel = "sigmoid")},
         linear={
           tune <- tune.svm(train, target, cost = 10^(0:2), kernel="linear")
           model <- svm(x=train, y=target, cost = tune$best.parameters$cost, kernel = "linear")},
         polynomial={
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:2), degree = 0:4, coef0 = 0:3, kernel="polynomial")
           best <- tune$best.parameters
           model <- svm(x=train, y=target, gamma = best$gamma, cost = best$cost, kernel = "polynomial",
                        degree = best$degree, coef0 = best$coef0)},
         # default: radial kernel
         {
           tune <- tune.svm(train, target, gamma = 10^(-6:-1), cost = 10^(0:3), kernel="radial")
           best <- tune$best.parameters
           model <- svm(x=train, y=target, gamma = best$gamma, cost = best$cost, kernel = "radial")}
  )
  return(model)
}
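
# Usage sketch (not executed): `X` and `y` are hypothetical training data;
# the returned object is a fitted e1071::svm model, ready for predict().
if (FALSE) {
  model <- searchParamSVM(X, y, kernel = "linear")
  predict(model, X)
}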

#################### upper level functions #####################

# train a decision tree on the classifier predictions (training mode)
# or apply a previously saved tree (prediction mode)
aggregateDT <- function(classifiers, target=NULL, prediction=FALSE, model=NULL, out) {
  if(!prediction) {
    treeParam <- tuneTree(classifiers, target)
    data <- data.frame(classifiers, target)
    model <- rpart(target ~., data=data, method = "anova", control = treeParam)
    # prune() expects a numeric cp, hence $ rather than [ subsetting
    model <- prune(model, cp=treeParam$cp)
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, data.frame(classifiers)), out)
  }
}

aggregateGeneticMean <- function(classifiers, target=NULL, prediction=FALSE, model=NULL, out){
  if(!prediction) {
    opt <- optimizeWeight(values = classifiers, trueValue = target)
    saveRDS(opt@solution, out)
    # evaluation of the method
  } else {
    # apply the saved weights row by row (weightedPrediction.vec would
    # collapse the whole prediction table into a single number)
    saveRDS(weightedPrediction(classifiers, as.vector(model)), out)
  }
}

aggregateLASSO <- function(classifiers, target=NULL, prediction=FALSE, model=NULL, alpha=NULL, out) {
  if(!prediction) {
    alpha <- searchParamLASSO(classifiers, target)
    cv <- cv.glmnet(x=as.matrix(classifiers), y=target, alpha=alpha)
    model <- glmnet(x=as.matrix(classifiers), y=target, alpha=alpha, lambda = cv$lambda.1se)
    saveRDS(model, out)
  } else {
    # predict.glmnet expects a matrix passed through newx
    saveRDS(predict(model, newx=as.matrix(classifiers)), out)
  }
}

aggregateRF <- function(classifiers, target=NULL, model=NULL, ntree=NULL, prediction=FALSE, out) {
  if(!prediction) {
    ntree <- searchParamRF(genotype = classifiers, phenotype = target,
                           rangeNtree = seq(100, 1000, 100))
    model <- randomForest(x=classifiers, y=target, ntree = ntree, mtry = ncol(classifiers))
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, classifiers), out)
  }
}

aggregateSVM <- function(classifiers, target=NULL, prediction=FALSE,
                         model=NULL, c=NULL, g=NULL, d=NULL, coef=NULL, kernel="radial", out) {
  if(!prediction) {
    model <- searchParamSVM(train = classifiers, target = target, kernel = kernel)
    saveRDS(model, out)
  } else {
    saveRDS(predict(model, classifiers), out)
  }
}

################################### main #############################
# load arguments: cmd[1] is an R file defining the tool parameters, sourced below
cmd <- commandArgs(trailingOnly = TRUE)
source(cmd[1])
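# Sketch of the sourced configuration file (hypothetical values; the variable
# names below are the ones this script actually reads):
if (FALSE) {
  doEvaluation <- 0          # 1 to load evaluation folds
  folds <- "folds.path"      # file whose first line is the path of a folds .rds
  model <- "None"            # "None" to train, else file pointing to a saved model
  lassoPred <- "lasso.tsv"   # per-classifier prediction tables, or "None"
  rrBLUPPred <- "None"
  rfPred <- "rf.tsv"
  svmPred <- "None"
  phenotype <- "pheno.tsv"   # tab-separated phenotype table
  method <- "geneticMean"    # geneticMean, dt, lasso, rf; anything else = svm
  kernel <- "radial"         # used by the svm method
  out <- "result"            # output prefix, ".rds" is appended
}
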
# check if evaluation is required
evaluation <- FALSE
if(as.integer(doEvaluation) == 1) {
  evaluation <- TRUE
  # the folds file contains, on its first line, the path of the folds .rds
  con <- file(folds)
  folds <- readLines(con = con, n = 1, ok=TRUE)
  close(con)
  folds <- readRDS(folds)
}
# check for a previously trained model: if one is given, switch to prediction mode
if(model == "None") {
  model <- NULL
  prediction <- FALSE
} else {
  prediction <- TRUE
  # the model file contains, on its first line, the path of the model .rds
  con <- file(model)
  model <- readLines(con = con, n = 1, ok=TRUE)
  close(con)
  model <- readRDS(model)
}
# load classifiers and phenotype
# collect the paths of the available classifier prediction files
classifiers <- NULL
classifNames <- NULL
if(lassoPred != "None"){
  classifiers <- c(classifiers, lassoPred)
  classifNames <- c(classifNames, "lasso")
}
if(rrBLUPPred != "None"){
  classifiers <- c(classifiers, rrBLUPPred)
  classifNames <- c(classifNames, "rrBLUP")
}
if(rfPred != "None"){
  classifiers <- c(classifiers, rfPred)
  classifNames <- c(classifNames, "rf")
}
if(svmPred != "None"){
  classifiers <- c(classifiers, svmPred)
  classifNames <- c(classifNames, "svm")
}
classifPrediction <- NULL
for(classif in classifiers) {
  classifPrediction <- c(classifPrediction, list(read.table(classif, sep="\t", header=TRUE)))
}
classifPrediction <- data.frame(classifPrediction)
colnames(classifPrediction) <- classifNames
# phenotype is stored as a one-column table but must be passed on as a vector
phenotype <- read.table(phenotype, sep="\t", header=TRUE)[,1]
out <- paste(out, ".rds", sep = "")
# aggregate!
switch(method,
       geneticMean={
         aggregateGeneticMean(classifiers = classifPrediction, target = phenotype,
                              out = out, prediction = prediction, model=model)
       },
       dt={
         aggregateDT(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       lasso={
         aggregateLASSO(classifiers = classifPrediction, target = phenotype,
                        out = out, prediction = prediction, model=model)
       },
       rf={
         aggregateRF(classifiers = classifPrediction, target = phenotype,
                     out = out, prediction = prediction, model=model)
       },
       # default: svm
       {aggregateSVM(classifiers = classifPrediction, target = phenotype, kernel = kernel,
                     out = out, prediction = prediction, model = model)}
)
# return the path of the result file to Galaxy
cat(paste(out, "\n", sep=""))
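
# Invocation sketch (hypothetical file names): the script is meant to be run as
#   Rscript aggregation.R config.R
# where config.R defines the variables sourced at the top of the main section.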