| 79 | 1 ######################################################## | 
|  | 2 # | 
|  | 3 # creation date : 08/01/16 | 
|  | 4 # last modification : 01/09/16 | 
|  | 5 # author : Dr Nicolas Beaume | 
|  | 6 # owner : IRRI | 
|  | 7 # | 
|  | 8 ######################################################## | 
|  | 9 | 
|  | 10 suppressWarnings(suppressMessages(library(glmnet))) | 
|  | 11 library(methods) | 
|  | 12 ############################ helper functions ####################### | 
|  | 13 | 
|  | 14 | 
|  | 15 # optimize alpha parameter | 
|  | 16 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) { | 
|  | 17   acc <- NULL | 
|  | 18   indexAlpha <- 1 | 
|  | 19   for(a in alpha) { | 
|  | 20     curAcc <- NULL | 
|  | 21     # repeat nfolds time each analysis | 
|  | 22     for(i in 1:repet) { | 
|  | 23       # draw at random 1/3 of the training set for testing and thus choose alpha | 
|  | 24       # note it is not a cross-validation | 
|  | 25       n <- ceiling(nrow(genotype)/3) | 
|  | 26       indexTest <- sample(1:nrow(genotype), size=n) | 
|  | 27       # create training set and test set | 
|  | 28       train <- genotype[-indexTest,] | 
|  | 29       test <- genotype[indexTest,] | 
|  | 30       phenoTrain <- phenotype[-indexTest] | 
|  | 31       phenoTest <- phenotype[indexTest] | 
|  | 32       # cv.glmnet allow to compute lambda at the current alpha | 
|  | 33       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) | 
|  | 34       # create model | 
|  | 35       model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) | 
|  | 36       # predict test set | 
|  | 37       pred <- predict(model, test, type = "response") | 
|  | 38       # compute r2 for choosing the best alpha | 
|  | 39       curAcc <- c(curAcc, r2(phenoTest, pred)) | 
|  | 40     } | 
|  | 41     # add mean r2 for this value of lambda to the accuracy vector | 
|  | 42     acc <- c(acc, mean(curAcc)) | 
|  | 43   } | 
|  | 44   # choose best alpha | 
|  | 45   names(acc) <- alpha | 
|  | 46   return(as.numeric(names(acc)[which.max(acc)])) | 
|  | 47 } | 
|  | 48 | 
|  | 49 # compute r2 by computing the classic formula | 
|  | 50 # compare the sum of square difference from target to prediciton | 
|  | 51 # to the sum of square difference from target to the mean of the target | 
|  | 52 r2 <- function(target, prediction) { | 
|  | 53   sst <- sum((target-mean(target))^2) | 
|  | 54   ssr <- sum((target-prediction)^2) | 
|  | 55   return(1-ssr/sst) | 
|  | 56 } | 
|  | 57 ################################## main function ########################### | 
|  | 58 | 
|  | 59 lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { | 
|  | 60   # go for optimization | 
|  | 61   if(is.null(alpha)) { | 
|  | 62     alpha <- seq(0,1,0.1) | 
|  | 63     alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) | 
|  | 64   } | 
|  | 65   # evaluation | 
|  | 66   if(evaluation) { | 
|  | 67     prediction <- NULL | 
|  | 68     # do cross-validation | 
|  | 69     for(i in 1:length(folds)) { | 
|  | 70       # create training and test set | 
|  | 71       train <- genotype[-folds[[i]],] | 
|  | 72       test <- genotype[folds[[i]],] | 
|  | 73       phenoTrain <- phenotype[-folds[[i]]] | 
|  | 74       phenoTest <- phenotype[folds[[i]]] | 
|  | 75       # cv.glmnet helps to compute the right lambda for a chosen alpha | 
|  | 76       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) | 
|  | 77       # create model | 
|  | 78       lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) | 
|  | 79       # predict value of the test set for further evaluation | 
|  | 80       prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) | 
|  | 81     } | 
|  | 82     # save predicted value for test set of each fold for further evaluation | 
|  | 83     saveRDS(prediction, file=paste(outFile,".rds", sep="")) | 
|  | 84     # just create a model | 
|  | 85   } else { | 
|  | 86     # cv.glmnet helps to compute the right lambda for a chosen alpha | 
|  | 87     cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) | 
|  | 88     # create model | 
|  | 89     model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) | 
|  | 90     # save model | 
|  | 91     saveRDS(model, file = paste(outFile, ".rds", sep = "")) | 
|  | 92   } | 
|  | 93 } | 
|  | 94 | 
|  | 95 ############################ main ############################# | 
|  | 96 # load argument | 
|  | 97 cmd <- commandArgs(T) | 
|  | 98 source(cmd[1]) | 
|  | 99 # check if evaluation is required | 
|  | 100 evaluation <- F | 
|  | 101 if(as.integer(doEvaluation) == 1) { | 
|  | 102   evaluation <- T | 
|  | 103   con = file(folds) | 
|  | 104   folds <- readLines(con = con, n = 1, ok=T) | 
|  | 105   close(con) | 
|  | 106   folds <- readRDS(folds) | 
|  | 107 } | 
|  | 108 # load classifier parameters | 
|  | 109 alpha <- as.numeric(alpha) | 
|  | 110 if(alpha < 0 | alpha > 1) {alpha <- NULL} | 
|  | 111 # load genotype and phenotype | 
|  | 112 con = file(genotype) | 
|  | 113 genotype <- readLines(con = con, n = 1, ok=T) | 
|  | 114 close(con) | 
|  | 115 genotype <- read.table(genotype, sep="\t", h=T) | 
|  | 116 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | 
|  | 117 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | 
|  | 118 # run ! | 
|  | 119 lasso(genotype = data.matrix(genotype), phenotype = phenotype, | 
|  | 120                evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) | 
|  | 121 # return path of the result file to galaxy | 
|  | 122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |