Mercurial > repos > nicolas > oghma
comparison lasso.R @ 35:2e66da6efc41 draft
Uploaded
| author | nicolas |
|---|---|
| date | Tue, 25 Oct 2016 14:40:32 -0400 |
| parents | dcc10adbe46b |
| children |
comparison
equal
deleted
inserted
replaced
| 34:b57cf5ccc108 | 35:2e66da6efc41 |
|---|---|
| 4 # last modification : 01/09/16 | 4 # last modification : 01/09/16 |
| 5 # author : Dr Nicolas Beaume | 5 # author : Dr Nicolas Beaume |
| 6 # owner : IRRI | 6 # owner : IRRI |
| 7 # | 7 # |
| 8 ######################################################## | 8 ######################################################## |
| 9 log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt") | |
| 10 sink(file = log, type="message") | |
| 11 | 9 |
| 12 library(glmnet) | 10 suppressWarnings(suppressMessages(library(glmnet))) |
| 13 library(methods) | 11 library(methods) |
| 14 ############################ helper functions ####################### | 12 ############################ helper functions ####################### |
| 15 | 13 |
| 16 createFolds <- function(nbObs, n) { | |
| 17 index <- sample(1:n, size=nbObs, replace = T) | |
| 18 folds <- NULL | |
| 19 for(i in 1:n) { | |
| 20 folds <- c(folds, list(which(index==i))) | |
| 21 } | |
| 22 return(folds) | |
| 23 } | |
| 24 | 14 |
| 25 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) { | 15 # optimize alpha parameter |
| 16 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) { | |
| 26 acc <- NULL | 17 acc <- NULL |
| 27 indexAlpha <- 1 | 18 indexAlpha <- 1 |
| 28 for(a in alpha) { | 19 for(a in alpha) { |
| 29 curAcc <- NULL | 20 curAcc <- NULL |
| 30 for(i in 1:nfolds) { | 21 # repeat nfolds time each analysis |
| 22 for(i in 1:repet) { | |
| 23 # draw at random 1/3 of the training set for testing and thus choose alpha | |
| 24 # note it is not a cross-validation | |
| 31 n <- ceiling(nrow(genotype)/3) | 25 n <- ceiling(nrow(genotype)/3) |
| 32 indexTest <- sample(1:nrow(genotype), size=n) | 26 indexTest <- sample(1:nrow(genotype), size=n) |
| 27 # create training set and test set | |
| 33 train <- genotype[-indexTest,] | 28 train <- genotype[-indexTest,] |
| 34 test <- genotype[indexTest,] | 29 test <- genotype[indexTest,] |
| 35 phenoTrain <- phenotype[-indexTest] | 30 phenoTrain <- phenotype[-indexTest] |
| 36 phenoTest <- phenotype[indexTest] | 31 phenoTest <- phenotype[indexTest] |
| 32 # cv.glmnet allow to compute lambda at the current alpha | |
| 37 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) | 33 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) |
| 34 # create model | |
| 38 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) | 35 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) |
| 36 # predict test set | |
| 39 pred <- predict(model, test, type = "response") | 37 pred <- predict(model, test, type = "response") |
| 38 # compute r2 for choosing the best alpha | |
| 40 curAcc <- c(curAcc, r2(phenoTest, pred)) | 39 curAcc <- c(curAcc, r2(phenoTest, pred)) |
| 41 } | 40 } |
| 41 # add mean r2 for this value of lambda to the accuracy vector | |
| 42 acc <- c(acc, mean(curAcc)) | 42 acc <- c(acc, mean(curAcc)) |
| 43 } | 43 } |
| 44 # choose best alpha | |
| 44 names(acc) <- alpha | 45 names(acc) <- alpha |
| 45 return(as.numeric(names(acc)[which.max(acc)])) | 46 return(as.numeric(names(acc)[which.max(acc)])) |
| 46 } | 47 } |
| 47 | 48 |
| 49 # compute r2 by computing the classic formula | |
| 50 # compare the sum of square difference from target to prediciton | |
| 51 # to the sum of square difference from target to the mean of the target | |
| 48 r2 <- function(target, prediction) { | 52 r2 <- function(target, prediction) { |
| 49 sst <- sum((target-mean(target))^2) | 53 sst <- sum((target-mean(target))^2) |
| 50 ssr <- sum((target-prediction)^2) | 54 ssr <- sum((target-prediction)^2) |
| 51 return(1-ssr/sst) | 55 return(1-ssr/sst) |
| 52 } | 56 } |
| 53 ################################## main function ########################### | 57 ################################## main function ########################### |
| 54 | 58 |
| 55 lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { | 59 lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { |
| 56 # go for optimization | 60 # go for optimization |
| 57 if(is.null(alpha)) { | 61 if(is.null(alpha)) { |
| 58 alpha <- seq(0,1,0.1) | 62 alpha <- seq(0,1,0.1) |
| 59 alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) | 63 alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) |
| 60 } | 64 } |
| 61 # evaluation | 65 # evaluation |
| 62 if(evaluation) { | 66 if(evaluation) { |
| 63 prediction <- NULL | 67 prediction <- NULL |
| 68 # do cross-validation | |
| 64 for(i in 1:length(folds)) { | 69 for(i in 1:length(folds)) { |
| 70 # create training and test set | |
| 65 train <- genotype[-folds[[i]],] | 71 train <- genotype[-folds[[i]],] |
| 66 test <- genotype[folds[[i]],] | 72 test <- genotype[folds[[i]],] |
| 67 phenoTrain <- phenotype[-folds[[i]]] | 73 phenoTrain <- phenotype[-folds[[i]]] |
| 68 phenoTest <- phenotype[folds[[i]]] | 74 phenoTest <- phenotype[folds[[i]]] |
| 75 # cv.glmnet helps to compute the right lambda for a chosen alpha | |
| 69 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) | 76 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) |
| 77 # create model | |
| 70 lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) | 78 lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) |
| 79 # predict value of the test set for further evaluation | |
| 71 prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) | 80 prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) |
| 72 } | 81 } |
| 82 # save predicted value for test set of each fold for further evaluation | |
| 73 saveRDS(prediction, file=paste(outFile,".rds", sep="")) | 83 saveRDS(prediction, file=paste(outFile,".rds", sep="")) |
| 74 # just create a model | 84 # just create a model |
| 75 } else { | 85 } else { |
| 86 # cv.glmnet helps to compute the right lambda for a chosen alpha | |
| 76 cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) | 87 cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) |
| 88 # create model | |
| 77 model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) | 89 model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) |
| 90 # save model | |
| 78 saveRDS(model, file = paste(outFile, ".rds", sep = "")) | 91 saveRDS(model, file = paste(outFile, ".rds", sep = "")) |
| 79 } | 92 } |
| 80 } | 93 } |
| 81 | 94 |
| 82 ############################ main ############################# | 95 ############################ main ############################# |
| 83 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : | 96 # load argument |
| 84 # lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file | |
| 85 ## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype). | |
| 86 # please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used) | |
| 87 | |
| 88 ## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R). | |
| 89 # please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) | |
| 90 | |
| 91 ## -e : do evaluation instead of producing a model | |
| 92 | |
| 93 ## -f : index of the folds to which belong each individual | |
| 94 # please note that the list must be called "folds" (automatic if folds.R was used) | |
| 95 | |
| 96 ## -o : path to the output folder and generic name of the analysis. The extension related to each type of | |
| 97 # files are automatically added | |
| 98 | |
| 99 cmd <- commandArgs(T) | 97 cmd <- commandArgs(T) |
| 100 source(cmd[1]) | 98 source(cmd[1]) |
| 101 # check if evaluation is required | 99 # check if evaluation is required |
| 102 evaluation <- F | 100 evaluation <- F |
| 103 if(as.integer(doEvaluation) == 1) { | 101 if(as.integer(doEvaluation) == 1) { |
| 106 folds <- readLines(con = con, n = 1, ok=T) | 104 folds <- readLines(con = con, n = 1, ok=T) |
| 107 close(con) | 105 close(con) |
| 108 folds <- readRDS(folds) | 106 folds <- readRDS(folds) |
| 109 } | 107 } |
| 110 # load classifier parameters | 108 # load classifier parameters |
| 111 if(as.numeric(alpha) == -1) {alpha <- NULL} | 109 alpha <- as.numeric(alpha) |
| 110 if(alpha < 0 | alpha > 1) {alpha <- NULL} | |
| 112 # load genotype and phenotype | 111 # load genotype and phenotype |
| 113 con = file(genotype) | 112 con = file(genotype) |
| 114 genotype <- readLines(con = con, n = 1, ok=T) | 113 genotype <- readLines(con = con, n = 1, ok=T) |
| 115 close(con) | 114 close(con) |
| 116 genotype <- read.table(genotype, sep="\t", h=T) | 115 genotype <- read.table(genotype, sep="\t", h=T) |
| 117 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | 116 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve |
| 118 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | 117 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] |
| 119 # run ! | 118 # run ! |
| 120 lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype, | 119 lasso(genotype = data.matrix(genotype), phenotype = phenotype, |
| 121 evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) | 120 evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) |
| 121 # return path of the result file to galaxy | |
| 122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) | 122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |
