Mercurial > repos > nicolas > oghma
changeset 35:2e66da6efc41 draft
Uploaded
| author | nicolas | 
|---|---|
| date | Tue, 25 Oct 2016 14:40:32 -0400 | 
| parents | b57cf5ccc108 | 
| children | d961f726b619 | 
| files | lasso.R | 
| diffstat | 1 files changed, 32 insertions(+), 32 deletions(-) [+] | 
line wrap: on
 line diff
--- a/lasso.R Tue Oct 25 14:40:16 2016 -0400 +++ b/lasso.R Tue Oct 25 14:40:32 2016 -0400 @@ -6,45 +6,49 @@ # owner : IRRI # ######################################################## -log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt") -sink(file = log, type="message") -library(glmnet) +suppressWarnings(suppressMessages(library(glmnet))) library(methods) ############################ helper functions ####################### -createFolds <- function(nbObs, n) { - index <- sample(1:n, size=nbObs, replace = T) - folds <- NULL - for(i in 1:n) { - folds <- c(folds, list(which(index==i))) - } - return(folds) -} -optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) { +# optimize alpha parameter +optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) { acc <- NULL indexAlpha <- 1 for(a in alpha) { curAcc <- NULL - for(i in 1:nfolds) { + # repeat nfolds time each analysis + for(i in 1:repet) { + # draw at random 1/3 of the training set for testing and thus choose alpha + # note it is not a cross-validation n <- ceiling(nrow(genotype)/3) indexTest <- sample(1:nrow(genotype), size=n) + # create training set and test set train <- genotype[-indexTest,] test <- genotype[indexTest,] phenoTrain <- phenotype[-indexTest] phenoTest <- phenotype[indexTest] + # cv.glmnet allow to compute lambda at the current alpha cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) + # create model model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) + # predict test set pred <- predict(model, test, type = "response") + # compute r2 for choosing the best alpha curAcc <- c(curAcc, r2(phenoTest, pred)) } + # add mean r2 for this value of lambda to the accuracy vector acc <- c(acc, mean(curAcc)) } + # choose best alpha names(acc) <- alpha return(as.numeric(names(acc)[which.max(acc)])) } +# compute r2 by computing the classic formula +# compare the sum of square difference from target to prediciton +# to the sum of square difference from target to the mean of the target r2 <- function(target, prediction) { sst <- sum((target-mean(target))^2) ssr <- sum((target-prediction)^2) @@ -52,7 +56,7 @@ } ################################## main function ########################### -lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { +lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { # go for optimization if(is.null(alpha)) { alpha <- seq(0,1,0.1) @@ -61,41 +65,35 @@ # evaluation if(evaluation) { prediction <- NULL + # do cross-validation for(i in 1:length(folds)) { + # create training and test set train <- genotype[-folds[[i]],] test <- genotype[folds[[i]],] phenoTrain <- phenotype[-folds[[i]]] phenoTest <- phenotype[folds[[i]]] + # cv.glmnet helps to compute the right lambda for a chosen alpha cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) + # create model lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) + # predict value of the test set for further evaluation prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) } + # save predicted value for test set of each fold for further evaluation saveRDS(prediction, file=paste(outFile,".rds", sep="")) # just create a model } else { + # cv.glmnet helps to compute the right lambda for a chosen alpha cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) + # create model model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) + # save model saveRDS(model, file = paste(outFile, ".rds", sep = "")) } } ############################ main ############################# -# running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : -# lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file -## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype). -# please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used) - -## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R). -# please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) - -## -e : do evaluation instead of producing a model - -## -f : index of the folds to which belong each individual -# please note that the list must be called "folds" (automatic if folds.R was used) - -## -o : path to the output folder and generic name of the analysis. The extension related to each type of -# files are automatically added - +# load argument cmd <- commandArgs(T) source(cmd[1]) # check if evaluation is required @@ -108,7 +106,8 @@ folds <- readRDS(folds) } # load classifier parameters -if(as.numeric(alpha) == -1) {alpha <- NULL} +alpha <- as.numeric(alpha) +if(alpha < 0 | alpha > 1) {alpha <- NULL} # load genotype and phenotype con = file(genotype) genotype <- readLines(con = con, n = 1, ok=T) @@ -117,6 +116,7 @@ # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve phenotype <- read.table(phenotype, sep="\t", h=T)[,1] # run ! -lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype, +lasso(genotype = data.matrix(genotype), phenotype = phenotype, evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) +# return path of the result file to galaxy cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))
