comparison lasso.R @ 35:2e66da6efc41 draft

Uploaded
author nicolas
date Tue, 25 Oct 2016 14:40:32 -0400
parents dcc10adbe46b
children
comparison
equal deleted inserted replaced
34:b57cf5ccc108 35:2e66da6efc41
4 # last modification : 01/09/16 4 # last modification : 01/09/16
5 # author : Dr Nicolas Beaume 5 # author : Dr Nicolas Beaume
6 # owner : IRRI 6 # owner : IRRI
7 # 7 #
8 ######################################################## 8 ########################################################
9 log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt")
10 sink(file = log, type="message")
11 9
12 library(glmnet) 10 suppressWarnings(suppressMessages(library(glmnet)))
13 library(methods) 11 library(methods)
14 ############################ helper functions ####################### 12 ############################ helper functions #######################
15 13
16 createFolds <- function(nbObs, n) {
17 index <- sample(1:n, size=nbObs, replace = T)
18 folds <- NULL
19 for(i in 1:n) {
20 folds <- c(folds, list(which(index==i)))
21 }
22 return(folds)
23 }
24 14
25 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) { 15 # optimize alpha parameter
16 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) {
26 acc <- NULL 17 acc <- NULL
27 indexAlpha <- 1 18 indexAlpha <- 1
28 for(a in alpha) { 19 for(a in alpha) {
29 curAcc <- NULL 20 curAcc <- NULL
30 for(i in 1:nfolds) { 21 # repeat nfolds time each analysis
22 for(i in 1:repet) {
23 # draw at random 1/3 of the training set for testing and thus choose alpha
24 # note it is not a cross-validation
31 n <- ceiling(nrow(genotype)/3) 25 n <- ceiling(nrow(genotype)/3)
32 indexTest <- sample(1:nrow(genotype), size=n) 26 indexTest <- sample(1:nrow(genotype), size=n)
27 # create training set and test set
33 train <- genotype[-indexTest,] 28 train <- genotype[-indexTest,]
34 test <- genotype[indexTest,] 29 test <- genotype[indexTest,]
35 phenoTrain <- phenotype[-indexTest] 30 phenoTrain <- phenotype[-indexTest]
36 phenoTest <- phenotype[indexTest] 31 phenoTest <- phenotype[indexTest]
32 # cv.glmnet allow to compute lambda at the current alpha
37 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) 33 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a)
34 # create model
38 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) 35 model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se)
36 # predict test set
39 pred <- predict(model, test, type = "response") 37 pred <- predict(model, test, type = "response")
38 # compute r2 for choosing the best alpha
40 curAcc <- c(curAcc, r2(phenoTest, pred)) 39 curAcc <- c(curAcc, r2(phenoTest, pred))
41 } 40 }
41 # add mean r2 for this value of lambda to the accuracy vector
42 acc <- c(acc, mean(curAcc)) 42 acc <- c(acc, mean(curAcc))
43 } 43 }
44 # choose best alpha
44 names(acc) <- alpha 45 names(acc) <- alpha
45 return(as.numeric(names(acc)[which.max(acc)])) 46 return(as.numeric(names(acc)[which.max(acc)]))
46 } 47 }
47 48
49 # compute r2 by computing the classic formula
50 # compare the sum of square difference from target to prediciton
51 # to the sum of square difference from target to the mean of the target
48 r2 <- function(target, prediction) { 52 r2 <- function(target, prediction) {
49 sst <- sum((target-mean(target))^2) 53 sst <- sum((target-mean(target))^2)
50 ssr <- sum((target-prediction)^2) 54 ssr <- sum((target-prediction)^2)
51 return(1-ssr/sst) 55 return(1-ssr/sst)
52 } 56 }
53 ################################## main function ########################### 57 ################################## main function ###########################
54 58
55 lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { 59 lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) {
56 # go for optimization 60 # go for optimization
57 if(is.null(alpha)) { 61 if(is.null(alpha)) {
58 alpha <- seq(0,1,0.1) 62 alpha <- seq(0,1,0.1)
59 alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) 63 alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha)
60 } 64 }
61 # evaluation 65 # evaluation
62 if(evaluation) { 66 if(evaluation) {
63 prediction <- NULL 67 prediction <- NULL
68 # do cross-validation
64 for(i in 1:length(folds)) { 69 for(i in 1:length(folds)) {
70 # create training and test set
65 train <- genotype[-folds[[i]],] 71 train <- genotype[-folds[[i]],]
66 test <- genotype[folds[[i]],] 72 test <- genotype[folds[[i]],]
67 phenoTrain <- phenotype[-folds[[i]]] 73 phenoTrain <- phenotype[-folds[[i]]]
68 phenoTest <- phenotype[folds[[i]]] 74 phenoTest <- phenotype[folds[[i]]]
75 # cv.glmnet helps to compute the right lambda for a chosen alpha
69 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) 76 cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha)
77 # create model
70 lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) 78 lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se)
79 # predict value of the test set for further evaluation
71 prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) 80 prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1]))
72 } 81 }
82 # save predicted value for test set of each fold for further evaluation
73 saveRDS(prediction, file=paste(outFile,".rds", sep="")) 83 saveRDS(prediction, file=paste(outFile,".rds", sep=""))
74 # just create a model 84 # just create a model
75 } else { 85 } else {
86 # cv.glmnet helps to compute the right lambda for a chosen alpha
76 cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) 87 cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha)
88 # create model
77 model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) 89 model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se)
90 # save model
78 saveRDS(model, file = paste(outFile, ".rds", sep = "")) 91 saveRDS(model, file = paste(outFile, ".rds", sep = ""))
79 } 92 }
80 } 93 }
81 94
82 ############################ main ############################# 95 ############################ main #############################
83 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : 96 # load argument
84 # lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file
85 ## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype).
86 # please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used)
87
88 ## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R).
89 # please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used)
90
91 ## -e : do evaluation instead of producing a model
92
93 ## -f : index of the folds to which belong each individual
94 # please note that the list must be called "folds" (automatic if folds.R was used)
95
96 ## -o : path to the output folder and generic name of the analysis. The extension related to each type of
97 # files are automatically added
98
99 cmd <- commandArgs(T) 97 cmd <- commandArgs(T)
100 source(cmd[1]) 98 source(cmd[1])
101 # check if evaluation is required 99 # check if evaluation is required
102 evaluation <- F 100 evaluation <- F
103 if(as.integer(doEvaluation) == 1) { 101 if(as.integer(doEvaluation) == 1) {
106 folds <- readLines(con = con, n = 1, ok=T) 104 folds <- readLines(con = con, n = 1, ok=T)
107 close(con) 105 close(con)
108 folds <- readRDS(folds) 106 folds <- readRDS(folds)
109 } 107 }
110 # load classifier parameters 108 # load classifier parameters
111 if(as.numeric(alpha) == -1) {alpha <- NULL} 109 alpha <- as.numeric(alpha)
110 if(alpha < 0 | alpha > 1) {alpha <- NULL}
112 # load genotype and phenotype 111 # load genotype and phenotype
113 con = file(genotype) 112 con = file(genotype)
114 genotype <- readLines(con = con, n = 1, ok=T) 113 genotype <- readLines(con = con, n = 1, ok=T)
115 close(con) 114 close(con)
116 genotype <- read.table(genotype, sep="\t", h=T) 115 genotype <- read.table(genotype, sep="\t", h=T)
117 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve 116 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve
118 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] 117 phenotype <- read.table(phenotype, sep="\t", h=T)[,1]
119 # run ! 118 # run !
120 lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype, 119 lasso(genotype = data.matrix(genotype), phenotype = phenotype,
121 evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) 120 evaluation = evaluation, outFile = out, folds = folds, alpha = alpha)
121 # return path of the result file to galaxy
122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) 122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep=""))