| 9 | 1 ######################################################## | 
|  | 2 # | 
|  | 3 # creation date : 08/01/16 | 
|  | 4 # last modification : 01/09/16 | 
|  | 5 # author : Dr Nicolas Beaume | 
|  | 6 # owner : IRRI | 
|  | 7 # | 
|  | 8 ######################################################## | 
|  | 9 log <- file(paste(getwd(), "log_LASSO.txt", sep="/"), open = "wt") | 
|  | 10 sink(file = log, type="message") | 
|  | 11 | 
|  | 12 library(glmnet) | 
|  | 13 library(methods) | 
|  | 14 ############################ helper functions ####################### | 
|  | 15 | 
|  | 16 createFolds <- function(nbObs, n) { | 
|  | 17   index <- sample(1:n, size=nbObs, replace = T) | 
|  | 18   folds <- NULL | 
|  | 19   for(i in 1:n) { | 
|  | 20     folds <- c(folds, list(which(index==i))) | 
|  | 21   } | 
|  | 22   return(folds) | 
|  | 23 } | 
|  | 24 | 
|  | 25 optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), nfolds=7) { | 
|  | 26   acc <- NULL | 
|  | 27   indexAlpha <- 1 | 
|  | 28   for(a in alpha) { | 
|  | 29     curAcc <- NULL | 
|  | 30     for(i in 1:nfolds) { | 
|  | 31       n <- ceiling(nrow(genotype)/3) | 
|  | 32       indexTest <- sample(1:nrow(genotype), size=n) | 
|  | 33       train <- genotype[-indexTest,] | 
|  | 34       test <- genotype[indexTest,] | 
|  | 35       phenoTrain <- phenotype[-indexTest] | 
|  | 36       phenoTest <- phenotype[indexTest] | 
|  | 37       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) | 
|  | 38       model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) | 
|  | 39       pred <- predict(model, test, type = "response") | 
|  | 40       curAcc <- c(curAcc, r2(phenoTest, pred)) | 
|  | 41     } | 
|  | 42     acc <- c(acc, mean(curAcc)) | 
|  | 43   } | 
|  | 44   names(acc) <- alpha | 
|  | 45   return(as.numeric(names(acc)[which.max(acc)])) | 
|  | 46 } | 
|  | 47 | 
|  | 48 r2 <- function(target, prediction) { | 
|  | 49   sst <- sum((target-mean(target))^2) | 
|  | 50   ssr <- sum((target-prediction)^2) | 
|  | 51   return(1-ssr/sst) | 
|  | 52 } | 
|  | 53 ################################## main function ########################### | 
|  | 54 | 
|  | 55 lassoSelection <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { | 
|  | 56   # go for optimization | 
|  | 57   if(is.null(alpha)) { | 
|  | 58     alpha <- seq(0,1,0.1) | 
|  | 59     alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) | 
|  | 60   } | 
|  | 61   # evaluation | 
|  | 62   if(evaluation) { | 
|  | 63     prediction <- NULL | 
|  | 64     for(i in 1:length(folds)) { | 
|  | 65       train <- genotype[-folds[[i]],] | 
|  | 66       test <- genotype[folds[[i]],] | 
|  | 67       phenoTrain <- phenotype[-folds[[i]]] | 
|  | 68       phenoTest <- phenotype[folds[[i]]] | 
|  | 69       cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) | 
|  | 70       lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) | 
|  | 71       prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) | 
|  | 72     } | 
|  | 73     saveRDS(prediction, file=paste(outFile,".rds", sep="")) | 
|  | 74     # just create a model | 
|  | 75   } else { | 
|  | 76     cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) | 
|  | 77     model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) | 
|  | 78     saveRDS(model, file = paste(outFile, ".rds", sep = "")) | 
|  | 79   } | 
|  | 80 } | 
|  | 81 | 
|  | 82 ############################ main ############################# | 
|  | 83 # running from terminal (supposing the OghmaGalaxy/bin directory is in your path) : | 
|  | 84 # lasso.sh -i path_to_genotype -p phenotype_file -e -f fold_file -o out_file | 
|  | 85 ## -i : path to the file that contains the genotypes, must be a .rda file (as outputed by loadGenotype). | 
|  | 86 # please note that the table must be called "encoded" when your datafile is saved into .rda (automatic if encoded.R was used) | 
|  | 87 | 
|  | 88 ## -p : file that contains the phenotype must be a .rda file (as outputed by loadGenotype.R). | 
|  | 89 # please note that the table must be called "phenotype" when your datafile is saved into .rda (automatic if loadGenotype.R was used) | 
|  | 90 | 
|  | 91 ## -e : do evaluation instead of producing a model | 
|  | 92 | 
|  | 93 ## -f : index of the folds to which belong each individual | 
|  | 94 # please note that the list must be called "folds" (automatic if folds.R was used) | 
|  | 95 | 
|  | 96 ## -o : path to the output folder and generic name of the analysis. The extension related to each type of | 
|  | 97 # files are automatically added | 
|  | 98 | 
|  | 99 cmd <- commandArgs(T) | 
|  | 100 source(cmd[1]) | 
|  | 101 # check if evaluation is required | 
|  | 102 evaluation <- F | 
|  | 103 if(as.integer(doEvaluation) == 1) { | 
|  | 104   evaluation <- T | 
|  | 105   con = file(folds) | 
|  | 106   folds <- readLines(con = con, n = 1, ok=T) | 
|  | 107   close(con) | 
|  | 108   folds <- readRDS(folds) | 
|  | 109 } | 
|  | 110 # load classifier parameters | 
|  | 111 if(as.numeric(alpha) == -1) {alpha <- NULL} | 
|  | 112 # load genotype and phenotype | 
|  | 113 con = file(genotype) | 
|  | 114 genotype <- readLines(con = con, n = 1, ok=T) | 
|  | 115 close(con) | 
|  | 116 genotype <- read.table(genotype, sep="\t", h=T) | 
|  | 117 # phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve | 
|  | 118 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | 
|  | 119 # run ! | 
|  | 120 lassoSelection(genotype = data.matrix(genotype), phenotype = phenotype, | 
|  | 121                evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) | 
|  | 122 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |