| 10 | 1 ######################################################## | 
|  | 2 # | 
|  | 3 # creation date : 07/01/16 | 
| 40 | 4 # last modification : 25/10/16 | 
| 10 | 5 # author : Dr Nicolas Beaume | 
|  | 6 # | 
|  | 7 ######################################################## | 
|  | 8 | 
| 40 | 9 suppressWarnings(suppressMessages(library(randomForest))) | 
| 10 | 10 ############################ helper functions ####################### | 
| 40 | 11 # optimize | 
|  | 12 optimize <- function(genotype, phenotype, ntree=1000, | 
|  | 13                      rangeMtry=seq(ceiling(ncol(genotype)/5), | 
|  | 14                                    ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)), | 
| 10 | 15                      repet=3) { | 
| 40 | 16   # accuracy over all mtry values | 
|  | 17   acc <- NULL | 
|  | 18   for(mtry in rangeMtry) { | 
|  | 19     # to compute the mean accuracy over repetiotion for the current mtry value | 
|  | 20     tempAcc <- NULL | 
|  | 21     for(i in 1:repet) { | 
|  | 22       # 1/3 of the dataset is used as test set | 
|  | 23       n <- ceiling(nrow(genotype)/3) | 
|  | 24       indexTest <- sample(1:nrow(genotype), size=n) | 
|  | 25       # create training and test set | 
|  | 26       train <- genotype[-indexTest,] | 
|  | 27       test <- genotype[indexTest,] | 
|  | 28       phenoTrain <- phenotype[-indexTest] | 
|  | 29       phenoTest <- phenotype[indexTest] | 
|  | 30       # compute model | 
|  | 31       model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) | 
|  | 32       # predict on test set and compute accuracy | 
|  | 33       pred <- predict(model, test) | 
|  | 34       tempAcc <- c(tempAcc, r2(phenoTest, pred)) | 
| 10 | 35     } | 
| 40 | 36     # find mean accuracy for the current mtry value | 
|  | 37     acc <- c(acc, mean(tempAcc)) | 
| 10 | 38   } | 
| 40 | 39   # return mtry for the best accuracy | 
|  | 40   names(acc) <- rangeMtry | 
|  | 41   bestParam <- which.max(acc) | 
|  | 42   return(rangeMtry[bestParam]) | 
| 10 | 43 } | 
|  | 44 | 
| 40 | 45 # compute r2 by computing the classic formula | 
|  | 46 # compare the sum of square difference from target to prediciton | 
|  | 47 # to the sum of square difference from target to the mean of the target | 
| 10 | 48 r2 <- function(target, prediction) { | 
|  | 49   sst <- sum((target-mean(target))^2) | 
|  | 50   ssr <- sum((target-prediction)^2) | 
|  | 51   return(1-ssr/sst) | 
|  | 52 } | 
|  | 53 ################################## main function ########################### | 
| 40 | 54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) { | 
| 10 | 55 | 
|  | 56   # go for optimization | 
| 40 | 57   if(is.null(mtry)) { | 
|  | 58     # find best mtry | 
|  | 59     mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)) | 
|  | 60     mtry <- optimize(genotype=genotype, phenotype=phenotype, | 
|  | 61                     ntree = ntree, rangeMtry = mtry) | 
| 10 | 62   } | 
|  | 63   # evaluation | 
|  | 64   if(evaluation) { | 
|  | 65     prediction <- NULL | 
|  | 66     for(i in 1:length(folds)) { | 
| 40 | 67       # create training and testing set for the current fold | 
| 10 | 68       train <- genotype[-folds[[i]],] | 
|  | 69       test <- genotype[folds[[i]],] | 
|  | 70       phenoTrain <- phenotype[-folds[[i]]] | 
| 40 | 71       # compute model | 
| 10 | 72       rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) | 
| 40 | 73       # predict and save prediction for the current fold | 
| 10 | 74       prediction <- c(prediction, list(predict(rf, test))) | 
|  | 75     } | 
| 40 | 76     # save preductions for all folds to be used for evaluation | 
| 10 | 77     saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) | 
|  | 78   } else { | 
| 40 | 79     # just compute the model and save it | 
| 10 | 80     model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) | 
|  | 81     saveRDS(model, file = paste(outFile, ".rds", sep = "")) | 
|  | 82   } | 
|  | 83 } | 
|  | 84 | 
|  | 85 | 
|  | 86 ############################ main ############################# | 
| 40 | 87 # load parameters | 
| 10 | 88 cmd <- commandArgs(T) | 
|  | 89 source(cmd[1]) | 
|  | 90 # load classifier parameters | 
|  | 91 mtry <- as.numeric(mtry) | 
|  | 92 ntree <- as.numeric(ntree) | 
|  | 93 if(mtry == -1) {mtry <- NULL} | 
|  | 94 # check if evaluation is required | 
|  | 95 evaluation <- F | 
|  | 96 if(as.integer(doEvaluation) == 1) { | 
|  | 97   evaluation <- T | 
|  | 98   con = file(folds) | 
|  | 99   folds <- readLines(con = con, n = 1, ok=T) | 
|  | 100   close(con) | 
|  | 101   folds <- readRDS(folds) | 
|  | 102 } | 
|  | 103 # load genotype and phenotype | 
|  | 104 con = file(genotype) | 
|  | 105 genotype <- readLines(con = con, n = 1, ok=T) | 
|  | 106 close(con) | 
|  | 107 genotype <- read.table(genotype, sep="\t", h=T) | 
|  | 108 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | 
|  | 109 # run ! | 
|  | 110 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, | 
|  | 111             evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) | 
| 40 | 112 # send the path containing results to galaxy | 
| 10 | 113 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |