Mercurial > repos > nicolas > oghma
comparison randomForest.R @ 85:94aa63659613 draft
Uploaded
author | nicolas |
---|---|
date | Fri, 28 Oct 2016 08:48:22 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
84:4eea5c2313d2 | 85:94aa63659613 |
---|---|
1 ######################################################## | |
2 # | |
3 # creation date : 07/01/16 | |
4 # last modification : 25/10/16 | |
5 # author : Dr Nicolas Beaume | |
6 # | |
7 ######################################################## | |
8 | |
9 suppressWarnings(suppressMessages(library(randomForest))) | |
10 ############################ helper functions ####################### | |
11 # optimize | |
12 optimize <- function(genotype, phenotype, ntree=1000, | |
13 rangeMtry=seq(ceiling(ncol(genotype)/5), | |
14 ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)), | |
15 repet=3) { | |
16 # accuracy over all mtry values | |
17 acc <- NULL | |
18 for(mtry in rangeMtry) { | |
19 # to compute the mean accuracy over repetiotion for the current mtry value | |
20 tempAcc <- NULL | |
21 for(i in 1:repet) { | |
22 # 1/3 of the dataset is used as test set | |
23 n <- ceiling(nrow(genotype)/3) | |
24 indexTest <- sample(1:nrow(genotype), size=n) | |
25 # create training and test set | |
26 train <- genotype[-indexTest,] | |
27 test <- genotype[indexTest,] | |
28 phenoTrain <- phenotype[-indexTest] | |
29 phenoTest <- phenotype[indexTest] | |
30 # compute model | |
31 model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) | |
32 # predict on test set and compute accuracy | |
33 pred <- predict(model, test) | |
34 tempAcc <- c(tempAcc, r2(phenoTest, pred)) | |
35 } | |
36 # find mean accuracy for the current mtry value | |
37 acc <- c(acc, mean(tempAcc)) | |
38 } | |
39 # return mtry for the best accuracy | |
40 names(acc) <- rangeMtry | |
41 bestParam <- which.max(acc) | |
42 return(rangeMtry[bestParam]) | |
43 } | |
44 | |
45 # compute r2 by computing the classic formula | |
46 # compare the sum of square difference from target to prediciton | |
47 # to the sum of square difference from target to the mean of the target | |
48 r2 <- function(target, prediction) { | |
49 sst <- sum((target-mean(target))^2) | |
50 ssr <- sum((target-prediction)^2) | |
51 return(1-ssr/sst) | |
52 } | |
53 ################################## main function ########################### | |
54 rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) { | |
55 | |
56 # go for optimization | |
57 if(is.null(mtry)) { | |
58 # find best mtry | |
59 mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)) | |
60 mtry <- optimize(genotype=genotype, phenotype=phenotype, | |
61 ntree = ntree, rangeMtry = mtry) | |
62 } | |
63 # evaluation | |
64 if(evaluation) { | |
65 prediction <- NULL | |
66 for(i in 1:length(folds)) { | |
67 # create training and testing set for the current fold | |
68 train <- genotype[-folds[[i]],] | |
69 test <- genotype[folds[[i]],] | |
70 phenoTrain <- phenotype[-folds[[i]]] | |
71 # compute model | |
72 rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) | |
73 # predict and save prediction for the current fold | |
74 prediction <- c(prediction, list(predict(rf, test))) | |
75 } | |
76 # save preductions for all folds to be used for evaluation | |
77 saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) | |
78 } else { | |
79 # just compute the model and save it | |
80 model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) | |
81 saveRDS(model, file = paste(outFile, ".rds", sep = "")) | |
82 } | |
83 } | |
84 | |
85 | |
86 ############################ main ############################# | |
87 # load parameters | |
88 cmd <- commandArgs(T) | |
89 source(cmd[1]) | |
90 # load classifier parameters | |
91 mtry <- as.numeric(mtry) | |
92 ntree <- as.numeric(ntree) | |
93 if(mtry == -1) {mtry <- NULL} | |
94 # check if evaluation is required | |
95 evaluation <- F | |
96 if(as.integer(doEvaluation) == 1) { | |
97 evaluation <- T | |
98 con = file(folds) | |
99 folds <- readLines(con = con, n = 1, ok=T) | |
100 close(con) | |
101 folds <- readRDS(folds) | |
102 } | |
103 # load genotype and phenotype | |
104 con = file(genotype) | |
105 genotype <- readLines(con = con, n = 1, ok=T) | |
106 close(con) | |
107 genotype <- read.table(genotype, sep="\t", h=T) | |
108 phenotype <- read.table(phenotype, sep="\t", h=T)[,1] | |
109 # run ! | |
110 rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, | |
111 evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) | |
112 # send the path containing results to galaxy | |
113 cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |