Previous changeset 69:55449ced23d5 (2016-10-26) Next changeset 71:37d3d073b51d (2016-10-28) |
Commit message:
Deleted selected files |
removed:
computeR2.R computeR2.xml encode.R encode.xml evaluation.R evaluation.xml folds.R folds.xml lasso.R lasso.xml plotPrediction.R plotPrediction.xml prediction.R prediction.xml randomForest.R randomForest.xml rrBLUP.R rrBLUP.xml svm.R svm.xml |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 computeR2.R --- a/computeR2.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,26 +0,0 @@ -######################################################## -# -# creation date : 27/06/16 -# last modification : 22/10/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -computeR2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} -############################ main ############################# -# extract argument -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load target and prediction -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -predicted <- read.table(predicted, sep = "\t", h=T)[,2] -# compute r2 -cat(computeR2(phenotype, predicted), file=out) \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 computeR2.xml --- a/computeR2.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,36 +0,0 @@ -<tool id="computeR2" name="compute R2" version="1.0.0"> - <description>compute R-squared value of a prediction vs true phenotype</description> - <command interpreter="Rscript"> - computeR2.R $config - </command> - - <inputs> - <param name="phenotype" type="data" - label="true phenotype" - /> - - <param name="predictedPhenotype" type="data" - label="predicted phenotype" - /> - - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${predictedPhenotype}" -> predicted -"${phenotype}" -> phenotype -"${r2}" -> out - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "r2" /> -</outputs> - - <help> - compute R2 of prediction vs true phenotype - </help> - </tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 encode.R --- a/encode.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,126 +0,0 @@ -######################################################## -# -# creation date : 04/01/16 -# last modification : 17/09/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -############################ helper functions ####################### - -# encode one position in one individual -encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){ - res <- x - if(!is.na(x)) { - if(isHeterozygous(x, sep = sep)) { - # heterozygous - res <- code[2] - } else { - # determine whether it is the minor or major allele - x <- unlist(strsplit(x, sep)) - # need to check only one element as we already know it is a homozygous - if(length(x) > 1) { - x <- x[1] - } - if(x==major) { - res <- code[3] - } else { - res <- code[1] - } - } - } else { - # keep NA as NA - res <- NA - } - return(res) -} - -# rewrite a marker to make an exact count of the allele for the current marker -encodeGenotype.rewrite <- function(x, sep=""){ - res <- x - if(!is.na(x)) { - if(length(unlist(strsplit(x,sep)))==1) { - # in case of homozygous, must be counted 2 times so the caracter is written 2 times - res <- c(x,x) - } else { - # heterozygous - res <- unlist(strsplit(x, split=sep)) - } - } else { - res <- NA - } - return(res) -} - -# encode one individual -encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){ - # rewrite genotype to make sure everything is written as a double caracter value - newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep)) - # compute the occurcence of each genotype to determine major an minor allele - stat <- table(as.character(newIndiv)) - major <- names(stat)[which.max(stat)] - # Encode using the appropriate code - indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep)) - return(indiv) -} - -# determine if the genotype is an homozygous or heterozygous one -# genotype must be written with two characters, even homozygous -# (see encodeGenotype.rewrite() function ) -isHeterozygous <- function(genotype, sep=""){ - bool <- F - # case of NA, can't be determined - if(is.na(genotype)){ - bool <- NA - } else { - # check whether both element of the genotype are the same or not - x <- unlist(strsplit(genotype, sep)) - if(length(x) > 1 & !(x[1] %in% x[2])) { - bool <- T - } - - } - return(bool) -} - -# check if encoding has been made properly. return a boolean vector -# which has the same length that the number of columns of the input matrix -# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise -# please note that this function is not used in the current tool and let -# by convinience for being used outside of galaxy -checkEncoding <- function(encoded, code=c(0,1,2)) { - check <- NULL - for(i in 1:ncol(encoded)) { - # find major an minor allele - major <- length(which(encoded[,i]==code[3])) - minor <- length(which(encoded[,i]==code[1])) - # comaprison - if(major >= minor) { - check <- c(check, T) - } else { - check <- c(check, F) - } - } - return(check) -} - -################################## main function ########################### -# encode all individuals -encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){ - # encode genotype - encoded <- apply(raw, 2, encodeGenotype.vec, sep, code) - # set all NA to -1 (thus encoding schems with -1 are not allowed) - encoded[is.na(encoded)] <- -1 - write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t") -} - -############################ main ############################# -# load argument from xml file -cmd <- commandArgs(T) -source(cmd[1]) -# load genotype -genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T) -code <- c(0,1,2) -encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out) -cat(paste(out,".csv", "\n", sep="")) \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 encode.xml --- a/encode.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,40 +0,0 @@ -<tool id="encode" name="encode" version="1.0.0"> - <description>encode genotypes for further machine learning analysis</description> - <command interpreter="Rscript"> - encode.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" help="genotype must be a .csv" - /> - - <param name="separator" type="text" value="/" - label="separator of the heterozygous" help=" caracter used to separate heterozygous in the genotype" - /> - - <param name="encodedPath" type="text" - label="path to the output file" help= " a .csv extension is automatically added by OGHMA" - /> - </inputs> - - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${separator}" -> sep -"${encodedPath}" -> out - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name="output1" label="encoded data" /> -</outputs> - - <help> - Takes the genotype and encode them in the desired schem (usually 3 integer like 1/2/3) for minor homozygous/heterozygous/major homozygous. The output is a .csv file that may be use by other tools in the OGHMA workflow. - </help> - </tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 evaluation.R --- a/evaluation.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,95 +0,0 @@ - -######################################################## -# -# creation date : 08/01/16 -# last modification : 23/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -# compute correlation -predictionCorrelation <- function(prediction, obs) { - return(cor(prediction, obs, method = "pearson")) -} - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} - -################################## main function ########################### - -evaluatePredictions <- function(data, prediction=NULL, traitValue=NULL, doR2=F, - folds=NULL, classes=NULL, doCor=F) { - eval <- NULL - # case of r2 - if(doR2) { - eval <- c(eval, list(r2=NULL)) - } - if(doCor) { - eval <- c(eval, list(cor=NULL)) - } - for (i in 1:length(folds)) { - train <- unlist(folds[-i]) - test <- folds[[i]] - if(doR2) { - if(!is.null(traitValue)) { - eval$r2 <- c(eval$r2, r2(traitValue[test], prediction[[i]])) - } else { - eval$r2 <- c(eval$r2, NA) - } - } - # case of correlation - if(doCor) { - if(!is.null(traitValue)) { - eval$cor <- c(eval$cor, cor(traitValue[test], prediction[[i]])) - } else { - eval$cor <- c(eval$cor, NA) - } - } - } - # print output - print(eval) - if(doR2) { - print(paste("mean r2 : ", mean(eval$r2), "standard deviation :", sd(eval$r2))) - } - if(doCor) { - print(paste("mean correlation : ", mean(eval$cor), "standard deviation :", sd(eval$cor))) - } -} -############################ main ############################# - -# load arguments -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# set which evaluation are used -if(as.integer(doR2) == 1) { - doR2 <- T -} -if(as.integer(doCor) == 1) { - doCor <- T -} -# load genotype & phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t",h=T) -phenotype <- read.table(phenotype, sep="\t",h=T)[,1] -# load prediction -con = file(prediction) -prediction <- readLines(con = con, n = 1, ok=T) -close(con) -prediction <- readRDS(prediction) -# load folds -con = file(folds) -folds <- readLines(con = con, n = 1, ok=T) -close(con) -folds <- readRDS(folds) -# evaluation -evaluatePredictions(data=genotype, prediction=prediction, traitValue=phenotype, doR2=doR2, - folds=folds, doCor=doCor) \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 evaluation.xml --- a/evaluation.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,55 +0,0 @@ -<tool id="evaluation" name="evaluation" version="1.0.0"> - <description>evaluate results of classifiers prediction</description> - <command interpreter="Rscript"> - evaluation.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" - /> - - <param name="prediction" type="data" - label="results of prediction" - /> - - <param name="phenotype" type="data" - label="phenotype file" - /> - - <param name="r2" type="integer" value="1" - label="compute r-squared distance" help=" whether or not to compute the r-squared distance. 1 means the tool will compute it, any other argument means it will not be computed " - /> - - <param name="cor" type="integer" value="1" - label="compute correlation" help=" whether or not to compute the correlation between predictions and ture values. 1 means the tool will compute it, any other argument means it will not be computed " - /> - - <param name="folds" type="data" - label="folds" help=" path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. " - /> - - </inputs> - - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${prediction}" -> prediction -"${r2}" -> doR2 -"${cor}" -> doCor -"${folds}" -> folds - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="classifier prediction" /> -</outputs> - - <help> - evaluate the predictions of classifiers. For now correlation between true valeus and predictions, and r-squared distance are implemeted - </help> - </tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 folds.R --- a/folds.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,39 +0,0 @@ -######################################################## -# -# creation date : 05/01/16 -# last modification : 27/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - - -############################ main function ####################### - -# create fold by picking at random row indexes -createFolds <- function(nbObs, n) { - # pick indexes - index <- sample(1:n, size=nbObs, replace = T) - # populate folds - folds <- NULL - for(i in 1:n) { - folds <- c(folds, list(which(index==i))) - } - return(folds) -} - -############################ main ############################# -# load arguments -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load data and merge them -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -# fold creation -nObs <- nrow(read.table(genotype, sep="\t", h=T)) -folds <- createFolds(nObs, as.numeric(n)) -# save them into a rds and send back to galaxy the path -out <- paste(out,".rds",sep="") -saveRDS(folds, file=out) -cat(paste(out, "\n", sep="")) \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 folds.xml --- a/folds.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,41 +0,0 @@ -<tool id="folds" name="folds" version="1.0.0"> - <description>create folds for classifiers evaluation through cross-validation</description> - <command interpreter="Rscript"> - folds.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="encoded data" help="encoded must be a .rds file" - /> - - <param name="n" type="integer" value ="7" - label="number of folds" help=" must be an integer. You may want its value be at least 5" - /> - - <param name="foldsFile" type="text" - label="path to the output file" help= " a valid path is expected " - /> - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${n}" -> n -"${foldsFile}" -> out - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="folds" /> -</outputs> - - <help> - Takes the genotypes and use it to determine the different folds for further cross-valisations - return a rda file that contains a list of indexes of the genotype, each element of the list is a fold - the list is made to be used by other tools of the OGHMA suite. - </help> - </tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 lasso.R --- a/lasso.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,122 +0,0 @@ -######################################################## -# -# creation date : 08/01/16 -# last modification : 01/09/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -suppressWarnings(suppressMessages(library(glmnet))) -library(methods) -############################ helper functions ####################### - - -# optimize alpha parameter -optimize <- function(genotype, phenotype, alpha=seq(0,1,0.1), repet=7) { - acc <- NULL - indexAlpha <- 1 - for(a in alpha) { - curAcc <- NULL - # repeat nfolds time each analysis - for(i in 1:repet) { - # draw at random 1/3 of the training set for testing and thus choose alpha - # note it is not a cross-validation - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - # create training set and test set - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - # cv.glmnet allow to compute lambda at the current alpha - cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=a) - # create model - model <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=a, lambda = cv$lambda.1se) - # predict test set - pred <- predict(model, test, type = "response") - # compute r2 for choosing the best alpha - curAcc <- c(curAcc, r2(phenoTest, pred)) - } - # add mean r2 for this value of lambda to the accuracy vector - acc <- c(acc, mean(curAcc)) - } - # choose best alpha - names(acc) <- alpha - return(as.numeric(names(acc)[which.max(acc)])) -} - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} -################################## main function ########################### - -lasso <- function(genotype, phenotype, evaluation = T, outFile, folds, alpha=NULL) { - # go for optimization - if(is.null(alpha)) { - alpha <- seq(0,1,0.1) - alpha <- optimize(genotype=genotype, phenotype=phenotype, alpha = alpha) - } - # evaluation - if(evaluation) { - prediction <- NULL - # do cross-validation - for(i in 1:length(folds)) { - # create training and test set - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - phenoTest <- phenotype[folds[[i]]] - # cv.glmnet helps to compute the right lambda for a chosen alpha - cv <- cv.glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha) - # create model - lasso.fit <- glmnet(x=as.matrix(train), y=phenoTrain, alpha=alpha, lambda = cv$lambda.1se) - # predict value of the test set for further evaluation - prediction <- c(prediction, list(predict(lasso.fit, test, type = "response")[,1])) - } - # save predicted value for test set of each fold for further evaluation - saveRDS(prediction, file=paste(outFile,".rds", sep="")) - # just create a model - } else { - # cv.glmnet helps to compute the right lambda for a chosen alpha - cv <- cv.glmnet(x=genotype, y=phenotype, alpha=alpha) - # create model - model <- glmnet(x=genotype, y=phenotype, alpha=alpha, lambda=cv$lambda.1se) - # save model - saveRDS(model, file = paste(outFile, ".rds", sep = "")) - } -} - -############################ main ############################# -# load argument -cmd <- commandArgs(T) -source(cmd[1]) -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# load classifier parameters -alpha <- as.numeric(alpha) -if(alpha < 0 | alpha > 1) {alpha <- NULL} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -lasso(genotype = data.matrix(genotype), phenotype = phenotype, - evaluation = evaluation, outFile = out, folds = folds, alpha = alpha) -# return path of the result file to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 lasso.xml --- a/lasso.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,54 +0,0 @@ -<tool id="lasso" name="LASSO" version="1.0.0"> - <description>predict phenotype using a LASSO approach</description> - <command interpreter="Rscript"> - lasso.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" help="a tabular datatype containing the encoded genotypes" - /> - - <param name="phenotype" type="data" - label="phenotype data" help=" a tabular datatype containing the phenotypes " - /> - - <param name="eval" type="integer" value="0" - label="do evaluation" help=" whether to produce a model or to use folds to evaluate the tool. 1 means the tool will be evaluate (and a folds argument is required) any other value produces a model " - /> - - <param name="alpha" type="float" value="-1" - label="alpha" help="the alpha parameter of LASSO. -1 means the parameter must be optimized by the tool " - /> - - <param name="folds" type="data" optional="true" - label="folds" help=" OPTIONAL ARGUMENT path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. " - /> - - <param name="model" type="text" - label="path to the output folds" help= " a path to a file where the results (depending on the chosen mode) will be writen" - /> - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${eval}" -> doEvaluation -"${folds}" -> folds -"${model}" -> out -"${alpha}" -> alpha - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="LASSO output" /> -</outputs> - - <help> - make the classification using the LASSO method - </help> -</tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 plotPrediction.R --- a/plotPrediction.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,30 +0,0 @@ -######################################################## -# -# creation date : 07/06/16 -# last modification : 07/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -library("miscTools") -# scatterplot of the prediction vs target -r2.plot <- function(true, predicted) { - # the scatterplot - plot(true, predicted, xlab="trait value", ylab="predicted value", main="", pch=16, - ylim=c(min(min(true), min(predicted)), max(max(true), max(predicted)))) - # add a red lines with ideal case - lines(true, true, col="red") -} - -############################ main ############################# -# load argument -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load prediction and target -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -predicted <- read.table(predicted, sep = "\t", h=T)[,2] -# plot in a pdf that will be available in galaxy history panel -pdf(out) -r2.plot(phenotype, predicted = predicted) -dev.off() \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 plotPrediction.xml --- a/plotPrediction.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,36 +0,0 @@ -<tool id="plotPrediction" name="plot prediction" version="1.0.0"> - <description>shows scatterplot of a prediction vs true phenotype</description> - <command interpreter="Rscript"> - plotPrediction.R $config - </command> - - <inputs> - <param name="phenotype" type="data" - label="true phenotype" - /> - - <param name="predictedPhenotype" type="data" - label="predicted phenotype" - /> - - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${predictedPhenotype}" -> predicted -"${phenotype}" -> phenotype -"${r2}" -> out - - </configfile> -</configfiles> - -<outputs> - <data format="pdf" name = "r2" /> -</outputs> - - <help> - draw R2 of prediction vs true phenotype - </help> - </tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 prediction.R --- a/prediction.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,52 +0,0 @@ -######################################################## -# -# creation date : 26/01/16 -# last modification : 02/06/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## - -library(rrBLUP) -suppressWarnings(suppressMessages(library(randomForest))) -library(e1071) -suppressWarnings(suppressMessages(library(glmnet))) -library(methods) - - -############################ main ############################# -classifierNames <- c("list", "randomForest", "svm", "glmnet") -# load argument -cmd <- commandArgs(trailingOnly = T) -source(cmd[1]) -# load data -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -con = file(model) -model <- readLines(con = con, n = 1, ok=T) -close(con) -model <- readRDS(model) -# check if the classifier name is valid -if(all(is.na(match(class(model), classifierNames)))) { - stop(paste(class(model), "is not recognized as a valid model. Valid models are : ", classifierNames)) -} -# run prediction according to the classifier -# rrBLUP prediction -if(any(class(model) == "list")) { - predictions <- as.matrix(genotype) %*% as.matrix(model$u) - predictions <- predictions[,1]+model$beta - predictions <- data.frame(lines=rownames(genotype), predictions=predictions) -# LASSO prediction -} else if(any(class(model) == "glmnet")) { - predictions <- predict(model, as.matrix(genotype), type = "response") - predictions <- data.frame(lines=rownames(genotype), predictions=predictions) -# SVM or RandomForest prediction (predict is a wrapper for many machine learning function) -} else { - predictions <- predict(model, genotype) - predictions <- data.frame(lines=names(predictions), predictions=predictions) -} -# save results -write.table(predictions, file=out, sep="\t", row.names = F) - |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 prediction.xml --- a/prediction.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,46 +0,0 @@ -<tool id="prediction" name="prediction" version="1.0.0"> - <description>use machine learning model to predict phenotype from genotype</description> - <command interpreter="Rscript"> - prediction.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" help="genotype must be a .rda file - containing a R dataframe/matrix called /encoded/ " - /> - - <param name="model" type="data" - label="classifier model" help="model created by one of the classifier tool of the OGHMA suite. Represented as a .rda file, containing a dataframe called /model/" - /> - - <!-- deprecated - <param name="name" type="text" - label="names" help=" prefixe given to all results of the evaluation in the output folder (see -o option) to distinguish them" - /> - - - <param name="outputPath" type="data" - label="path to the output folds" help= " a path to a FOLDER where the results will be writen" - /> --> - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${model}" -> model -"${output1}" -> out - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="classifier prediction" /> -</outputs> - - <help> - Use model designed by any clasifier tool from the OGHMA suite to predict the phenotype of a dataset provided as input. results are stored in a folder under a .rda file - </help> -</tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 randomForest.R --- a/randomForest.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,113 +0,0 @@ -######################################################## -# -# creation date : 07/01/16 -# last modification : 25/10/16 -# author : Dr Nicolas Beaume -# -######################################################## - -suppressWarnings(suppressMessages(library(randomForest))) -############################ helper functions ####################### -# optimize -optimize <- function(genotype, phenotype, ntree=1000, - rangeMtry=seq(ceiling(ncol(genotype)/5), - ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)), - repet=3) { - # accuracy over all mtry values - acc <- NULL - for(mtry in rangeMtry) { - # to compute the mean accuracy over repetiotion for the current mtry value - tempAcc <- NULL - for(i in 1:repet) { - # 1/3 of the dataset is used as test set - n <- ceiling(nrow(genotype)/3) - indexTest <- sample(1:nrow(genotype), size=n) - # create training and test set - train <- genotype[-indexTest,] - test <- genotype[indexTest,] - phenoTrain <- phenotype[-indexTest] - phenoTest <- phenotype[indexTest] - # compute model - model <- randomForest(x=train, y=phenoTrain, ntree = ntree, mtry =mtry) - # predict on test set and compute accuracy - pred <- predict(model, test) - tempAcc <- c(tempAcc, r2(phenoTest, pred)) - } - # find mean accuracy for the current mtry value - acc <- c(acc, mean(tempAcc)) - } - # return mtry for the best accuracy - names(acc) <- rangeMtry - bestParam <- which.max(acc) - return(rangeMtry[bestParam]) -} - -# compute r2 by computing the classic formula -# compare the sum of square difference from target to prediciton -# to the sum of square difference from target to the mean of the target -r2 <- function(target, prediction) { - sst <- sum((target-mean(target))^2) - ssr <- sum((target-prediction)^2) - return(1-ssr/sst) -} -################################## main function ########################### -rfSelection <- function(genotype, phenotype, folds, outFile, evaluation=T, mtry=NULL, ntree=1000) { - - # go for optimization - if(is.null(mtry)) { - # find best mtry - mtry <- seq(ceiling(ncol(genotype)/10), ceiling(ncol(genotype)/3), ceiling(ncol(genotype)/100)) - mtry <- optimize(genotype=genotype, phenotype=phenotype, - ntree = ntree, rangeMtry = mtry) - } - # evaluation - if(evaluation) { - prediction <- NULL - for(i in 1:length(folds)) { - # create training and testing set for the current fold - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - # compute model - rf <- randomForest(x=train, y=phenoTrain, mtry = mtry, ntree = ntree) - # predict and save prediction for the current fold - prediction <- c(prediction, list(predict(rf, test))) - } - # save preductions for all folds to be used for evaluation - saveRDS(prediction, file = paste(outFile, ".rds", sep = "")) - } else { - # just compute the model and save it - model <- randomForest(x=genotype, y=phenotype, mtry = mtry, ntree=ntree) - saveRDS(model, file = paste(outFile, ".rds", sep = "")) - } -} - - -############################ main ############################# -# load parameters -cmd <- commandArgs(T) -source(cmd[1]) -# load classifier parameters -mtry <- as.numeric(mtry) -ntree <- as.numeric(ntree) -if(mtry == -1) {mtry <- NULL} -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -rfSelection(genotype = data.matrix(genotype), phenotype=phenotype, - evaluation = evaluation, out = out, folds = folds, mtry = mtry, ntree=ntree) -# send the path containing results to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 randomForest.xml --- a/randomForest.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,59 +0,0 @@ -<tool id="randomForest" name="randomForest" version="1.0.1"> - <description>predict phenotype using a random forest approach</description> - <command interpreter="Rscript"> - randomForest.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" help="a tabular datatype containing the encoded genotypes" - /> - - <param name="phenotype" type="data" - label="phenotype data" help=" a tabular datatype containing the phenotypes " - /> - - <param name="eval" type="integer" value="0" - label="do evaluation" help=" whether to produce a model or to use folds to evaluate the tool. 1 means the tool will be evaluate (and a folds argument is required) any other value produces a model " - /> - - <param name="ntree" type="float" value="1000" - label="ntree" help="the ntree parameter of Random Forest. Suitable value could be around 1000. " - /> - - <param name="mtry" type="float" value="-1" - label="mtry" help="the mtry parameter of Random Forest (number of node per tree). default value are around nuber_of_variable/3 through optimization is highly desirable. -1 means the parameter must be optimized by the tool" - /> - - <param name="folds" type="data" optional="true" - label="folds" help=" OPTIONAL ARGUMENT path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. " - /> - - <param name="model" type="text" - label="path to the output folds" help= " a path to a file where the results (depending on the chosen mode) will be writen" - /> - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${eval}" -> doEvaluation -"${folds}" -> folds -"${model}" -> out -"${mtry}" -> mtry -"${ntree}" -> ntree - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="random forest output" /> -</outputs> - - <help> - make the classification using the random forest method - </help> -</tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 rrBLUP.R --- a/rrBLUP.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,71 +0,0 @@ - -######################################################## -# -# creation date : 05/01/16 -# last modification : 25/10/16 -# author : Dr Nicolas Beaume -# -######################################################## - - -library(rrBLUP) -############################ helper functions ####################### - - -################################## main function ########################### -# do rrBLUP evaluation of classification. -# optimization of paramaters is included in rrBLUP package -rrBLUP <- function(genotype, phenotype, outFile, evaluation=F, folds) { - # Evaluation mode - if(evaluation) { - prediction <- NULL - # run over folds - for(i in 1:length(folds)) { - # create training and test set for this fold - train <- genotype[-folds[[i]],] - test <- genotype[folds[[i]],] - phenoTrain <- phenotype[-folds[[i]]] - phenoTest <- phenotype[folds[[i]]] - # create model - model <-mixed.solve(phenoTrain, Z=train,K=NULL, SE=F,return.Hinv = F) - # predict current test set - pred <- as.matrix(test) %*% as.matrix(model$u) - pred <- pred[,1]+model$beta - prediction <- c(prediction, list(pred)) - } - # save results - saveRDS(prediction, file=paste(outFile,".rds", sep="")) - # just create a model - } else { - # create and save modle - model <-mixed.solve(phenotype, Z=genotype,K=NULL, SE=F,return.Hinv = F) - saveRDS(model, file = paste(outFile, ".rds", sep = "")) - } -} - - -############################ main ############################# -# get argument from xml file -cmd <- commandArgs(T) -source(cmd[1]) -# for evaluation mode : set evaluation as True and load fold file -if(as.integer(evaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} else{ - evaluation <- F -} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -rrBLUP(genotype = genotype, phenotype = phenotype, outFile = out, - evaluation = evaluation, folds = folds) -# return path of the result to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 rrBLUP.xml --- a/rrBLUP.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,50 +0,0 @@ -<tool id="rrBLUP" name="rrBLUP" version="1.0.0"> - <description>predict phenotype using a rrBLUP approach</description> - <command interpreter="Rscript"> - rrBLUP.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" help="a tabular datatype containing the encoded genotypes" - /> - - <param name="phenotype" type="data" - label="phenotype data" help=" a tabular datatype containing the phenotypes " - /> - - <param name="eval" type="integer" value="0" - label="do evaluation" help=" whether to produce a model or to use folds to evaluate the tool. 1 means the tool will be evaluate (and a folds argument is required) any other value produces a model " - /> - - <param name="folds" type="data" optional="true" - label="folds" help=" OPTIONAL ARGUMENT path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. " - /> - - <param name="model" type="text" - label="path to the output folds" help= " a path to a file where the results (depending on the chosen mode) will be writen" - /> - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${model}" -> out -"${eval}" -> evaluation -"${folds}" -> folds - - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="rrBLUP output" /> -</outputs> - - <help> - make the classification using the rrBLUP method - </help> - </tool> \ No newline at end of file |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 svm.R --- a/svm.R Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,167 +0,0 @@ -######################################################## -# -# creation date : 07/01/16 -# last modification : 03/07/16 -# author : Dr Nicolas Beaume -# owner : IRRI -# -######################################################## -library("e1071") -options(warn=-1) -############################ helper functions ####################### -# optimize svm parameters -optimizeSVM <- function(train, target, kernel="radial", g=NULL, c=NULL, coef=NULL, d=NULL) { - # tuning parameters then train - model <- NULL - if(is.null(g)){g <- 10^(-6:0)} - if(is.null(c)){c <- 10^(-1:0)} - # optimize parameter for the kernel in use - switch(kernel, - # sigmoid kernel : need gamma, cost and coef - sigmoid={ - if(is.null(coef)){coef <- 0:3}; - # optimize then extract best parameters - tune <- tune.svm(train, target, gamma = g, cost = 10^(0:2), kernel="sigmoid", coef0 = coef); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[3]]; - coef <- tune$best.parameters[[2]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "sigmoid")}, - # linear kernel, only cost is required - linear={ - # optimize then extract best parameters - tune <- tune.svm(train, target, cost = c, kernel="linear"); - c <- tune$best.parameters[[1]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "linear")}, - # polynomial kernel, use degree, gamma, cost and coef as parameters - polynomial={ - # optimize and extract best parameters - if(is.null(coef)){coef <- 0:3}; - if(is.null(d)){d <- 0:4}; - tune <- tune.svm(train, target, gamma = g, cost = c, degree = d, coef0 = coef, kernel="polynomial"); - d <- tune$best.parameters[[1]]; - g <- tune$best.parameters[[2]]; - coef <- tune$best.parameters[[3]]; - c <- tune$best.parameters[[4]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "polynomial", degree = d, coef0 = coef)}, - # default : radial kernel, use gamma and cost as parameters - { - # optimize and extract parameters - tune <- tune.svm(train, target, gamma = g, cost = c, kernel="radial"); - g <- tune$best.parameters[[1]]; - c <- tune$best.parameters[[2]]; - # compute model - model <- svm(x=train, y=target, gamma = g, cost = c, kernel = "radial")} - ) - return(model) -} -################################## main function ########################### -svmClassifier <- function(genotype, phenotype, evaluation = T, outFile, folds, kernel="radial", g=NULL, c=NULL, coef=NULL, d=NULL) { - # optimize classifier if any parameter is NULL - switch(kernel, - # sigmoid kernel : need gamma, cost and coef - sigmoid={ - if(any(is.null(c(coef, c, g)))){ - fit <- optimizeSVM(genotype, phenotype, kernel = "sigmoid", - g = g, c=c, coef = coef); - c <- fit$cost; - g <- fit$gamma; - coef <- fit$coef0; - } - }, - # linear kernel, only cost is required - linear={ - if(is.null(c)){fit <- optimizeSVM(genotype, phenotype, kernel = "linear", c=c); - c <- fit$cost; - } - }, - # polynomial kernel, use degree, gamma, cost and coef as parameters - polynomial={ - if(any(is.null(c(coef, c, g, d)))){fit <- optimizeSVM(genotype, phenotype, kernel = "polynomial", - g = g, c=c, coef = coef, d = d); - c <- fit$cost; - g <- fit$gamma; - coef <- fit$coef0; - d <- fit$degree - } - }, - # default : radial kernel, use gamma and cost as parameters - {if(any(is.null(c(c, g)))){fit <- optimizeSVM(genotype, phenotype, kernel = "radial", - g = g, c=c); - c <- fit$cost; - g <- fit$gamma; - } - } - ) - # do evaluation - if(evaluation) { - prediction <- NULL - # iterate over folds - for(i in 1:length(folds)) { - # prepare indexes for training and test - test <- folds[[i]] - train <- unlist(folds[-i]) - # compute model - svm.fit <- optimizeSVM(train = genotype[train,], target = phenotype[train], kernel=kernel, - g=g, c=c, coef=coef, d=d) - # predict on test set of the current fold - prediction <- c(prediction, list(predict(svm.fit, genotype[test,]))) - } - # save all prediction for further evaluation - saveRDS(prediction, file=paste(outFile, ".rds", sep = "")) - } else { - # compute model and save it - switch(kernel, - # sigmoid kernel : need gamma, cost and coef - sigmoid={ - model <- svm(x = genotype, y = phenotype, kernel="sigmoid", gamma =g, - cost =c, coef0=coef) - }, - # linear kernel, only cost is required - linear={ - model <- svm(x = genotype, y = phenotype, kernel="linear", cost =c) - }, - # polynomial kernel, use degree, gamma, cost and coef as parameters - polynomial={ - model <- svm(x = genotype, y = phenotype, kernel="polynomial", gamma =g, cost =c, - coef0=coef, degree =d) - }, - # default : radial kernel, use gamma and cost as parameters - { model <- svm(x = genotype, y = phenotype, kernel="radial", gamma =g, cost =c) - }) - saveRDS(model, file=paste(outFile, ".rds", sep = "")) - } -} - -############################ main ############################# -# load argument -cmd <- commandArgs(T) -source(cmd[1]) -# check for svm paramater, especially to know if optimization is requiered -if(as.numeric(g) == -1) {g <- NULL} -if(as.numeric(c) == -1) {c <- NULL} -if(as.numeric(coef) == -1) {coef <- NULL} -if(as.numeric(d) == -1) {d <- NULL} -# check if evaluation is required -evaluation <- F -if(as.integer(doEvaluation) == 1) { - evaluation <- T - con = file(folds) - folds <- readLines(con = con, n = 1, ok=T) - close(con) - folds <- readRDS(folds) -} -# load genotype and phenotype -con = file(genotype) -genotype <- readLines(con = con, n = 1, ok=T) -close(con) -genotype <- read.table(genotype, sep="\t", h=T) -# phenotype is written as a table (in columns) but it must be sent as a vector for mixed.solve -phenotype <- read.table(phenotype, sep="\t", h=T)[,1] -# run ! -svmClassifier(genotype = genotype, phenotype = phenotype, - evaluation = evaluation, outFile = out, folds = folds, g=g, c=c, coef=coef, d=d, kernel=kernel) -# retunr path of the result file to galaxy -cat(paste(paste(out, ".rds", sep = ""), "\n", sep="")) |
b |
diff -r 55449ced23d5 -r 8cc5a7448ca6 svm.xml --- a/svm.xml Wed Oct 26 19:16:22 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,74 +0,0 @@ -<tool id="svm" name="svm" version="1.0.0"> - <description>predict phenotype using a SVM approach</description> - <command interpreter="Rscript"> - svm.R $config > ${output1} - </command> - - <inputs> - <param name="genotype" type="data" - label="genotype data" help="a tabular datatype containing the encoded genotypes" - /> - - <param name="phenotype" type="data" - label="phenotype data" help=" a tabular datatype containing the phenotypes " - /> - - <param name="kernel" type="text" value="radial" - label="name of the kernel" help= " four possibilities : radial, linear, polynomial or sigmoid. Any other value is treated as radial" - /> - - <param name="cost" type="float" value="-1" - label="cost" help="the cost parameter of SVM used by all kernels. Suitable values are between 0.1 and 1000. -1 means the parameter must be optimized by the tool " - /> - - <param name="gamma" type="float" value="-1" - label="gamma" help="the gamma parameter of SVM used by radial, polynomial and sigmoid kernels. Suitable values are between 1e-06 and 1. -1 means the parameter must be optimized by the tool " - /> - - <param name="coeficient" type="float" value="-1" - label="coeficient" help="the coeficient parameter of SVM used by polynomial kernel. Suitable values are between 0 and 4. -1 means the parameter must be optimized by the tool " - /> - - <param name="degree" type="float" value="-1" - label="degree" help="the maximum dgree of the polynomial kernel. Suitable values are between 1 and 3. -1 means the parameter must be optimized by the tool " - /> - - <param name="eval" type="integer" value="0" - label="do evaluation" help=" whether to produce a model or to use folds to evaluate the tool. 1 means the tool will be evaluate (and a folds argument is required) any other value produces a model " - /> - - <param name="folds" type="data" optional="true" - label="folds" help=" OPTIONAL ARGUMENT path to a folds file containing folds indexes in a R list called /folds/ such as produced by the folds tools in OGHMA suite. " - /> - - <param name="model" type="text" - label="path to the output folds" help= " a path to a file where the results (depending on the chosen mode) will be writen" - /> - </inputs> - - <configfiles> - <configfile name="config"> -## Desc: this file is sourced in encode wrapper script -## as means to pass all galaxy params to R -"${genotype}" -> genotype -"${phenotype}" -> phenotype -"${eval}" -> doEvaluation -"${folds}" -> folds -"${model}" -> out -"${kernel}" -> kernel -"${gamma}" -> g -"${cost}" -> c -"${coeficient}" -> coef -"${degree}" -> d - - </configfile> -</configfiles> - -<outputs> - <data format="tabular" name = "output1" label="SVM output" /> -</outputs> - - <help> - make the classification using the SVM method - </help> -</tool> \ No newline at end of file |