Mercurial > repos > anmoljh > rcaret_classification_model
view templateLibrary.py @ 1:247d404ffbcf draft default tip
planemo upload commit e1c7666c1ed218d9b273dc3aeb1b6b765f351324-dirty
author | anmoljh |
---|---|
date | Sun, 03 Jun 2018 03:19:45 -0400 |
parents | 2cb81da69c73 |
children |
line wrap: on
line source
def __template4Rnw(): template4Rnw = r'''%% Classification Modeling Script %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com) %% Version: 1.00 %% Created on: 2010/10/02 %% %% The originla file hs been improved by %% Deepak Bharti, Andrew M. Lynn , Anmol J. Hemrom %% Version : 1.01 %% created on : 2014/08/12 %% This is an Sweave template for building and describing %% classification models. It mixes R and LaTeX code. The document can %% be processing using R's Sweave function to produce a tex file. %% %% The inputs are: %% - the initial data set in a data frame called 'rawData' %% - a factor column in the data set called 'class'. this should be the %% outcome variable %% - all other columns in rawData should be predictor variables %% - the type of model should be in a variable called 'modName'. %% %% The script attempts to make some intelligent choices based on the %% model being used. For example, if modName is "pls", the script will %% automatically center and scale the predictor data. There are %% situations where these choices can (and should be) changed. %% %% There are other options that may make sense to change. For example, %% the user may want to adjust the type of resampling. To find these %% parts of the script, search on the string 'OPTION'. These parts of %% the code will document the options. \documentclass[14pt]{report} \usepackage{amsmath} \usepackage[pdftex]{graphicx} \usepackage{color} \usepackage{ctable} \usepackage{xspace} \usepackage{fancyvrb} \usepackage{fancyhdr} \usepackage{lastpage} \usepackage{longtable} \usepackage{algorithm2e} \usepackage[ colorlinks=true, linkcolor=blue, citecolor=blue, urlcolor=blue] {hyperref} \usepackage{lscape} \usepackage{Sweave} \SweaveOpts{keep.source = TRUE} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % define new colors for use \definecolor{darkgreen}{rgb}{0,0.6,0} \definecolor{darkred}{rgb}{0.6,0.0,0} \definecolor{lightbrown}{rgb}{1,0.9,0.8} \definecolor{brown}{rgb}{0.6,0.3,0.3} \definecolor{darkblue}{rgb}{0,0,0.8} \definecolor{darkmagenta}{rgb}{0.5,0,0.5} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\bld}[1]{\mbox{\boldmath $$#1$$}} \newcommand{\shell}[1]{\mbox{$$#1$$}} \renewcommand{\vec}[1]{\mbox{\bf {#1}}} \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize} \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize} \newcommand{\halfs}{\frac{1}{2}} \setlength{\oddsidemargin}{-.25 truein} \setlength{\evensidemargin}{0truein} \setlength{\topmargin}{-0.2truein} \setlength{\textwidth}{7 truein} \setlength{\textheight}{8.5 truein} \setlength{\parindent}{0.20truein} \setlength{\parskip}{0.10truein} \setcounter{LTchunksize}{50} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \pagestyle{fancy} \lhead{} %% OPTION Report header name \chead{Classification Model Script} \rhead{} \lfoot{} \cfoot{} \rfoot{\thepage\ of \pageref{LastPage}} \renewcommand{\headrulewidth}{1pt} \renewcommand{\footrulewidth}{1pt} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% OPTION Report title and modeler name \title{Classification Model Script using $METHOD} \author{"Lynn Group with M. Kuhn, SCIS, JNU, New Delhi"} \begin{document} \maketitle \thispagestyle{empty} <<dummy, eval=TRUE, echo=FALSE, results=hide>>= # sets values for variables used later in the program to prevent the \Sexpr error on parsing with Sweave numSamples='' classDistString='' missingText='' numPredictors='' numPCAcomp='' pcaText='' nzvText='' corrText='' ppText='' varText='' splitText="Dummy Text" nirText="Dummy Text" # pctTrain is a variable that is initialised in Data splitting, and reused later in testPred pctTrain=0.8 Smpling='' nzvText1='' classDistString1='' dwnsmpl='' upsmpl='' @ <<startup, eval= TRUE, results = hide, echo = FALSE>>= library(Hmisc) library(caret) library(pROC) versionTest <- compareVersion(packageDescription("caret")$$Version, "4.65") if(versionTest < 0) stop("caret version 4.65 or later is required") library(RColorBrewer) listString <- function (x, period = FALSE, verbose = FALSE) { if (verbose) cat("\n entering listString\n") flush.console() if (!is.character(x)) x <- as.character(x) numElements <- length(x) out <- if (length(x) > 0) { switch(min(numElements, 3), x, paste(x, collapse = " and "), { x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "") paste(x, collapse = " ") }) } else "" if (period) out <- paste(out, ".", sep = "") if (verbose) cat(" leaving listString\n\n") flush.console() out } resampleStats <- function(x, digits = 3) { bestPerf <- x$$bestTune colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf)) out <- merge(x$$results, bestPerf) out <- out[, colnames(out) %in% x$$perfNames] names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE) names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE) names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE) names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE) names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE) out <- format(out, digits = digits) listString(paste(names(out), "was", out)) } twoClassNoProbs <- function (data, lev = NULL, model = NULL) { out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]), confusionMatrix(data[, "pred"], data[, "obs"])$$overall["Kappa"]) names(out) <- c("Sens", "Spec", "Kappa") out } ##OPTION: model name: see ?train for more values/models modName <- "$METHOD" load("$RDATA") rawData <- dataX rawData$$outcome <- dataY @ \section*{Data Sets}\label{S:data} %% OPTION: provide some background on the problem, the experimental %% data, how the compounds were selected etc <<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>= if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set") if(!is.factor(rawData$$outcome)) stop("the outcome should be a factor vector") ## OPTION: when there are only two classes, the first level of the ## factor is used as the "positive" or "event" for calculating ## sensitivity and specificity. Adjust the outcome factor accordingly. numClasses <- length(levels(rawData$$outcome)) numSamples <- nrow(rawData) numPredictors <- ncol(rawData) - 1 predictorNames <- names(rawData)[names(rawData) != "outcome"] isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) if(any(!isNum)) stop("all predictors in rawData should be numeric") classTextCheck <- all.equal(levels(rawData$$outcome), make.names(levels(rawData$$outcome))) if(!classTextCheck) warning("the class levels are not valid R variable names; this may cause errors") ## Get the class distribution classDist <- table(rawData$$outcome) classDistString <- paste("``", names(classDist), "'' ($$n$$=", classDist, ")", sep = "") classDistString <- listString(classDistString) @ <<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>= colRate <- apply(rawData[, predictorNames, drop = FALSE], 2, function(x) mean(is.na(x))) ##OPTION thresholds can be changed colExclude <- colRate > $MISSINGFILTERTHRESHC missingText <- "" if(any(colExclude)) { missingText <- paste(missingText, ifelse(sum(colExclude) > 1, " There were ", " There was "), sum(colExclude), ifelse(sum(colExclude) > 1, " predictors ", " predictor "), "with an excessive number of ", "missing data. ", ifelse(sum(colExclude) > 1, " These were excluded. ", " This was excluded. ")) predictorNames <- predictorNames[!colExclude] rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE] } rowRate <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) rowExclude <- rowRate > $MISSINGFILTERTHRESHR if(any(rowExclude)) { missingText <- paste(missingText, ifelse(sum(rowExclude) > 1, " There were ", " There was "), sum(colExclude), ifelse(sum(rowExclude) > 1, " samples ", " sample "), "with an excessive number of ", "missing data. ", ifelse(sum(rowExclude) > 1, " These were excluded. ", " This was excluded. "), "After filtering, ", sum(!rowExclude), " samples remained.") rawData <- rawData[!rowExclude, ] hasMissing <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) } else { hasMissing <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) any(is.na(x))) missingText <- paste(missingText, ifelse(missingText == "", "There ", "Subsequently, there "), ifelse(sum(hasMissing) == 1, "was ", "were "), ifelse(sum(hasMissing) > 0, sum(hasMissing), "no"), ifelse(sum(hasMissing) == 1, "sample ", "samples "), "with missing values.") rawData <- rawData[complete.cases(rawData),] } rawData1 <- rawData[,1:length(rawData)-1] rawData2 <- rawData[,length(rawData)] set.seed(222) nzv1 <- nearZeroVar(rawData1) if(length(nzv1) > 0) { nzvVars1 <- names(rawData1)[nzv1] rawData <- rawData1[, -nzv1] rawData$outcome <- rawData2 nzvText1 <- paste("There were ", length(nzv1), " predictors that were removed from original data due to", " severely unbalanced distributions that", " could negatively affect the model fit", ifelse(length(nzv1) > 10, ".", paste(": ", listString(nzvVars1), ".", sep = "")), sep = "") } else { rawData <- rawData1 rawData$outcome <- rawData2 nzvText1 <- "" } remove("rawData1") remove("rawData2") @ The initial data set consisted of \Sexpr{numSamples} samples and \Sexpr{numPredictors} predictor variables. The breakdown of the outcome data classes were: \Sexpr{classDistString}. \Sexpr{missingText} \Sexpr{nzvText1} <<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>= predictorNames <- names(rawData)[names(rawData) != "outcome"] numPredictors <- length(predictorNames) predictors <- rawData[, predictorNames, drop = FALSE] ## PCA will fail with predictors having less than 2 unique values isZeroVar <- apply(predictors, 2, function(x) length(unique(x)) < 2) if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE] ## For whatever, only the formula interface to prcomp ## handles missing values pcaForm <- as.formula( paste("~", paste(names(predictors), collapse = "+"))) pca <- prcomp(pcaForm, data = predictors, center = TRUE, scale. = TRUE, na.action = na.omit) ## OPTION: the number of components plotted/discussed can be set numPCAcomp <- $PCACOMP pctVar <- pca$$sdev^2/sum(pca$$sdev^2)*100 pcaText <- paste(round(pctVar[1:numPCAcomp], 1), "\\\\%", sep = "") pcaText <- listString(pcaText) @ To get an initial assessment of the separability of the classes, principal component analysis (PCA) was used to distill the \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp} surrogate variables (i.e. the principal components) in a manner that attempts to maximize the amount of information preserved from the original predictor set. Figure \ref{F:inititalPCA} contains plots of the first \Sexpr{numPCAcomp} components, which accounted for \Sexpr{pcaText} percent of the variability in the original predictors (respectively). %% OPTION: remark on how well (or poorly) the data separated \setkeys{Gin}{width = 0.8\textwidth} \begin{figure}[p] \begin{center} <<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>= trellis.par.set(caretTheme(), warn = TRUE) if(numPCAcomp == 2) { axisRange <- extendrange(pca$$x[, 1:2]) print( xyplot(PC1 ~ PC2, data = as.data.frame(pca$$x), type = c("p", "g"), groups = rawData$$outcome, auto.key = list(columns = 2), xlim = axisRange, ylim = axisRange)) } else { axisRange <- extendrange(pca$$x[, 1:numPCAcomp]) print( splom(~as.data.frame(pca$$x)[, 1:numPCAcomp], type = c("p", "g"), groups = rawData$$outcome, auto.key = list(columns = 2), as.table = TRUE, prepanel.limits = function(x) axisRange )) } @ \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp} principal components for the original data set.} \label{F:inititalPCA} \end{center} \end{figure} <<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>= ## OPTION: in small samples sizes, you may not want to set aside a ## training set and focus on the resampling results. set.seed(1234) dataX <- rawData[,1:length(rawData)-1] dataY <- rawData[,length(rawData)] Smpling <- "$SAAMPLING" if(Smpling=="downsampling") { dwnsmpl <- downSample(dataX,dataY) rawData <- dwnsmpl[,1:length(dwnsmpl)-1] rawData$outcome <- dwnsmpl[,length(dwnsmpl)] remove("dwnsmpl") remove("dataX") remove("dataY") }else if(Smpling=="upsampling"){ upsmpl <- upSample(dataX,dataY) rawData <- upsmpl[,1:length(upsmpl)-1] rawData$outcome <- upsmpl[,length(upsmpl)] remove("upsmpl") remove("dataX") remove("dataY") }else{remove("dataX") remove("dataY") } numSamples <- nrow(rawData) predictorNames <- names(rawData)[names(rawData) != "outcome"] numPredictors <- length(predictorNames) classDist1 <- table(rawData$outcome) classDistString1 <- paste("``", names(classDist1), "'' ($n$=", classDist1, ")", sep = "") classDistString1 <- listString(classDistString1) pctTrain <- $PERCENT if(pctTrain < 1) { ## OPTION: seed number can be changed set.seed(1) inTrain <- createDataPartition(rawData$$outcome, p = pctTrain, list = FALSE) trainX <- rawData[ inTrain, predictorNames] testX <- rawData[-inTrain, predictorNames] trainY <- rawData[ inTrain, "outcome"] testY <- rawData[-inTrain, "outcome"] splitText <- paste("The original data were split into ", "a training set ($$n$$=", nrow(trainX), ") and a test set ($$n$$=", nrow(testX), ") in a manner that preserved the ", "distribution of the classes.", sep = "") isZeroVar <- apply(trainX, 2, function(x) length(unique(x)) < 2) if(any(isZeroVar)) { trainX <- trainX[, !isZeroVar, drop = FALSE] testX <- testX[, !isZeroVar, drop = FALSE] } } else { trainX <- rawData[, predictorNames] testX <- NULL trainY <- rawData[, "outcome"] testY <- NULL splitText <- "The entire data set was used as the training set." } trainDist <- table(trainY) nir <- max(trainDist)/length(trainY)*100 niClass <- names(trainDist)[which.max(trainDist)] nirText <- paste("The non--information rate is the accuracy that can be ", "achieved by predicting all samples using the most ", "dominant class. For these data, the rate is ", round(nir, 2), "\\\\% using the ``", niClass, "'' class.", sep = "") remove("rawData") if((!is.null(testX)) && (!is.null(testY))){ #save(trainX,trainY,testX,testY,file="datasets.RData") } else { save(trainX,trainY,file="datasets.RData") } @ \Sexpr{splitText} \Sexpr{nirText} The data set for model building consisted of \Sexpr{numSamples} samples and \Sexpr{numPredictors} predictor variables. The breakdown of the outcome data classes were: \Sexpr{classDistString1}. <<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>= ## OPTION: other pre-processing steps can be used ppSteps <- caret:::suggestions(modName) set.seed(2) if(ppSteps["nzv"]) { nzv <- nearZeroVar(trainX) if(length(nzv) > 0) { nzvVars <- names(trainX)[nzv] trainX <- trainX[, -nzv] nzvText <- paste("There were ", length(nzv), " predictors that were removed from train set due to", " severely unbalanced distributions that", " could negatively affect the model", ifelse(length(nzv) > 10, ".", paste(": ", listString(nzvVars), ".", sep = "")), sep = "") testX <- testX[, -nzv] } else nzvText <- "" } else nzvText <- "" @ \Sexpr{nzvText} <<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>= if(ppSteps["corr"]) { ## OPTION: corrThresh <- $THRESHHOLDCOR highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"), corrThresh) if(length(highCorr) > 0) { corrVars <- names(trainX)[highCorr] trainX <- trainX[, -highCorr] corrText <- paste("There were ", length(highCorr), " predictors that were removed due to", " large between--predictor correlations that", " could negatively affect the model fit", ifelse(length(highCorr) > 10, ".", paste(": ", listString(highCorr), ".", sep = "")), " Removing these predictors forced", " all pair--wise correlations to be", " less than ", corrThresh, ".", sep = "") testX <- testX[, -highCorr] } else corrText <- "No correlation among data on given threshold" }else corrText <- "" @ \Sexpr{corrText} <<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>= ppMethods <- NULL if(ppSteps["center"]) ppMethods <- c(ppMethods, "center") if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale") if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute") ##OPTION other methods, such as spatial sign, can be added to this list if(length(ppMethods) > 0) { ppInfo <- preProcess(trainX, method = ppMethods) trainX <- predict(ppInfo, trainX) if(pctTrain < 1) testX <- predict(ppInfo, testX) ppText <- paste("The following pre--processing methods were", " applied to the training", ifelse(pctTrain < 1, " and test", ""), " data: ", listString(ppMethods), ".", sep = "") ppText <- gsub("center", "mean centering", ppText) ppText <- gsub("scale", "scaling to unit variance", ppText) ppText <- gsub("knnImpute", paste(ppInfo$$k, "--nearest neighbor imputation", sep = ""), ppText) ppText <- gsub("spatialSign", "the spatial sign transformation", ppText) ppText <- gsub("pca", "principal component feature extraction", ppText) ppText <- gsub("ica", "independent component feature extraction", ppText) } else { ppInfo <- NULL ppText <- "" } predictorNames <- names(trainX) if(nzvText != "" | corrText != "" | ppText != "") { varText <- paste("After pre--processing, ", ncol(trainX), "predictors remained for modeling.") } else varText <- "" @ \Sexpr{ppText} \Sexpr{varText} \clearpage \section*{Model Building} <<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>= numWorkers <- $NUMWORKERS ##OPTION: turn up numWorkers to use MPI if(numWorkers > 1) { mpiCalcs <- function(X, FUN, ...) { theDots <- list(...) parLapply(theDots$$cl, X, FUN) } library(snow) cl <- makeCluster(numWorkers, "MPI") } @ <<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>= ##OPTION: the resampling options can be changed. See ## ?trainControl for details resampName <- "$RESAMPNAME" resampNumber <- $RESAMPLENUMBER numRepeat <- $NUMREPEAT resampP <- $RESAMPLENUMBERPERCENT modelInfo <- modelLookup(modName) if(numClasses == 2) { foo <- if(any(modelInfo$$probModel)) twoClassSummary else twoClassNoProbs } else foo <- defaultSummary set.seed(3) ctlObj <- trainControl(method = resampName, number = resampNumber, repeats = numRepeat, p = resampP, classProbs = any(modelInfo$$probModel), summaryFunction = foo) ##OPTION select other performance metrics as needed optMetric <- if(numClasses == 2 & any(modelInfo$$probModel)) "ROC" else "Kappa" if(numWorkers > 1) { ctlObj$$workers <- numWorkers ctlObj$$computeFunction <- mpiCalcs ctlObj$$computeArgs <- list(cl = cl) } @ <<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>= ##OPTION expand or contract these grids as needed (or ## add more models gridSize <- $SETUPGRIDSIZE if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5 if(modName %in% c("earth", "fda")) gridSize <- 7 if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10 if(modName %in% c("nb")) gridSize <- 2 if(modName %in% c("pam", "rpart")) gridSize <- 15 if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX)) if(modName == "gbm") { tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 , .n.trees = (1:10)*20, .shrinkage = .1) } if(modName == "nnet") { tGrid <- expand.grid(.size = -1 + (1:5)*2 , .decay = c(0, .001, .01, .1)) } if(modName == "ada") { tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1 ) } @ <<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>= ##OPTION alter as needed set.seed(4) modelFit <- switch(modName, gbm = { mix <- sample(seq(along = trainY)) train( trainX[mix,], trainY[mix], modName, verbose = FALSE, bag.fraction = .9, metric = optMetric, trControl = ctlObj, tuneGrid = tGrid) }, multinom = { train( trainX, trainY, modName, trace = FALSE, metric = optMetric, maxiter = 1000, MaxNWts = 5000, trControl = ctlObj, tuneLength = gridSize) }, nnet = { train( trainX, trainY, modName, metric = optMetric, linout = FALSE, trace = FALSE, maxiter = 1000, MaxNWts = 5000, trControl = ctlObj, tuneGrid = tGrid) }, svmRadial =, svmPoly =, svmLinear = { train( trainX, trainY, modName, metric = optMetric, scaled = TRUE, trControl = ctlObj, tuneLength = gridSize) }, { train(trainX, trainY, modName, trControl = ctlObj, metric = optMetric, tuneLength = gridSize) }) @ <<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>= summaryText <- "" resampleName <- switch(tolower(modelFit$$control$$method), boot = paste("the bootstrap (", length(modelFit$$control$$index), " reps)", sep = ""), boot632 = paste("the bootstrap 632 rule (", length(modelFit$$control$$index), " reps)", sep = ""), cv = paste("cross-validation (", modelFit$$control$$number, " fold)", sep = ""), repeatedcv = paste("cross-validation (", modelFit$$control$$number, " fold, repeated ", modelFit$$control$$repeats, " times)", sep = ""), lgocv = paste("repeated train/test splits (", length(modelFit$$control$$index), " reps, ", round(modelFit$$control$$p, 2), "$$\\%$$)", sep = "")) tuneVars <- latexTranslate(tolower(modelInfo$$label)) tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE) if(ncol(modelFit$$bestTune) == 1 && colnames(modelFit$$bestTune) == ".parameter") { summaryText <- paste(summaryText, "\n\n", "There are no tuning parameters associated with this model.", "To characterize the model performance on the training set,", resampleName, "was used.", "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}", "show summaries of the resampling results. ") } else { summaryText <- paste("There", ifelse(nrow(modelInfo) > 1, "are", "is"), nrow(modelInfo), ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"), "associated with this model:", listString(tuneVars, period = TRUE)) paramNames <- gsub(".", "", names(modelFit$$bestTune), fixed = TRUE) ## (i in seq(along = paramNames)) ## { ## check <- modelInfo$$parameter %in% paramNames[i] ## if(any(check)) ## { ## paramNames[i] <- modelInfo$$label[which(check)] ## } ## } paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE) ## Check to see if there was only one combination fit summaryText <- paste(summaryText, "To choose", ifelse(nrow(modelInfo) > 1, "appropriate values of the tuning parameters,", "an appropriate value of the tuning parameter,"), resampleName, "was used to generated a profile of performance across the", nrow(modelFit$$results), ifelse(nrow(modelInfo) > 1, "combinations of the tuning parameters.", "candidate values."), "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show", "summaries of the resampling profile. ", "The final model fitted to the entire training set was:", listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$$bestTune[1,]), period = TRUE)) } @ \Sexpr{summaryText} <<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>= tableData <- modelFit$$results if(all(modelInfo$$parameter == "parameter") && resampName == "boot632") { tableData <- tableData[,-1, drop = FALSE] colNums <- c( length(modelFit$$perfNames), length(modelFit$$perfNames), length(modelFit$$perfNames)) colLabels <- c("Mean", "Standard Deviation","Apparant") constString <- "" isConst <- NULL } else if (all(modelInfo$$parameter == "parameter") && (resampName == "boot" | resampName == "cv" | resampName == "repeatedcv" )){ tableData <- tableData[,-1, drop = FALSE] colNums <- c(length(modelFit$$perfNames), length(modelFit$$perfNames)) colLabels <- c("Mean", "Standard Deviation") constString <- "" isConst <- NULL } else if (all(modelInfo$$parameter == "parameter") && resampName == "LOOCV" ){ tableData <- tableData[,-1, drop = FALSE] colNums <- length(modelFit$$perfNames) colLabels <- c("Measures") constString <- "" isConst <- NULL } else { if (all(modelInfo$$parameter != "parameter") && resampName == "boot632" ){ isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE], 2, function(x) length(unique(x)) == 1) numParamInTable <- sum(!isConst) if(any(isConst)) { constParam <- modelInfo$$parameter[isConst] constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE] tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE] constString <- paste("The tuning", ifelse(sum(isConst) > 1, "parmeters", "parameter"), listString(paste("``", names(constValues), "''", sep = "")), ifelse(sum(isConst) > 1, "were", "was"), "held constant at", ifelse(sum(isConst) > 1, "a value of", "values of"), listString(constValues[1,])) } else constString <- "" cn <- colnames(tableData) ## for(i in seq(along = cn)) ## { ## check <- modelInfo$$parameter %in% cn[i] ## if(any(check)) ## { ## cn[i] <- modelInfo$$label[which(check)] ## } ## } ## colnames(tableData) <- cn colNums <- c(numParamInTable, length(modelFit$$perfNames), length(modelFit$$perfNames), length(modelFit$$perfNames)) colLabels <- c("", "Mean", "Standard Deviation", "Apparant") }else if (all(modelInfo$$parameter != "parameter") && (resampName == "boot" | resampName == "repeatedcv" | resampName == "cv") ){ isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE], 2, function(x) length(unique(x)) == 1) numParamInTable <- sum(!isConst) if(any(isConst)) { constParam <- modelInfo$$parameter[isConst] constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE] tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE] constString <- paste("The tuning", ifelse(sum(isConst) > 1, "parmeters", "parameter"), listString(paste("``", names(constValues), "''", sep = "")), ifelse(sum(isConst) > 1, "were", "was"), "held constant at", ifelse(sum(isConst) > 1, "a value of", "values of"), listString(constValues[1,])) } else constString <- "" cn <- colnames(tableData) ## for(i in seq(along = cn)) ## { ## check <- modelInfo$$parameter %in% cn[i] ## if(any(check)) ## { ## cn[i] <- modelInfo$$label[which(check)] ## } ## } ## colnames(tableData) <- cn colNums <- c(numParamInTable, length(modelFit$$perfNames), length(modelFit$$perfNames)) colLabels <- c("", "Mean", "Standard Deviation") } else if (all(modelInfo$$parameter != "parameter") && resampName == "LOOCV"){ isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE], 2, function(x) length(unique(x)) == 1) numParamInTable <- sum(!isConst) if(any(isConst)) { constParam <- modelInfo$$parameter[isConst] constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE] tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE] constString <- paste("The tuning", ifelse(sum(isConst) > 1, "parmeters", "parameter"), listString(paste("``", names(constValues), "''", sep = "")), ifelse(sum(isConst) > 1, "were", "was"), "held constant at", ifelse(sum(isConst) > 1, "a value of", "values of"), listString(constValues[1,])) } else constString <- "" cn <- colnames(tableData) ## for(i in seq(along = cn)) ## { ## check <- modelInfo$$parameter %in% cn[i] ## if(any(check)) ## { ## cn[i] <- modelInfo$$label[which(check)] ## } ## } ## colnames(tableData) <- cn colNums <- c(numParamInTable, length(modelFit$$perfNames)) colLabels <- c("", "Measures") } } colnames(tableData) <- gsub("SD$$", "", colnames(tableData)) colnames(tableData) <- gsub("Apparent$$", "", colnames(tableData)) colnames(tableData) <- latexTranslate(colnames(tableData)) rownames(tableData) <- latexTranslate(rownames(tableData)) latex(tableData, rowname = NULL, file = "", cgroup = colLabels, n.cgroup = colNums, where = "h!", digits = 4, longtable = nrow(tableData) > 30, caption = paste(resampleName, "results from the model fit.", constString), label = "T:resamps") @ \setkeys{Gin}{ width = 0.9\textwidth} \begin{figure}[b] \begin{center} <<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>= trellis.par.set(caretTheme(), warn = TRUE) if(all(modelInfo$$parameter == "parameter") | all(isConst) | modName == "nb") { resultsPlot <- resampleHist(modelFit) plotCaption <- paste("Distributions of model performance from the ", "training set estimated using ", resampleName) } else { if(modName %in% c("svmPoly", "svmRadial", "svmLinear")) { resultsPlot <- plot(modelFit, metric = optMetric, xTrans = function(x) log10(x)) resultsPlot <- update(resultsPlot, type = c("g", "p", "l"), ylab = paste(optMetric, " (", resampleName, ")", sep = "")) } else { resultsPlot <- plot(modelFit, metric = optMetric) resultsPlot <- update(resultsPlot, type = c("g", "p", "l"), ylab = paste(optMetric, " (", resampleName, ")", sep = "")) } plotCaption <- paste("A plot of the estimates of the", optMetric, "values calculated using", resampleName) } print(resultsPlot) @ \caption[Performance Plot]{\Sexpr{plotCaption}.} \label{F:profile} \end{center} \end{figure} <<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>= if(numWorkers > 1) stopCluster(cl) @ <<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>= if((!is.null(testX)) && (!is.null(testY))){ save(trainX,trainY,testX,testY,file="datasets.RData") } else { save(trainX,trainY,file="datasets.RData") } if(pctTrain < 1) { cat("\\clearpage\n\\section*{Test Set Results}\n\n") classPred <- predict(modelFit, testX) cm <- confusionMatrix(classPred, testY) values <- cm$$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")] values <- values[!is.na(values) & !is.nan(values)] values <- c(format(values[1:2], digits = 3), format.pval(values[-(1:2)], digits = 5)) nms <- c("the overall accuracy", "the Kappa statistic", "the $$p$$--value that accuracy is greater than the no--information rate", "the $$p$$--value of concordance from McNemar's test") nms <- nms[seq(along = values)] names(values) <- nms if(any(modelInfo$$probModel)) { classProbs <- extractProb(list(fit = modelFit), testX = testX, testY = testY) classProbs <- subset(classProbs, dataType == "Test") if(numClasses == 2) { tmp <- twoClassSummary(classProbs, lev = levels(classProbs$$obs)) tmp <- c(format(tmp, digits = 3)) names(tmp) <- c("the area under the ROC curve", "the sensitivity", "the specificity") values <- c(values, tmp) } probPlot <- plotClassProbs(classProbs) } testString <- paste("Based on the test set of", nrow(testX), "samples,", listString(paste(names(values), "was", values), period = TRUE), "The confusion matrix for the test set is shown in Table", "\\\\ref{T:cm}.") testString <- paste(testString, " Using ", resampleName, ", the training set estimates were ", resampleStats(modelFit), ".", sep = "") if(any(modelInfo$$probModel)) testString <- paste(testString, "Histograms of the class probabilities", "for the test set samples are shown in", "Figure \\\\ref{F:probs}", ifelse(numClasses == 2, " and the test set ROC curve is in Figure \\\\ref{F:roc}.", ".")) latex(cm$$table, title = "", file = "", where = "h", cgroup = "Observed Values", n.cgroup = numClasses, caption = "The confusion matrix for the test set", label = "T:cm") } else testString <- "" @ \Sexpr{testString} <<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>= if(any(modelInfo$probModel) && pctTrain < 1 ) { cat( paste("\\begin{figure}[p]\n", "\\begin{center}\n", "\\includegraphics{classProbs}", "\\caption[PCA Plot]{Class probabilities", "for the test set. Each panel contains ", "separate classes}\n", "\\label{F:probs}\n", "\\end{center}\n", "\\end{figure}")) } if(any(modelInfo$$probModel) & numClasses == 2 & pctTrain < 1 ) { cat( paste("\\begin{figure}[p]\n", "\\begin{center}\n", "\\includegraphics[clip, width = .8\\textwidth]{roc}", "\\caption[ROC Plot]{ROC Curve", "for the test set.}\n", "\\label{F:roc}\n", "\\end{center}\n", "\\end{figure}")) } else { cat (paste("")) } @ <<classProbsTex, results = $CLASSPROBSTEXRESULT1, echo = $CLASSPROBSTEXECHO1 >>= if(any(modelInfo$probModel) && pctTrain < 1) { pdf("classProbs.pdf", height = 7, width = 7) trellis.par.set(caretTheme(), warn = FALSE) print(probPlot) dev.off() } if(any(modelInfo$probModel) & numClasses == 2 & pctTrain < 1) { resPonse<-testY preDictor<-classProbs[, levels(trainY)[1]] pdf("roc.pdf", height = 8, width = 8) # from pROC example at http://web.expasy.org/pROC/screenshots.htm plot.roc(resPonse, preDictor, # data percent=TRUE, # show all values in percent partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC) print.auc=TRUE, #display pAUC value on the plot with following options: print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6", auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon main="Partial AUC (pAUC)") plot.roc(resPonse, preDictor, percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself (useless) partial.auc=c(100, 90), partial.auc.correct=TRUE, partial.auc.focus="se", # focus pAUC on the sensitivity print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600", print.auc.y=40, # do not print auc over the previous one auc.polygon=TRUE, auc.polygon.col="#008600", max.auc.polygon=TRUE, max.auc.polygon.col="#00860022") dev.off() } else { cat("") } @ \section*{Versions} <<versions, echo = FALSE, results = tex>>= toLatex(sessionInfo()) @ <<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>= ## change this to the name of modName.... Fit <- modelFit if(exists('ppInfo') && !is.null(ppInfo)){ save(Fit,ppInfo,cm,file="$METHOD-Fit.RData") } else {save(Fit,cm,file="$METHOD-Fit.RData")} @ The model was built using $METHOD and is saved as $METHOD Model for reuse. This contains the variable Fit. \end{document}''' return template4Rnw