comparison templateLibrary.py @ 0:2cb81da69c73 draft

planemo upload commit a1f4dd8eb560c649391ada1a6bb9505893a35272
author anmoljh
date Thu, 31 May 2018 11:58:59 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2cb81da69c73
1 def __template4Rnw():
2
3 template4Rnw = r'''%% Classification Modeling Script
4 %% Max Kuhn (max.kuhn@pfizer.com, mxkuhn@gmail.com)
5 %% Version: 1.00
6 %% Created on: 2010/10/02
7 %%
8 %% The originla file hs been improved by
9 %% Deepak Bharti, Andrew M. Lynn , Anmol J. Hemrom
10 %% Version : 1.01
11 %% created on : 2014/08/12
12 %% This is an Sweave template for building and describing
13 %% classification models. It mixes R and LaTeX code. The document can
14 %% be processing using R's Sweave function to produce a tex file.
15 %%
16 %% The inputs are:
17 %% - the initial data set in a data frame called 'rawData'
18 %% - a factor column in the data set called 'class'. this should be the
19 %% outcome variable
20 %% - all other columns in rawData should be predictor variables
21 %% - the type of model should be in a variable called 'modName'.
22 %%
23 %% The script attempts to make some intelligent choices based on the
24 %% model being used. For example, if modName is "pls", the script will
25 %% automatically center and scale the predictor data. There are
26 %% situations where these choices can (and should be) changed.
27 %%
28 %% There are other options that may make sense to change. For example,
29 %% the user may want to adjust the type of resampling. To find these
30 %% parts of the script, search on the string 'OPTION'. These parts of
31 %% the code will document the options.
32
33 \documentclass[14pt]{report}
34 \usepackage{amsmath}
35 \usepackage[pdftex]{graphicx}
36 \usepackage{color}
37 \usepackage{ctable}
38 \usepackage{xspace}
39 \usepackage{fancyvrb}
40 \usepackage{fancyhdr}
41 \usepackage{lastpage}
42 \usepackage{longtable}
43 \usepackage{algorithm2e}
44 \usepackage[
45 colorlinks=true,
46 linkcolor=blue,
47 citecolor=blue,
48 urlcolor=blue]
49 {hyperref}
50 \usepackage{lscape}
51 \usepackage{Sweave}
52 \SweaveOpts{keep.source = TRUE}
53
54 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
55
56 % define new colors for use
57 \definecolor{darkgreen}{rgb}{0,0.6,0}
58 \definecolor{darkred}{rgb}{0.6,0.0,0}
59 \definecolor{lightbrown}{rgb}{1,0.9,0.8}
60 \definecolor{brown}{rgb}{0.6,0.3,0.3}
61 \definecolor{darkblue}{rgb}{0,0,0.8}
62 \definecolor{darkmagenta}{rgb}{0.5,0,0.5}
63
64 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
65
66 \newcommand{\bld}[1]{\mbox{\boldmath $$#1$$}}
67 \newcommand{\shell}[1]{\mbox{$$#1$$}}
68 \renewcommand{\vec}[1]{\mbox{\bf {#1}}}
69
70 \newcommand{\ReallySmallSpacing}{\renewcommand{\baselinestretch}{.6}\Large\normalsize}
71 \newcommand{\SmallSpacing}{\renewcommand{\baselinestretch}{1.1}\Large\normalsize}
72
73 \newcommand{\halfs}{\frac{1}{2}}
74
75 \setlength{\oddsidemargin}{-.25 truein}
76 \setlength{\evensidemargin}{0truein}
77 \setlength{\topmargin}{-0.2truein}
78 \setlength{\textwidth}{7 truein}
79 \setlength{\textheight}{8.5 truein}
80 \setlength{\parindent}{0.20truein}
81 \setlength{\parskip}{0.10truein}
82
83 \setcounter{LTchunksize}{50}
84
85 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
86 \pagestyle{fancy}
87 \lhead{}
88 %% OPTION Report header name
89 \chead{Classification Model Script}
90 \rhead{}
91 \lfoot{}
92 \cfoot{}
93 \rfoot{\thepage\ of \pageref{LastPage}}
94 \renewcommand{\headrulewidth}{1pt}
95 \renewcommand{\footrulewidth}{1pt}
96 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
97
98 %% OPTION Report title and modeler name
99 \title{Classification Model Script using $METHOD}
100 \author{"Lynn Group with M. Kuhn, SCIS, JNU, New Delhi"}
101
102 \begin{document}
103
104 \maketitle
105
106 \thispagestyle{empty}
107 <<dummy, eval=TRUE, echo=FALSE, results=hide>>=
108 # sets values for variables used later in the program to prevent the \Sexpr error on parsing with Sweave
109 numSamples=''
110 classDistString=''
111 missingText=''
112 numPredictors=''
113 numPCAcomp=''
114 pcaText=''
115 nzvText=''
116 corrText=''
117 ppText=''
118 varText=''
119 splitText="Dummy Text"
120 nirText="Dummy Text"
121 # pctTrain is a variable that is initialised in Data splitting, and reused later in testPred
122 pctTrain=0.8
123 Smpling=''
124 nzvText1=''
125 classDistString1=''
126 dwnsmpl=''
127 upsmpl=''
128
129 @
130 <<startup, eval= TRUE, results = hide, echo = FALSE>>=
131 library(Hmisc)
132 library(caret)
133 library(pROC)
134 versionTest <- compareVersion(packageDescription("caret")$$Version,
135 "4.65")
136 if(versionTest < 0) stop("caret version 4.65 or later is required")
137
138 library(RColorBrewer)
139
140
141 listString <- function (x, period = FALSE, verbose = FALSE)
142 {
143 if (verbose) cat("\n entering listString\n")
144 flush.console()
145 if (!is.character(x))
146 x <- as.character(x)
147 numElements <- length(x)
148 out <- if (length(x) > 0) {
149 switch(min(numElements, 3), x, paste(x, collapse = " and "),
150 {
151 x <- paste(x, c(rep(",", numElements - 2), " and", ""), sep = "")
152 paste(x, collapse = " ")
153 })
154 }
155 else ""
156 if (period) out <- paste(out, ".", sep = "")
157 if (verbose) cat(" leaving listString\n\n")
158 flush.console()
159 out
160 }
161
162 resampleStats <- function(x, digits = 3)
163 {
164 bestPerf <- x$$bestTune
165 colnames(bestPerf) <- gsub("^\\.", "", colnames(bestPerf))
166 out <- merge(x$$results, bestPerf)
167 out <- out[, colnames(out) %in% x$$perfNames]
168 names(out) <- gsub("ROC", "area under the ROC curve", names(out), fixed = TRUE)
169 names(out) <- gsub("Sens", "sensitivity", names(out), fixed = TRUE)
170 names(out) <- gsub("Spec", "specificity", names(out), fixed = TRUE)
171 names(out) <- gsub("Accuracy", "overall accuracy", names(out), fixed = TRUE)
172 names(out) <- gsub("Kappa", "Kappa statistics", names(out), fixed = TRUE)
173
174 out <- format(out, digits = digits)
175 listString(paste(names(out), "was", out))
176 }
177
178 twoClassNoProbs <- function (data, lev = NULL, model = NULL)
179 {
180 out <- c(sensitivity(data[, "pred"], data[, "obs"], lev[1]),
181 specificity(data[, "pred"], data[, "obs"], lev[2]),
182 confusionMatrix(data[, "pred"], data[, "obs"])$$overall["Kappa"])
183
184 names(out) <- c("Sens", "Spec", "Kappa")
185 out
186 }
187
188
189
190 ##OPTION: model name: see ?train for more values/models
191 modName <- "$METHOD"
192
193
194 load("$RDATA")
195 rawData <- dataX
196 rawData$$outcome <- dataY
197
198 @
199
200
201 \section*{Data Sets}\label{S:data}
202
203 %% OPTION: provide some background on the problem, the experimental
204 %% data, how the compounds were selected etc
205
206 <<getDataInfo, eval = $GETDATAINFOEVAL, echo = $GETDATAINFOECHO, results = $GETDATAINFORESULT>>=
207 if(!any(names(rawData) == "outcome")) stop("a variable called outcome should be in the data set")
208 if(!is.factor(rawData$$outcome)) stop("the outcome should be a factor vector")
209
210 ## OPTION: when there are only two classes, the first level of the
211 ## factor is used as the "positive" or "event" for calculating
212 ## sensitivity and specificity. Adjust the outcome factor accordingly.
213 numClasses <- length(levels(rawData$$outcome))
214 numSamples <- nrow(rawData)
215 numPredictors <- ncol(rawData) - 1
216 predictorNames <- names(rawData)[names(rawData) != "outcome"]
217
218 isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
219 if(any(!isNum)) stop("all predictors in rawData should be numeric")
220
221 classTextCheck <- all.equal(levels(rawData$$outcome), make.names(levels(rawData$$outcome)))
222 if(!classTextCheck) warning("the class levels are not valid R variable names; this may cause errors")
223
224 ## Get the class distribution
225 classDist <- table(rawData$$outcome)
226 classDistString <- paste("``",
227 names(classDist),
228 "'' ($$n$$=",
229 classDist,
230 ")",
231 sep = "")
232 classDistString <- listString(classDistString)
233 @
234
235 <<missingFilter, eval = $MISSINGFILTEREVAL, echo = $MISSINGFILTERECHO, results = $MISSINGFILTERRESULT>>=
236 colRate <- apply(rawData[, predictorNames, drop = FALSE],
237 2, function(x) mean(is.na(x)))
238
239 ##OPTION thresholds can be changed
240 colExclude <- colRate > $MISSINGFILTERTHRESHC
241
242 missingText <- ""
243
244 if(any(colExclude))
245 {
246 missingText <- paste(missingText,
247 ifelse(sum(colExclude) > 1,
248 " There were ",
249 " There was "),
250 sum(colExclude),
251 ifelse(sum(colExclude) > 1,
252 " predictors ",
253 " predictor "),
254 "with an excessive number of ",
255 "missing data. ",
256 ifelse(sum(colExclude) > 1,
257 " These were excluded. ",
258 " This was excluded. "))
259 predictorNames <- predictorNames[!colExclude]
260 rawData <- rawData[, names(rawData) %in% c("outcome", predictorNames), drop = FALSE]
261 }
262
263
264 rowRate <- apply(rawData[, predictorNames, drop = FALSE],
265 1, function(x) mean(is.na(x)))
266
267 rowExclude <- rowRate > $MISSINGFILTERTHRESHR
268
269
270 if(any(rowExclude)) {
271 missingText <- paste(missingText,
272 ifelse(sum(rowExclude) > 1,
273 " There were ",
274 " There was "),
275 sum(colExclude),
276 ifelse(sum(rowExclude) > 1,
277 " samples ",
278 " sample "),
279 "with an excessive number of ",
280 "missing data. ",
281 ifelse(sum(rowExclude) > 1,
282 " These were excluded. ",
283 " This was excluded. "),
284 "After filtering, ",
285 sum(!rowExclude),
286 " samples remained.")
287 rawData <- rawData[!rowExclude, ]
288 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
289 1, function(x) mean(is.na(x)))
290 } else {
291 hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
292 1, function(x) any(is.na(x)))
293 missingText <- paste(missingText,
294 ifelse(missingText == "",
295 "There ",
296 "Subsequently, there "),
297 ifelse(sum(hasMissing) == 1,
298 "was ",
299 "were "),
300 ifelse(sum(hasMissing) > 0,
301 sum(hasMissing),
302 "no"),
303 ifelse(sum(hasMissing) == 1,
304 "sample ",
305 "samples "),
306 "with missing values.")
307
308 rawData <- rawData[complete.cases(rawData),]
309
310 }
311
312 rawData1 <- rawData[,1:length(rawData)-1]
313 rawData2 <- rawData[,length(rawData)]
314
315 set.seed(222)
316 nzv1 <- nearZeroVar(rawData1)
317 if(length(nzv1) > 0)
318 {
319 nzvVars1 <- names(rawData1)[nzv1]
320 rawData <- rawData1[, -nzv1]
321 rawData$outcome <- rawData2
322 nzvText1 <- paste("There were ",
323 length(nzv1),
324 " predictors that were removed from original data due to",
325 " severely unbalanced distributions that",
326 " could negatively affect the model fit",
327 ifelse(length(nzv1) > 10,
328 ".",
329 paste(": ",
330 listString(nzvVars1),
331 ".",
332 sep = "")),
333 sep = "")
334
335 } else {
336 rawData <- rawData1
337 rawData$outcome <- rawData2
338 nzvText1 <- ""
339
340 }
341
342 remove("rawData1")
343 remove("rawData2")
344
345 @
346
347 The initial data set consisted of \Sexpr{numSamples} samples and
348 \Sexpr{numPredictors} predictor variables. The breakdown of the
349 outcome data classes were: \Sexpr{classDistString}.
350
351 \Sexpr{missingText}
352
353 \Sexpr{nzvText1}
354
355 <<pca, eval= $PCAEVAL, echo = $PCAECHO, results = $PCARESULT>>=
356
357 predictorNames <- names(rawData)[names(rawData) != "outcome"]
358 numPredictors <- length(predictorNames)
359 predictors <- rawData[, predictorNames, drop = FALSE]
360 ## PCA will fail with predictors having less than 2 unique values
361 isZeroVar <- apply(predictors, 2,
362 function(x) length(unique(x)) < 2)
363 if(any(isZeroVar)) predictors <- predictors[, !isZeroVar, drop = FALSE]
364 ## For whatever, only the formula interface to prcomp
365 ## handles missing values
366 pcaForm <- as.formula(
367 paste("~",
368 paste(names(predictors), collapse = "+")))
369 pca <- prcomp(pcaForm,
370 data = predictors,
371 center = TRUE,
372 scale. = TRUE,
373 na.action = na.omit)
374 ## OPTION: the number of components plotted/discussed can be set
375 numPCAcomp <- $PCACOMP
376 pctVar <- pca$$sdev^2/sum(pca$$sdev^2)*100
377 pcaText <- paste(round(pctVar[1:numPCAcomp], 1),
378 "\\\\%",
379 sep = "")
380 pcaText <- listString(pcaText)
381 @
382
383 To get an initial assessment of the separability of the classes,
384 principal component analysis (PCA) was used to distill the
385 \Sexpr{numPredictors} predictors down into \Sexpr{numPCAcomp}
386 surrogate variables (i.e. the principal components) in a manner that
387 attempts to maximize the amount of information preserved from the
388 original predictor set. Figure \ref{F:inititalPCA} contains plots of
389 the first \Sexpr{numPCAcomp} components, which accounted for
390 \Sexpr{pcaText} percent of the variability in the original predictors
391 (respectively).
392
393
394 %% OPTION: remark on how well (or poorly) the data separated
395
396 \setkeys{Gin}{width = 0.8\textwidth}
397 \begin{figure}[p]
398 \begin{center}
399
400 <<pcaPlot, eval = $PCAPLOTEVAL, echo = $PCAPLOTECHO, results = $PCAPLOTRESULT, fig = $PCAPLOTFIG, width = 8, height = 8>>=
401 trellis.par.set(caretTheme(), warn = TRUE)
402 if(numPCAcomp == 2)
403 {
404 axisRange <- extendrange(pca$$x[, 1:2])
405 print(
406 xyplot(PC1 ~ PC2,
407 data = as.data.frame(pca$$x),
408 type = c("p", "g"),
409 groups = rawData$$outcome,
410 auto.key = list(columns = 2),
411 xlim = axisRange,
412 ylim = axisRange))
413 } else {
414 axisRange <- extendrange(pca$$x[, 1:numPCAcomp])
415 print(
416 splom(~as.data.frame(pca$$x)[, 1:numPCAcomp],
417 type = c("p", "g"),
418 groups = rawData$$outcome,
419 auto.key = list(columns = 2),
420 as.table = TRUE,
421 prepanel.limits = function(x) axisRange
422 ))
423
424 }
425
426 @
427
428 \caption[PCA Plot]{A plot of the first \Sexpr{numPCAcomp}
429 principal components for the original data set.}
430 \label{F:inititalPCA}
431 \end{center}
432 \end{figure}
433
434
435
436 <<initialDataSplit, eval = $INITIALDATASPLITEVAL, echo = $INITIALDATASPLITECHO, results = $INITIALDATASPLITRESULT>>=
437
438 ## OPTION: in small samples sizes, you may not want to set aside a
439 ## training set and focus on the resampling results.
440
441 set.seed(1234)
442 dataX <- rawData[,1:length(rawData)-1]
443 dataY <- rawData[,length(rawData)]
444
445 Smpling <- "$SAAMPLING"
446
447 if(Smpling=="downsampling")
448 {
449 dwnsmpl <- downSample(dataX,dataY)
450 rawData <- dwnsmpl[,1:length(dwnsmpl)-1]
451 rawData$outcome <- dwnsmpl[,length(dwnsmpl)]
452 remove("dwnsmpl")
453 remove("dataX")
454 remove("dataY")
455 }else if(Smpling=="upsampling"){
456 upsmpl <- upSample(dataX,dataY)
457 rawData <- upsmpl[,1:length(upsmpl)-1]
458 rawData$outcome <- upsmpl[,length(upsmpl)]
459 remove("upsmpl")
460 remove("dataX")
461 remove("dataY")
462 }else{remove("dataX")
463 remove("dataY")
464 }
465
466
467
468 numSamples <- nrow(rawData)
469
470 predictorNames <- names(rawData)[names(rawData) != "outcome"]
471 numPredictors <- length(predictorNames)
472
473
474 classDist1 <- table(rawData$outcome)
475 classDistString1 <- paste("``",
476 names(classDist1),
477 "'' ($n$=",
478 classDist1,
479 ")",
480 sep = "")
481 classDistString1 <- listString(classDistString1)
482
483 pctTrain <- $PERCENT
484
485 if(pctTrain < 1)
486 {
487 ## OPTION: seed number can be changed
488 set.seed(1)
489 inTrain <- createDataPartition(rawData$$outcome,
490 p = pctTrain,
491 list = FALSE)
492 trainX <- rawData[ inTrain, predictorNames]
493 testX <- rawData[-inTrain, predictorNames]
494 trainY <- rawData[ inTrain, "outcome"]
495 testY <- rawData[-inTrain, "outcome"]
496 splitText <- paste("The original data were split into ",
497 "a training set ($$n$$=",
498 nrow(trainX),
499 ") and a test set ($$n$$=",
500 nrow(testX),
501 ") in a manner that preserved the ",
502 "distribution of the classes.",
503 sep = "")
504 isZeroVar <- apply(trainX, 2,
505 function(x) length(unique(x)) < 2)
506 if(any(isZeroVar))
507 {
508 trainX <- trainX[, !isZeroVar, drop = FALSE]
509 testX <- testX[, !isZeroVar, drop = FALSE]
510 }
511
512 } else {
513 trainX <- rawData[, predictorNames]
514 testX <- NULL
515 trainY <- rawData[, "outcome"]
516 testY <- NULL
517 splitText <- "The entire data set was used as the training set."
518 }
519 trainDist <- table(trainY)
520 nir <- max(trainDist)/length(trainY)*100
521 niClass <- names(trainDist)[which.max(trainDist)]
522 nirText <- paste("The non--information rate is the accuracy that can be ",
523 "achieved by predicting all samples using the most ",
524 "dominant class. For these data, the rate is ",
525 round(nir, 2), "\\\\% using the ``",
526 niClass,
527 "'' class.",
528 sep = "")
529
530 remove("rawData")
531
532 if((!is.null(testX)) && (!is.null(testY))){
533 #save(trainX,trainY,testX,testY,file="datasets.RData")
534 } else {
535 save(trainX,trainY,file="datasets.RData")
536 }
537
538 @
539
540 \Sexpr{splitText}
541
542 \Sexpr{nirText}
543
544 The data set for model building consisted of \Sexpr{numSamples} samples and
545 \Sexpr{numPredictors} predictor variables. The breakdown of the
546 outcome data classes were: \Sexpr{classDistString1}.
547
548 <<nzv, eval= $NZVEVAL, results = $NZVRESULT, echo = $NZVECHO>>=
549 ## OPTION: other pre-processing steps can be used
550 ppSteps <- caret:::suggestions(modName)
551
552 set.seed(2)
553 if(ppSteps["nzv"])
554 {
555 nzv <- nearZeroVar(trainX)
556 if(length(nzv) > 0)
557 {
558 nzvVars <- names(trainX)[nzv]
559 trainX <- trainX[, -nzv]
560 nzvText <- paste("There were ",
561 length(nzv),
562 " predictors that were removed from train set due to",
563 " severely unbalanced distributions that",
564 " could negatively affect the model",
565 ifelse(length(nzv) > 10,
566 ".",
567 paste(": ",
568 listString(nzvVars),
569 ".",
570 sep = "")),
571 sep = "")
572 testX <- testX[, -nzv]
573 } else nzvText <- ""
574 } else nzvText <- ""
575 @
576
577 \Sexpr{nzvText}
578
579
580 <<corrFilter, eval = $CORRFILTEREVAL, results = $CORRFILTERRESULT, echo = $CORRFILTERECHO>>=
581 if(ppSteps["corr"])
582 {
583 ## OPTION:
584 corrThresh <- $THRESHHOLDCOR
585 highCorr <- findCorrelation(cor(trainX, use = "pairwise.complete.obs"),
586 corrThresh)
587 if(length(highCorr) > 0)
588 {
589 corrVars <- names(trainX)[highCorr]
590 trainX <- trainX[, -highCorr]
591 corrText <- paste("There were ",
592 length(highCorr),
593 " predictors that were removed due to",
594 " large between--predictor correlations that",
595 " could negatively affect the model fit",
596 ifelse(length(highCorr) > 10,
597 ".",
598 paste(": ",
599 listString(highCorr),
600 ".",
601 sep = "")),
602 " Removing these predictors forced",
603 " all pair--wise correlations to be",
604 " less than ",
605 corrThresh,
606 ".",
607 sep = "")
608 testX <- testX[, -highCorr]
609 } else corrText <- "No correlation among data on given threshold"
610 }else corrText <- ""
611 @
612
613 \Sexpr{corrText}
614
615 <<preProc, eval = $PREPROCEVAL, echo = $PREPROCECHO, results = $PREPROCRESULT>>=
616 ppMethods <- NULL
617 if(ppSteps["center"]) ppMethods <- c(ppMethods, "center")
618 if(ppSteps["scale"]) ppMethods <- c(ppMethods, "scale")
619 if(any(hasMissing) > 0) ppMethods <- c(ppMethods, "knnImpute")
620 ##OPTION other methods, such as spatial sign, can be added to this list
621
622 if(length(ppMethods) > 0)
623 {
624 ppInfo <- preProcess(trainX, method = ppMethods)
625 trainX <- predict(ppInfo, trainX)
626 if(pctTrain < 1) testX <- predict(ppInfo, testX)
627 ppText <- paste("The following pre--processing methods were",
628 " applied to the training",
629 ifelse(pctTrain < 1, " and test", ""),
630 " data: ",
631 listString(ppMethods),
632 ".",
633 sep = "")
634 ppText <- gsub("center", "mean centering", ppText)
635 ppText <- gsub("scale", "scaling to unit variance", ppText)
636 ppText <- gsub("knnImpute",
637 paste(ppInfo$$k, "--nearest neighbor imputation", sep = ""),
638 ppText)
639 ppText <- gsub("spatialSign", "the spatial sign transformation", ppText)
640 ppText <- gsub("pca", "principal component feature extraction", ppText)
641 ppText <- gsub("ica", "independent component feature extraction", ppText)
642 } else {
643 ppInfo <- NULL
644 ppText <- ""
645 }
646
647 predictorNames <- names(trainX)
648 if(nzvText != "" | corrText != "" | ppText != "")
649 {
650 varText <- paste("After pre--processing, ",
651 ncol(trainX),
652 "predictors remained for modeling.")
653 } else varText <- ""
654
655 @
656
657 \Sexpr{ppText}
658 \Sexpr{varText}
659
660 \clearpage
661
662 \section*{Model Building}
663
664 <<setupWorkers, eval = TRUE, echo = $SETUPWORKERSECHO, results = $SETUPWORKERSRESULT>>=
665 numWorkers <- $NUMWORKERS
666 ##OPTION: turn up numWorkers to use MPI
667 if(numWorkers > 1)
668 {
669 mpiCalcs <- function(X, FUN, ...)
670 {
671 theDots <- list(...)
672 parLapply(theDots$$cl, X, FUN)
673 }
674
675 library(snow)
676 cl <- makeCluster(numWorkers, "MPI")
677 }
678 @
679
680 <<setupResampling, echo = $SETUPRESAMPLINGECHO, results = $SETUPRESAMPLINGRESULT>>=
681 ##OPTION: the resampling options can be changed. See
682 ## ?trainControl for details
683
684 resampName <- "$RESAMPNAME"
685 resampNumber <- $RESAMPLENUMBER
686 numRepeat <- $NUMREPEAT
687 resampP <- $RESAMPLENUMBERPERCENT
688
689 modelInfo <- modelLookup(modName)
690
691 if(numClasses == 2)
692 {
693 foo <- if(any(modelInfo$$probModel)) twoClassSummary else twoClassNoProbs
694 } else foo <- defaultSummary
695
696 set.seed(3)
697 ctlObj <- trainControl(method = resampName,
698 number = resampNumber,
699 repeats = numRepeat,
700 p = resampP,
701 classProbs = any(modelInfo$$probModel),
702 summaryFunction = foo)
703
704
705 ##OPTION select other performance metrics as needed
706 optMetric <- if(numClasses == 2 & any(modelInfo$$probModel)) "ROC" else "Kappa"
707
708 if(numWorkers > 1)
709 {
710 ctlObj$$workers <- numWorkers
711 ctlObj$$computeFunction <- mpiCalcs
712 ctlObj$$computeArgs <- list(cl = cl)
713 }
714 @
715
716 <<setupGrid, results = $SETUPGRIDRESULT, echo = $SETUPGRIDECHO>>=
717 ##OPTION expand or contract these grids as needed (or
718 ## add more models
719
720 gridSize <- $SETUPGRIDSIZE
721
722 if(modName %in% c("svmPoly", "svmRadial", "svmLinear", "lvq", "ctree2", "ctree")) gridSize <- 5
723 if(modName %in% c("earth", "fda")) gridSize <- 7
724 if(modName %in% c("knn", "rocc", "glmboost", "rf", "nodeHarvest")) gridSize <- 10
725
726 if(modName %in% c("nb")) gridSize <- 2
727 if(modName %in% c("pam", "rpart")) gridSize <- 15
728 if(modName %in% c("pls")) gridSize <- min(20, ncol(trainX))
729
730 if(modName == "gbm")
731 {
732 tGrid <- expand.grid(.interaction.depth = -1 + (1:5)*2 ,
733 .n.trees = (1:10)*20,
734 .shrinkage = .1)
735 }
736
737 if(modName == "nnet")
738 {
739 tGrid <- expand.grid(.size = -1 + (1:5)*2 ,
740 .decay = c(0, .001, .01, .1))
741 }
742
743 if(modName == "ada")
744 {
745 tGrid <- expand.grid(.maxdepth = 1, .iter = c(100,200,300,400), .nu = 1 )
746
747 }
748
749
750 @
751
752 <<fitModel, results = $FITMODELRESULT, echo = $FITMODELECHO, eval = $FITMODELEVAL>>=
753 ##OPTION alter as needed
754
755 set.seed(4)
756 modelFit <- switch(modName,
757 gbm =
758 {
759 mix <- sample(seq(along = trainY))
760 train(
761 trainX[mix,], trainY[mix], modName,
762 verbose = FALSE,
763 bag.fraction = .9,
764 metric = optMetric,
765 trControl = ctlObj,
766 tuneGrid = tGrid)
767 },
768
769 multinom =
770 {
771 train(
772 trainX, trainY, modName,
773 trace = FALSE,
774 metric = optMetric,
775 maxiter = 1000,
776 MaxNWts = 5000,
777 trControl = ctlObj,
778 tuneLength = gridSize)
779 },
780
781 nnet =
782 {
783 train(
784 trainX, trainY, modName,
785 metric = optMetric,
786 linout = FALSE,
787 trace = FALSE,
788 maxiter = 1000,
789 MaxNWts = 5000,
790 trControl = ctlObj,
791 tuneGrid = tGrid)
792
793 },
794
795 svmRadial =, svmPoly =, svmLinear =
796 {
797 train(
798 trainX, trainY, modName,
799 metric = optMetric,
800 scaled = TRUE,
801 trControl = ctlObj,
802 tuneLength = gridSize)
803 },
804 {
805 train(trainX, trainY, modName,
806 trControl = ctlObj,
807 metric = optMetric,
808 tuneLength = gridSize)
809 })
810
811 @
812
813 <<modelDescr, echo = $MODELDESCRECHO, results = $MODELDESCRRESULT>>=
814 summaryText <- ""
815
816 resampleName <- switch(tolower(modelFit$$control$$method),
817 boot = paste("the bootstrap (", length(modelFit$$control$$index), " reps)", sep = ""),
818 boot632 = paste("the bootstrap 632 rule (", length(modelFit$$control$$index), " reps)", sep = ""),
819 cv = paste("cross-validation (", modelFit$$control$$number, " fold)", sep = ""),
820 repeatedcv = paste("cross-validation (", modelFit$$control$$number, " fold, repeated ",
821 modelFit$$control$$repeats, " times)", sep = ""),
822 lgocv = paste("repeated train/test splits (", length(modelFit$$control$$index), " reps, ",
823 round(modelFit$$control$$p, 2), "$$\\%$$)", sep = ""))
824
825 tuneVars <- latexTranslate(tolower(modelInfo$$label))
826 tuneVars <- gsub("\\#", "the number of ", tuneVars, fixed = TRUE)
827 if(ncol(modelFit$$bestTune) == 1 && colnames(modelFit$$bestTune) == ".parameter")
828 {
829 summaryText <- paste(summaryText,
830 "\n\n",
831 "There are no tuning parameters associated with this model.",
832 "To characterize the model performance on the training set,",
833 resampleName,
834 "was used.",
835 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile}",
836 "show summaries of the resampling results. ")
837
838 } else {
839 summaryText <- paste("There",
840 ifelse(nrow(modelInfo) > 1, "are", "is"),
841 nrow(modelInfo),
842 ifelse(nrow(modelInfo) > 1, "tuning parameters", "tuning parameter"),
843 "associated with this model:",
844 listString(tuneVars, period = TRUE))
845
846
847
848 paramNames <- gsub(".", "", names(modelFit$$bestTune), fixed = TRUE)
849 ## (i in seq(along = paramNames))
850 ## {
851 ## check <- modelInfo$$parameter %in% paramNames[i]
852 ## if(any(check))
853 ## {
854 ## paramNames[i] <- modelInfo$$label[which(check)]
855 ## }
856 ## }
857
858 paramNames <- gsub("#", "the number of ", paramNames, fixed = TRUE)
859 ## Check to see if there was only one combination fit
860 summaryText <- paste(summaryText,
861 "To choose",
862 ifelse(nrow(modelInfo) > 1,
863 "appropriate values of the tuning parameters,",
864 "an appropriate value of the tuning parameter,"),
865 resampleName,
866 "was used to generated a profile of performance across the",
867 nrow(modelFit$$results),
868 ifelse(nrow(modelInfo) > 1,
869 "combinations of the tuning parameters.",
870 "candidate values."),
871
872 "Table \\\\ref{T:resamps} and Figure \\\\ref{F:profile} show",
873 "summaries of the resampling profile. ", "The final model fitted to the entire training set was:",
874 listString(paste(latexTranslate(tolower(paramNames)), "=", modelFit$$bestTune[1,]), period = TRUE))
875
876 }
877 @
878
879 \Sexpr{summaryText}
880
881 <<resampTable, echo = $RESAMPTABLEECHO, results = $RESAMPTABLERESULT>>=
882 tableData <- modelFit$$results
883
884 if(all(modelInfo$$parameter == "parameter") && resampName == "boot632")
885 {
886 tableData <- tableData[,-1, drop = FALSE]
887 colNums <- c( length(modelFit$$perfNames), length(modelFit$$perfNames), length(modelFit$$perfNames))
888 colLabels <- c("Mean", "Standard Deviation","Apparant")
889 constString <- ""
890 isConst <- NULL
891 } else if (all(modelInfo$$parameter == "parameter") && (resampName == "boot" | resampName == "cv" | resampName == "repeatedcv" )){
892 tableData <- tableData[,-1, drop = FALSE]
893 colNums <- c(length(modelFit$$perfNames), length(modelFit$$perfNames))
894 colLabels <- c("Mean", "Standard Deviation")
895 constString <- ""
896 isConst <- NULL
897 } else if (all(modelInfo$$parameter == "parameter") && resampName == "LOOCV" ){
898 tableData <- tableData[,-1, drop = FALSE]
899 colNums <- length(modelFit$$perfNames)
900 colLabels <- c("Measures")
901 constString <- ""
902 isConst <- NULL
903 } else {
904 if (all(modelInfo$$parameter != "parameter") && resampName == "boot632" ){
905 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
906 2,
907 function(x) length(unique(x)) == 1)
908
909 numParamInTable <- sum(!isConst)
910
911 if(any(isConst))
912 {
913 constParam <- modelInfo$$parameter[isConst]
914 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
915 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
916 constString <- paste("The tuning",
917 ifelse(sum(isConst) > 1,
918 "parmeters",
919 "parameter"),
920 listString(paste("``", names(constValues), "''", sep = "")),
921 ifelse(sum(isConst) > 1,
922 "were",
923 "was"),
924 "held constant at",
925 ifelse(sum(isConst) > 1,
926 "a value of",
927 "values of"),
928 listString(constValues[1,]))
929
930 } else constString <- ""
931
932 cn <- colnames(tableData)
933 ## for(i in seq(along = cn))
934 ## {
935 ## check <- modelInfo$$parameter %in% cn[i]
936 ## if(any(check))
937 ## {
938 ## cn[i] <- modelInfo$$label[which(check)]
939 ## }
940 ## }
941 ## colnames(tableData) <- cn
942
943 colNums <- c(numParamInTable,
944 length(modelFit$$perfNames),
945 length(modelFit$$perfNames),
946 length(modelFit$$perfNames))
947 colLabels <- c("", "Mean", "Standard Deviation", "Apparant")
948
949 }else if (all(modelInfo$$parameter != "parameter") && (resampName == "boot" | resampName == "repeatedcv" | resampName == "cv") ){
950 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
951 2,
952 function(x) length(unique(x)) == 1)
953
954 numParamInTable <- sum(!isConst)
955
956 if(any(isConst))
957 {
958 constParam <- modelInfo$$parameter[isConst]
959 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
960 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
961 constString <- paste("The tuning",
962 ifelse(sum(isConst) > 1,
963 "parmeters",
964 "parameter"),
965 listString(paste("``", names(constValues), "''", sep = "")),
966 ifelse(sum(isConst) > 1,
967 "were",
968 "was"),
969 "held constant at",
970 ifelse(sum(isConst) > 1,
971 "a value of",
972 "values of"),
973 listString(constValues[1,]))
974
975 } else constString <- ""
976
977 cn <- colnames(tableData)
978 ## for(i in seq(along = cn))
979 ## {
980 ## check <- modelInfo$$parameter %in% cn[i]
981 ## if(any(check))
982 ## {
983 ## cn[i] <- modelInfo$$label[which(check)]
984 ## }
985 ## }
986 ## colnames(tableData) <- cn
987
988 colNums <- c(numParamInTable,
989 length(modelFit$$perfNames),
990 length(modelFit$$perfNames))
991 colLabels <- c("", "Mean", "Standard Deviation")
992
993 }
994 else if (all(modelInfo$$parameter != "parameter") && resampName == "LOOCV"){
995 isConst <- apply(tableData[, modelInfo$$parameter, drop = FALSE],
996 2,
997 function(x) length(unique(x)) == 1)
998
999 numParamInTable <- sum(!isConst)
1000
1001 if(any(isConst))
1002 {
1003 constParam <- modelInfo$$parameter[isConst]
1004 constValues <- format(tableData[, constParam, drop = FALSE], digits = 4)[1,,drop = FALSE]
1005 tableData <- tableData[, !(names(tableData) %in% constParam), drop = FALSE]
1006 constString <- paste("The tuning",
1007 ifelse(sum(isConst) > 1,
1008 "parmeters",
1009 "parameter"),
1010 listString(paste("``", names(constValues), "''", sep = "")),
1011 ifelse(sum(isConst) > 1,
1012 "were",
1013 "was"),
1014 "held constant at",
1015 ifelse(sum(isConst) > 1,
1016 "a value of",
1017 "values of"),
1018 listString(constValues[1,]))
1019
1020 } else constString <- ""
1021
1022 cn <- colnames(tableData)
1023 ## for(i in seq(along = cn))
1024 ## {
1025 ## check <- modelInfo$$parameter %in% cn[i]
1026 ## if(any(check))
1027 ## {
1028 ## cn[i] <- modelInfo$$label[which(check)]
1029 ## }
1030 ## }
1031 ## colnames(tableData) <- cn
1032
1033 colNums <- c(numParamInTable,
1034 length(modelFit$$perfNames))
1035 colLabels <- c("", "Measures")
1036
1037 }
1038
1039 }
1040
1041
1042
1043 colnames(tableData) <- gsub("SD$$", "", colnames(tableData))
1044 colnames(tableData) <- gsub("Apparent$$", "", colnames(tableData))
1045 colnames(tableData) <- latexTranslate(colnames(tableData))
1046 rownames(tableData) <- latexTranslate(rownames(tableData))
1047
1048 latex(tableData,
1049 rowname = NULL,
1050 file = "",
1051 cgroup = colLabels,
1052 n.cgroup = colNums,
1053 where = "h!",
1054 digits = 4,
1055 longtable = nrow(tableData) > 30,
1056 caption = paste(resampleName, "results from the model fit.", constString),
1057 label = "T:resamps")
1058 @
1059
1060 \setkeys{Gin}{ width = 0.9\textwidth}
1061 \begin{figure}[b]
1062 \begin{center}
1063
1064 <<profilePlot, echo = $PROFILEPLOTECHO, fig = $PROFILEPLOTFIG, width = 8, height = 6>>=
1065 trellis.par.set(caretTheme(), warn = TRUE)
1066 if(all(modelInfo$$parameter == "parameter") | all(isConst) | modName == "nb")
1067 {
1068 resultsPlot <- resampleHist(modelFit)
1069 plotCaption <- paste("Distributions of model performance from the ",
1070 "training set estimated using ",
1071 resampleName)
1072 } else {
1073 if(modName %in% c("svmPoly", "svmRadial", "svmLinear"))
1074 {
1075 resultsPlot <- plot(modelFit,
1076 metric = optMetric,
1077 xTrans = function(x) log10(x))
1078 resultsPlot <- update(resultsPlot,
1079 type = c("g", "p", "l"),
1080 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1081
1082 } else {
1083 resultsPlot <- plot(modelFit,
1084 metric = optMetric)
1085 resultsPlot <- update(resultsPlot,
1086 type = c("g", "p", "l"),
1087 ylab = paste(optMetric, " (", resampleName, ")", sep = ""))
1088 }
1089 plotCaption <- paste("A plot of the estimates of the",
1090 optMetric,
1091 "values calculated using",
1092 resampleName)
1093 }
1094 print(resultsPlot)
1095 @
1096 \caption[Performance Plot]{\Sexpr{plotCaption}.}
1097 \label{F:profile}
1098 \end{center}
1099 \end{figure}
1100
1101
1102 <<stopWorkers, echo = $STOPWORKERSECHO, results = $STOPWORKERSRESULT>>=
1103 if(numWorkers > 1) stopCluster(cl)
1104 @
1105
1106 <<testPred, results = $TESTPREDRESULT, echo = $TESTPREDECHO>>=
1107 if((!is.null(testX)) && (!is.null(testY))){
1108 save(trainX,trainY,testX,testY,file="datasets.RData")
1109 } else {
1110 save(trainX,trainY,file="datasets.RData")
1111 }
1112
1113 if(pctTrain < 1)
1114 {
1115 cat("\\clearpage\n\\section*{Test Set Results}\n\n")
1116 classPred <- predict(modelFit, testX)
1117 cm <- confusionMatrix(classPred, testY)
1118 values <- cm$$overall[c("Accuracy", "Kappa", "AccuracyPValue", "McnemarPValue")]
1119
1120 values <- values[!is.na(values) & !is.nan(values)]
1121 values <- c(format(values[1:2], digits = 3),
1122 format.pval(values[-(1:2)], digits = 5))
1123 nms <- c("the overall accuracy", "the Kappa statistic",
1124 "the $$p$$--value that accuracy is greater than the no--information rate",
1125 "the $$p$$--value of concordance from McNemar's test")
1126 nms <- nms[seq(along = values)]
1127 names(values) <- nms
1128
1129 if(any(modelInfo$$probModel))
1130 {
1131 classProbs <- extractProb(list(fit = modelFit),
1132 testX = testX,
1133 testY = testY)
1134 classProbs <- subset(classProbs, dataType == "Test")
1135 if(numClasses == 2)
1136 {
1137 tmp <- twoClassSummary(classProbs, lev = levels(classProbs$$obs))
1138 tmp <- c(format(tmp, digits = 3))
1139 names(tmp) <- c("the area under the ROC curve", "the sensitivity", "the specificity")
1140
1141 values <- c(values, tmp)
1142
1143 }
1144 probPlot <- plotClassProbs(classProbs)
1145 }
1146 testString <- paste("Based on the test set of",
1147 nrow(testX),
1148 "samples,",
1149 listString(paste(names(values), "was", values), period = TRUE),
1150 "The confusion matrix for the test set is shown in Table",
1151 "\\\\ref{T:cm}.")
1152 testString <- paste(testString,
1153 " Using ", resampleName,
1154 ", the training set estimates were ",
1155 resampleStats(modelFit),
1156 ".",
1157 sep = "")
1158
1159 if(any(modelInfo$$probModel)) testString <- paste(testString,
1160 "Histograms of the class probabilities",
1161 "for the test set samples are shown in",
1162 "Figure \\\\ref{F:probs}",
1163 ifelse(numClasses == 2,
1164 " and the test set ROC curve is in Figure \\\\ref{F:roc}.",
1165 "."))
1166
1167
1168
1169 latex(cm$$table,
1170 title = "",
1171 file = "",
1172 where = "h",
1173 cgroup = "Observed Values",
1174 n.cgroup = numClasses,
1175 caption = "The confusion matrix for the test set",
1176 label = "T:cm")
1177
1178 } else testString <- ""
1179 @
1180 \Sexpr{testString}
1181
1182
1183 <<classProbsTex, results = $CLASSPROBSTEXRESULT, echo = $CLASSPROBSTEXECHO>>=
1184 if(any(modelInfo$probModel) && pctTrain < 1 ) {
1185 cat(
1186 paste("\\begin{figure}[p]\n",
1187 "\\begin{center}\n",
1188 "\\includegraphics{classProbs}",
1189 "\\caption[PCA Plot]{Class probabilities",
1190 "for the test set. Each panel contains ",
1191 "separate classes}\n",
1192 "\\label{F:probs}\n",
1193 "\\end{center}\n",
1194 "\\end{figure}"))
1195 }
1196 if(any(modelInfo$$probModel) & numClasses == 2 & pctTrain < 1 )
1197 {
1198 cat(
1199 paste("\\begin{figure}[p]\n",
1200 "\\begin{center}\n",
1201 "\\includegraphics[clip, width = .8\\textwidth]{roc}",
1202 "\\caption[ROC Plot]{ROC Curve",
1203 "for the test set.}\n",
1204 "\\label{F:roc}\n",
1205 "\\end{center}\n",
1206 "\\end{figure}"))
1207 } else {
1208 cat (paste(""))
1209 }
1210
1211 @
1212 <<classProbsTex, results = $CLASSPROBSTEXRESULT1, echo = $CLASSPROBSTEXECHO1 >>=
1213 if(any(modelInfo$probModel) && pctTrain < 1) {
1214 pdf("classProbs.pdf", height = 7, width = 7)
1215 trellis.par.set(caretTheme(), warn = FALSE)
1216 print(probPlot)
1217 dev.off()
1218 }
1219 if(any(modelInfo$probModel) & numClasses == 2 & pctTrain < 1) {
1220 resPonse<-testY
1221 preDictor<-classProbs[, levels(trainY)[1]]
1222 pdf("roc.pdf", height = 8, width = 8)
1223 # from pROC example at http://web.expasy.org/pROC/screenshots.htm
1224 plot.roc(resPonse, preDictor, # data
1225 percent=TRUE, # show all values in percent
1226 partial.auc=c(100, 90), partial.auc.correct=TRUE, # define a partial AUC (pAUC)
1227 print.auc=TRUE, #display pAUC value on the plot with following options:
1228 print.auc.pattern="Corrected pAUC (100-90%% SP):\n%.1f%%", print.auc.col="#1c61b6",
1229 auc.polygon=TRUE, auc.polygon.col="#1c61b6", # show pAUC as a polygon
1230 max.auc.polygon=TRUE, max.auc.polygon.col="#1c61b622", # also show the 100% polygon
1231 main="Partial AUC (pAUC)")
1232 plot.roc(resPonse, preDictor,
1233 percent=TRUE, add=TRUE, type="n", # add to plot, but don't re-add the ROC itself (useless)
1234 partial.auc=c(100, 90), partial.auc.correct=TRUE,
1235 partial.auc.focus="se", # focus pAUC on the sensitivity
1236 print.auc=TRUE, print.auc.pattern="Corrected pAUC (100-90%% SE):\n%.1f%%", print.auc.col="#008600",
1237 print.auc.y=40, # do not print auc over the previous one
1238 auc.polygon=TRUE, auc.polygon.col="#008600",
1239 max.auc.polygon=TRUE, max.auc.polygon.col="#00860022")
1240 dev.off()
1241 } else {
1242 cat("")
1243 }
1244
1245 @
1246
1247 \section*{Versions}
1248
1249 <<versions, echo = FALSE, results = tex>>=
1250 toLatex(sessionInfo())
1251
1252 @
1253
1254 <<save-data, echo = $SAVEDATAECHO, results = $SAVEDATARESULT>>=
1255 ## change this to the name of modName....
1256 Fit <- modelFit
1257 if(exists('ppInfo') && !is.null(ppInfo)){
1258 save(Fit,ppInfo,cm,file="$METHOD-Fit.RData")
1259 } else {save(Fit,cm,file="$METHOD-Fit.RData")}
1260
1261 @
1262 The model was built using $METHOD and is saved as $METHOD Model for reuse. This contains the variable Fit.
1263
1264 \end{document}'''
1265
1266 return template4Rnw