Mercurial > repos > anmoljh > feature_selection
comparison feature_selection.R @ 0:b4d2524e79ab draft
planemo upload commit a1f4dd8eb560c649391ada1a6bb9505893a35272
| author | anmoljh |
|---|---|
| date | Fri, 01 Jun 2018 05:16:19 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b4d2524e79ab |
|---|---|
# Command-line interface (positional arguments, as consumed by this script):
#   arg1   path to an .RData file that is load()ed below; the rest of the
#          script reads the objects `dataX` and `dataY` from it
#   arg2   output path for the saved rfe `Profile` object
#   arg3   output path for the saved, reduced `dataX` / `dataY`
#   arg4   caret rfe function set: "lmFuncs", "rfFuncs", "treebagFuncs";
#          any other value selects nbFuncs
#   arg5   resampling `method` passed to rfeControl()
#   arg6   `repeats` passed to rfeControl()
#   arg7   `number` passed to rfeControl()
#   arg8   correlation cutoff passed to findCorrelation()
#   arg9   sampling mode: "downsampling", "upsampling", anything else = none
#   arg10  number of cores (1 = sequential, > 1 registers doMC workers)
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message instead of propagating NA arguments
# into the modelling code further down.
if (length(args) < 10) {
  stop("expected 10 command-line arguments but received ", length(args))
}

arg1 <- args[1]
arg2 <- args[2]
arg3 <- args[3]
arg4 <- args[4]
arg5 <- args[5]
arg6 <- args[6]
arg7 <- args[7]
arg8 <- args[8]
arg9 <- args[9]
arg10 <- args[10]

library(caret)
library(doMC)

load(arg1)

# Everything below reads these two objects from the loaded workspace.
if (!exists("dataX") || !exists("dataY")) {
  stop("the input file must provide the objects 'dataX' and 'dataY'")
}
| 16 | |
| 17 #RAWDATA <- dataX | |
| 18 #RAWDATA$outcome <- dataY | |
| 19 | |
| 20 | |
| 21 ########################### | |
# Optional class re-balancing, selected by arg9.
# After this section:
#   dataX   - predictor data frame (re-sampled when requested)
#   dataY   - outcome vector matching the rows of dataX
#   RAWDATA - dataX plus an `outcome` column holding dataY
Smpling <- arg9

if (Smpling == "downsampling") {
  balanced <- downSample(dataX, dataY)
} else if (Smpling == "upsampling") {
  balanced <- upSample(dataX, dataY)
} else {
  balanced <- NULL
}

if (!is.null(balanced)) {
  # As in the original code, the last column of the re-sampled frame is taken
  # as the outcome and every column before it as a predictor.
  # seq_len() avoids relying on `1:n-1` parsing as `(1:n) - 1`, and
  # drop = FALSE keeps a single predictor as a one-column data frame.
  outcome_col <- ncol(balanced)
  dataX <- balanced[, seq_len(outcome_col - 1), drop = FALSE]
  dataY <- balanced[, outcome_col]
}
remove("balanced")

RAWDATA <- dataX
RAWDATA$outcome <- dataY
| 43 | |
| 44 | |
| 45 | |
| 46 | |
| 47 ########################## | |
| 48 | |
| 49 | |
# Missing-value filtering.
#   1. every predictor must be numeric
#   2. predictors with more than 10% missing values are dropped
#   3. rows that still contain a missing value are dropped
# Result: `rawData` (kept predictors followed by `outcome`) and
# `predictorNames` (names of the kept predictors).
rawData <- dataX
predictorNames <- names(rawData)

# Check column by column; apply() would first coerce the whole frame to a
# matrix. Logical columns are tolerated because that matrix coercion let
# them through whenever they were mixed with numeric columns.
isNum <- vapply(
  rawData[, predictorNames, drop = FALSE],
  function(x) is.numeric(x) || is.logical(x),
  logical(1)
)
if (any(!isNum)) stop("all predictors in rawData should be numeric")

# Fraction of missing values per predictor.
colRate <- vapply(
  rawData[, predictorNames, drop = FALSE],
  function(x) mean(is.na(x)),
  numeric(1)
)
colExclude <- colRate > 0.1
if (any(colExclude)) {
  predictorNames <- predictorNames[!colExclude]
  rawData <- RAWDATA[, c(predictorNames, "outcome"), drop = FALSE]
} else {
  rawData <- RAWDATA
}

# Keep only fully observed rows. Previously rows with a missing outcome were
# removed only when no predictor had a missing value; complete.cases() on the
# whole frame applies the same rule in both situations.
rawData <- rawData[complete.cases(rawData), , drop = FALSE]
| 83 | |
# Predictor pre-filtering and normalisation ahead of rfe().
# Reads `rawData` (predictors followed by `outcome` as the last column) and
# arg8 (correlation cutoff). Produces:
#   dx      - filtered, pre-processed predictors (data frame)
#   dy      - outcome taken from the last column of rawData
#   subsets - candidate subset sizes 1, 6, 11, ... up to ncol(dx)
set.seed(2)

#print(dim(dataX))
#print(dim(rawData))
#print(length(dataY))

# Drop near-zero-variance predictors. The indices returned by nearZeroVar()
# refer to the predictor columns, which are the leading columns of rawData.
nzv <- nearZeroVar(rawData[, seq_len(ncol(rawData) - 1), drop = FALSE])
if (length(nzv) > 0) {
  rawData <- rawData[, -nzv, drop = FALSE]
}

predictorNames <- names(rawData)[names(rawData) != "outcome"]

# seq_len() avoids relying on `1:n-1` parsing as `(1:n) - 1`; drop = FALSE
# keeps a single remaining predictor as a one-column data frame.
dx <- rawData[, seq_len(ncol(rawData) - 1), drop = FALSE]
dy <- rawData[, ncol(rawData)]

if (ncol(dx) == 0) {
  stop("no predictors left after near-zero-variance filtering")
}

corrThresh <- as.numeric(arg8)
highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"), corrThresh)
# Negative indexing with an empty vector selects *no* columns, so only drop
# when findCorrelation() actually flagged something.
if (length(highCorr) > 0) {
  dx <- dx[, -highCorr, drop = FALSE]
}

subsets <- seq(1, length(dx), by = 5)
normalization <- preProcess(dx)
dx <- predict(normalization, dx)
dx <- as.data.frame(dx)
| 108 | |
# Build the rfe control object. arg4 picks the caret helper-function set;
# any value other than the three named ones falls back to nbFuncs.
# arg5 / arg6 / arg7 supply the resampling method, repeats and number.
rfeFunctionSet <- if (arg4 == "lmFuncs") {
  lmFuncs
} else if (arg4 == "rfFuncs") {
  rfFuncs
} else if (arg4 == "treebagFuncs") {
  treebagFuncs
} else {
  nbFuncs
}

ctrl1 <- rfeControl(
  functions = rfeFunctionSet,
  method = arg5,
  repeats = as.numeric(arg6),
  number = as.numeric(arg7),
  verbose = FALSE
)
| 135 | |
| 136 | |
| 137 | |
# Run recursive feature elimination and write the two outputs.
#   arg10 - number of cores; 1 runs without registering a parallel backend,
#           > 1 registers that many doMC workers, anything else is an error
#   arg2  - file receiving the fitted rfe `Profile`
#   arg3  - file receiving `dataX` (selected predictors taken from rawData)
#           and `dataY` (rawData$outcome)
nCores <- as.numeric(arg10)

# A non-numeric core count becomes NA; report it with the same message as any
# other invalid value instead of failing inside `if`.
if (is.na(nCores) || nCores < 1) {
  stop("something went wrong. please see the parameters")
}

if (nCores > 1) {
  registerDoMC(cores = nCores)
}

Profile <- rfe(dx, dy, sizes = subsets, rfeControl = ctrl1)

pred11 <- predictors(Profile)
save(Profile, file = arg2)

# drop = FALSE keeps dataX a data frame even when only one predictor is
# selected.
dataX <- rawData[, pred11, drop = FALSE]
dataY <- rawData$outcome

save(dataX, dataY, file = arg3)
rm(dataX)
rm(dataY)
| 163 | |
| 164 |
