Mercurial > repos > anmoljh > feature_selection
view feature_selection.R @ 0:b4d2524e79ab draft
planemo upload commit a1f4dd8eb560c649391ada1a6bb9505893a35272
author | anmoljh |
---|---|
date | Fri, 01 Jun 2018 05:16:19 -0400 |
parents | |
children |
line wrap: on
line source
args <- commandArgs(T) arg1 <- args[1] arg2 <- args[2] arg3 <- args[3] arg4 <- args[4] arg5 <- args[5] arg6 <- args[6] arg7 <- args[7] arg8 <- args[8] arg9 <- args[9] arg10 <- args[10] library(caret) library(doMC) load(arg1) #RAWDATA <- dataX #RAWDATA$outcome <- dataY ########################### Smpling <- arg9 if(Smpling=="downsampling") { dwnsmpl <- downSample(dataX,dataY) RAWDATA <- dwnsmpl[,1:length(dwnsmpl)-1] RAWDATA$outcome <- dwnsmpl[,length(dwnsmpl)] dataX <- RAWDATA[,1:length(dwnsmpl)-1] dataY <- RAWDATA[,"outcome"] remove("dwnsmpl") }else if(Smpling=="upsampling"){ upsmpl <- upSample(dataX,dataY) RAWDATA <- upsmpl[,1:length(upsmpl)-1] RAWDATA$outcome <- upsmpl[,length(upsmpl)] dataX <- RAWDATA[,1:length(upsmpl)-1] dataY <- RAWDATA[,"outcome"] remove("upsmpl") }else { RAWDATA <- dataX RAWDATA$outcome <- dataY } ########################## rawData <- dataX predictorNames <- names(rawData) isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric) if(any(!isNum)) stop("all predictors in rawData should be numeric") colRate <- apply(rawData[, predictorNames, drop = FALSE], 2, function(x) mean(is.na(x))) colExclude <- colRate > 0.1 if(any(colExclude)){ predictorNames <- predictorNames[-which(colExclude)] rawData <- RAWDATA[, c(predictorNames,"outcome")] } else { rawData <- RAWDATA } rowRate <- apply(rawData[, predictorNames, drop = FALSE], 1, function(x) mean(is.na(x))) rowExclude <- rowRate > 0 if(any(rowExclude)){ rawData <- rawData[!rowExclude, ] ##hasMissing <- apply(rawData[, predictorNames, drop = FALSE], ##1, function(x) mean(is.na(x))) ############################################################################ ############################################################################### } else { rawData <- rawData[complete.cases(rawData),] } set.seed(2) #print(dim(dataX)) #print(dim(rawData)) #print(length(dataY)) nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)]) if(length(nzv) > 0) { #nzvVars <- names(rawData)[nzv] rawData <- rawData[,-nzv] #rawData$outcome <- dataY } predictorNames <- names(rawData)[names(rawData) != "outcome"] dx <- rawData[,1:length(rawData)-1] dy <- rawData[,length(rawData)] corrThresh <- as.numeric(arg8) highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh) dx <- dx[, -highCorr] subsets <- seq(1,length(dx),by=5) normalization <- preProcess(dx) dx <- predict(normalization, dx) dx <- as.data.frame(dx) if (arg4 == "lmFuncs"){ ctrl1 <- rfeControl(functions = lmFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } else if(arg4 == "rfFuncs"){ ctrl1 <- rfeControl(functions = rfFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else if (arg4 == "treebagFuncs"){ ctrl1 <- rfeControl(functions = treebagFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) }else { ctrl1 <- rfeControl(functions = nbFuncs, method = arg5 , repeats = as.numeric(arg6), number = as.numeric(arg7), verbose = FALSE) } if (as.numeric(arg10) == 1){ Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) pred11 <- predictors(Profile) save(Profile,file=arg2) dataX <- rawData[,pred11] dataY <- rawData$outcome save(dataX,dataY,file=arg3) rm(dataX) rm(dataY) } else if (as.numeric(arg10) > 1){ registerDoMC(cores = as.numeric(arg10)) Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1) pred11 <- predictors(Profile) save(Profile,file=arg2) dataX <- rawData[,pred11] dataY <- rawData$outcome save(dataX,dataY,file=arg3) rm(dataX) rm(dataY) } else { stop("something went wrong. please see the parameters")}