diff feature_selection.R @ 0:b4d2524e79ab draft

planemo upload commit a1f4dd8eb560c649391ada1a6bb9505893a35272
author anmoljh
date Fri, 01 Jun 2018 05:16:19 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_selection.R	Fri Jun 01 05:16:19 2018 -0400
@@ -0,0 +1,164 @@
+args <- commandArgs(T)
+
+arg1 <- args[1]
+arg2 <- args[2]
+arg3 <- args[3]
+arg4 <- args[4]
+arg5 <- args[5]
+arg6 <- args[6]
+arg7 <- args[7]
+arg8 <- args[8]
+arg9 <- args[9]
+arg10 <- args[10]
+library(caret)
+library(doMC)
+load(arg1)
+
+#RAWDATA <- dataX
+#RAWDATA$outcome <- dataY
+
+
+###########################
+Smpling <- arg9
+
+if(Smpling=="downsampling")
+{
+dwnsmpl <- downSample(dataX,dataY)
+RAWDATA <- dwnsmpl[,1:length(dwnsmpl)-1]
+RAWDATA$outcome <- dwnsmpl[,length(dwnsmpl)]
+dataX <- RAWDATA[,1:length(dwnsmpl)-1]
+dataY <- RAWDATA[,"outcome"]
+remove("dwnsmpl")
+}else if(Smpling=="upsampling"){
+upsmpl <- upSample(dataX,dataY)
+RAWDATA <- upsmpl[,1:length(upsmpl)-1]
+RAWDATA$outcome <- upsmpl[,length(upsmpl)]
+dataX <- RAWDATA[,1:length(upsmpl)-1]
+dataY <- RAWDATA[,"outcome"]
+remove("upsmpl")
+}else { 
+RAWDATA <- dataX
+RAWDATA$outcome <- dataY
+}
+
+
+
+
+##########################
+
+
+rawData <- dataX
+predictorNames <- names(rawData)
+
+isNum <- apply(rawData[,predictorNames, drop = FALSE], 2, is.numeric)
+if(any(!isNum)) stop("all predictors in rawData should be numeric")
+
+colRate <- apply(rawData[, predictorNames, drop = FALSE],
+                 2, function(x) mean(is.na(x)))
+colExclude <- colRate > 0.1
+	if(any(colExclude)){
+				predictorNames <- predictorNames[-which(colExclude)]
+				rawData <- RAWDATA[, c(predictorNames,"outcome")]
+				 } else {
+	                        rawData <- RAWDATA 
+						}  
+                		rowRate <- apply(rawData[, predictorNames, drop = FALSE],
+                 		1, function(x) mean(is.na(x)))
+			
+
+rowExclude <- rowRate > 0
+	if(any(rowExclude)){
+  				rawData <- rawData[!rowExclude, ]
+    				##hasMissing <- apply(rawData[, predictorNames, drop = FALSE],
+                        	##1, function(x) mean(is.na(x)))
+                   
+############################################################################
+                                                                      
+            
+###############################################################################                        	
+                    } else {  
+                    		rawData <- rawData[complete.cases(rawData),]
+
+                    		} 
+                    
+set.seed(2)
+
+#print(dim(dataX))
+#print(dim(rawData))
+#print(length(dataY))
+
+nzv <- nearZeroVar(rawData[,1:(length(rawData) - 1)])
+	  if(length(nzv) > 0)  {
+    				#nzvVars <- names(rawData)[nzv]
+    				rawData <- rawData[,-nzv]
+   				#rawData$outcome <- dataY
+    				} 
+    
+predictorNames <- names(rawData)[names(rawData) != "outcome"]
+   
+dx <- rawData[,1:length(rawData)-1]
+dy <- rawData[,length(rawData)]
+corrThresh <- as.numeric(arg8)
+highCorr <- findCorrelation(cor(dx, use = "pairwise.complete.obs"),corrThresh)
+dx <- dx[, -highCorr]
+subsets <- seq(1,length(dx),by=5)
+normalization <- preProcess(dx)
+dx <- predict(normalization, dx)
+dx <- as.data.frame(dx)
+
+if (arg4 == "lmFuncs"){
+ctrl1 <- rfeControl(functions = lmFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+} else if(arg4 == "rfFuncs"){
+ctrl1 <- rfeControl(functions = rfFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+}else if (arg4 == "treebagFuncs"){
+ctrl1 <- rfeControl(functions = treebagFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+}else {
+
+ctrl1 <- rfeControl(functions = nbFuncs,
+                   method = arg5 ,
+                   repeats = as.numeric(arg6),
+                   number = as.numeric(arg7),
+                   verbose = FALSE)
+}
+
+
+
+if (as.numeric(arg10) == 1){ 
+Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1)
+
+pred11 <- predictors(Profile)
+save(Profile,file=arg2)
+dataX <- rawData[,pred11]
+dataY <- rawData$outcome
+
+save(dataX,dataY,file=arg3)
+rm(dataX)
+rm(dataY)
+} else if (as.numeric(arg10) > 1){
+registerDoMC(cores = as.numeric(arg10))
+
+Profile <- rfe(dx, dy,sizes = subsets,rfeControl = ctrl1)
+
+pred11 <- predictors(Profile)
+save(Profile,file=arg2)
+dataX <- rawData[,pred11]
+dataY <- rawData$outcome
+
+save(dataX,dataY,file=arg3)
+rm(dataX)
+rm(dataY)
+} else { stop("something went wrong. please see the parameters")}  
+
+