Mercurial > repos > jay > padaug_peptide_sequence_analysis
comparison PDAUG_ML_Models/PDAUG_ML_Models.py @ 0:5d01ab729b2b draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 01:43:12 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5d01ab729b2b |
---|---|
1 | |
2 import numpy as np | |
3 import sys,os | |
4 from scipy import interp | |
5 import pandas as pd | |
6 | |
7 ############################################################### | |
8 from sklearn.metrics import * | |
9 from sklearn import preprocessing | |
10 from sklearn.metrics import accuracy_score | |
11 from sklearn.metrics import precision_recall_fscore_support | |
12 from sklearn.metrics import roc_curve, auc | |
13 from sklearn.model_selection import StratifiedKFold | |
14 from sklearn.preprocessing import StandardScaler | |
15 from sklearn.preprocessing import MinMaxScaler | |
16 ############################################################### | |
17 from sklearn.linear_model import LogisticRegression | |
18 from sklearn.naive_bayes import GaussianNB | |
19 from sklearn.neighbors import KNeighborsClassifier | |
20 from sklearn.tree import DecisionTreeClassifier | |
21 from sklearn.svm import SVC | |
22 from sklearn.ensemble import RandomForestClassifier | |
23 from sklearn.linear_model import SGDClassifier | |
24 from sklearn.ensemble import GradientBoostingClassifier | |
25 from sklearn.neural_network import MLPClassifier | |
26 ############################################################### | |
27 from itertools import cycle | |
28 ################################################################ | |
29 from sklearn.model_selection import train_test_split | |
30 | |
31 | |
32 | |
def ReturnData(TrainFile, TestMethod, TestFile=None):
    """Load the training (and optionally test) table(s) for a given test method.

    Parameters
    ----------
    TrainFile : str
        Path to a tab-separated file whose LAST column is the class label.
    TestMethod : str
        One of 'Internal', 'CrossVal', 'TestSplit', 'External', 'Predict'.
    TestFile : str, optional
        Path to a second tab-separated file; required for 'External'
        (labelled, last column = label) and 'Predict' (unlabelled).

    Returns
    -------
    (X_train, y_train, X_test, y_test)
        numpy arrays (X_test/y_test are None when no test file applies;
        for 'Predict' X_test is the raw test DataFrame and y_test is None).
    """
    # BUG FIX: original condition was `TestMethod == 'Internal' or 'CrossVal'`,
    # which is always truthy.  'TestSplit' is included here because Fit_Model
    # relies on this branch for its TestSplit path.
    if (TestFile is None) and (TestMethod in ('Internal', 'CrossVal', 'TestSplit')):
        df = pd.read_csv(TrainFile, sep='\t')
        clm_list = df.columns.tolist()
        X_train = df[clm_list[0:len(clm_list) - 1]].values
        y_train = df[clm_list[len(clm_list) - 1]].values
        return X_train, y_train, None, None

    elif (TestFile is not None) and (TestMethod == 'External'):
        df = pd.read_csv(TrainFile, sep='\t')
        clm_list = df.columns.tolist()
        X_train = df[clm_list[0:len(clm_list) - 1]].values
        y_train = df[clm_list[len(clm_list) - 1]].values
        df1 = pd.read_csv(TestFile, sep='\t')
        clm_list = df1.columns.tolist()
        X_test = df1[clm_list[0:len(clm_list) - 1]].values
        y_test = df1[clm_list[len(clm_list) - 1]].values
        return X_train, y_train, X_test, y_test

    elif (TestFile is not None) and (TestMethod == 'Predict'):
        df = pd.read_csv(TrainFile, sep='\t')
        clm_list = df.columns.tolist()
        X_train = df[clm_list[0:len(clm_list) - 1]].values
        y_train = df[clm_list[len(clm_list) - 1]].values
        # Prediction input has no label column; keep the whole frame.
        X_test = pd.read_csv(TestFile, sep='\t')
        # BUG FIX: original returned (X_train, y_train, X_train, y_train),
        # so the caller predicted on the TRAINING data instead of X_test.
        return X_train, y_train, X_test, None
68 | |
def Fit_Model(TrainData, Test_Method, Algo, Selected_Sclaer, Workdirpath, htmlOutDir, OutFile, htmlFname, NoOfFolds=None, TestSize=None, TestData=None):
    """Train *Algo* on *TrainData* and evaluate it per *Test_Method*.

    Test_Method is one of:
      'Internal'  - 5-fold stratified cross-validation; writes a TSV of mean
                    metrics and a plotly HTML report.
      'External'  - fit on TrainData, evaluate on TestData; matplotlib plots.
      'TestSplit' - random train/test split of TrainData (TestSize fraction).
      'Predict'   - fit on TrainData, return predictions for TestData.

    Parameters: Algo is a fitted-able sklearn estimator; Selected_Sclaer is
    'Min_Max', 'Standard_Scaler' or 'No_Scaler'; Workdirpath/htmlOutDir/
    OutFile/htmlFname control output locations.

    Returns the prediction array for 'Predict'; otherwise None (side effects
    are files written under Workdirpath).

    NOTE(review): NoOfFolds is accepted but the CV fold count is hard-coded
    to 5 below (original behavior, preserved) -- confirm intent.
    NOTE(review): 'pl' (presumably matplotlib.pyplot) and HTML_Gen are never
    imported/defined in this file; the 'External' and 'TestSplit' branches
    raise NameError as-is -- confirm against the full repository.
    """
    if not os.path.exists(htmlOutDir):
        os.makedirs(htmlOutDir)

    def _scale(train_X, test_X):
        # Fit the user-selected scaler on the training fold only, then
        # apply it to both folds (avoids test-set leakage).
        if Selected_Sclaer == 'Min_Max':
            scaler = MinMaxScaler().fit(train_X)
            return scaler.transform(train_X), scaler.transform(test_X)
        elif Selected_Sclaer == 'Standard_Scaler':
            scaler = preprocessing.StandardScaler().fit(train_X)
            return scaler.transform(train_X), scaler.transform(test_X)
        elif Selected_Sclaer == 'No_Scaler':
            return train_X, test_X
        else:
            print('Scalling Method option was not correctly selected...!')

    if Test_Method == 'Internal':
        X, y, _, _ = ReturnData(TrainData, Test_Method)

        mean_fpr = np.linspace(0, 1, 100)
        mean_tpr = 0.0
        accuracy_score_l = []
        precision_l = []
        recall_l = []
        f_score_l = []

        folds = StratifiedKFold(n_splits=5)

        for train, test in folds.split(X, y):
            x_train, x_test = _scale(X[train], X[test])

            # Fit once per fold; the original re-fit the estimator for the
            # probability and label predictions separately.
            fitted = Algo.fit(x_train, y[train])
            prob = fitted.predict_proba(x_test)
            predicted = fitted.predict(x_test)

            # Accumulate the ROC curve on a common FPR grid (binary task:
            # column 1 is taken as the positive-class probability).
            fpr, tpr, thresholds = roc_curve(y[test], prob[:, 1])
            mean_tpr += np.interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0

            accuracy_score_l.append(round(accuracy_score(y[test], predicted), 3))
            prf = precision_recall_fscore_support(y[test], predicted, average='macro')
            precision_l.append(round(prf[0], 3))
            recall_l.append(round(prf[1], 3))
            f_score_l.append(round(prf[2], 3))

        accuracy_score_mean = round(float(sum(accuracy_score_l)) / float(len(accuracy_score_l)), 3)
        precision_mean = round(float(sum(precision_l)) / float(len(precision_l)), 3)
        recall_mean = round(float(sum(recall_l)) / float(len(recall_l)), 3)
        f_score_mean = round(float(sum(f_score_l)) / float(len(f_score_l)), 3)

        mean_tpr /= folds.get_n_splits(X, y)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)

        V_header = ["Algo", "accuracy", "precision", "recall", "f1", "mean_auc"]
        v_values = [sys.argv[1], round(accuracy_score_mean, 3), round(precision_mean, 3), round(recall_mean, 3), round(f_score_mean, 3), round(mean_auc, 3)]

        df = pd.DataFrame([v_values], columns=V_header)
        df.to_csv(os.path.join(Workdirpath, OutFile), columns=V_header, sep='\t', index=None)

        # Plotly report: metric bar chart + mean ROC curve, side by side.
        from plotly.subplots import make_subplots
        import plotly.graph_objects as go

        fig = make_subplots(
            rows=1, cols=2,
            specs=[[{"type": "xy"}, {"type": "scatter"}], ],
            subplot_titles=("Algorithm performance", " ROC curve (AUC Score = %0.2f" % mean_auc + ')'),
        )
        fig.add_trace(go.Bar(x=V_header[1:], y=v_values[1:], marker_color=['#F58518', '#109618', '#E45756', '#1F77B4', '#19D3F3']), row=1, col=1)
        fig.add_trace(go.Scatter(x=mean_fpr, y=mean_tpr), row=1, col=2)
        fig.update_yaxes(title_text="True Positive Rate", range=[0, 1], row=1, col=2)
        fig.update_xaxes(title_text="False Positive Rate", range=[0, 1], row=1, col=2)
        fig.update_yaxes(title_text="Score", range=[0, 1], row=1, col=1)
        fig.update_xaxes(title_text="Performance measures", row=1, col=1)
        fig.update_layout(height=700, showlegend=False, title="Machine ")
        fig.write_html(os.path.join(Workdirpath, htmlOutDir, htmlFname))

    elif Test_Method == 'External':
        X_train, y_train, X_test, y_test = ReturnData(TrainData, Test_Method, TestData)
        x_train, x_test = _scale(X_train, X_test)

        fitted = Algo.fit(x_train, y_train)
        prob = fitted.predict_proba(x_test)
        predicted = fitted.predict(x_test)

        fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
        accu_score = accuracy_score(y_test, predicted)
        auc_score = auc(fpr, tpr)

        prf = precision_recall_fscore_support(y_test, predicted, average='macro')
        pre_score = round(prf[0], 3)
        rec_score = round(prf[1], 3)
        f_score = round(prf[2], 3)

        # "presision" is preserved: it is a column name in the output TSV and
        # downstream consumers may rely on it.
        V_header = ["accuracy", "presision", "recall", "f1", "mean_auc"]
        v_values = [accu_score, pre_score, rec_score, f_score, auc_score]

        pl.figure()
        pl.plot(fpr, tpr, '-', color='red', label='AUC = %0.2f' % auc_score, lw=2)
        pl.xlim([0.0, 1.0])
        pl.ylim([0.0, 1.05])
        pl.xlabel('False Positive Rate')
        pl.ylabel('True Positive Rate')
        pl.title('ROC Cureve')
        pl.legend(loc="lower right")
        pl.savefig(os.path.join(Workdirpath, htmlOutDir, "out.png"))

        df = pd.DataFrame([v_values], columns=V_header)
        df.to_csv(os.path.join(Workdirpath, OutFile), columns=V_header, sep='\t')

        pl.figure()
        pl.bar(V_header, v_values, color=(0.2, 0.4, 0.6, 0.6))
        pl.xlabel('Accuracy Perameters', fontweight='bold', color='orange', fontsize='17', horizontalalignment='center')
        pl.savefig(os.path.join(Workdirpath, htmlOutDir, "2.png"))
        HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname))

    elif Test_Method == "TestSplit":
        X, y, _, _ = ReturnData(TrainData, Test_Method)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=float(TestSize), random_state=0)
        x_train, x_test = _scale(X_train, X_test)

        fitted = Algo.fit(x_train, y_train)
        prob = fitted.predict_proba(x_test)
        predicted = fitted.predict(x_test)

        fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
        accu_score = accuracy_score(y_test, predicted)
        # BUG FIX: the AUC is now computed before plotting and used in the
        # legend; the original labelled the curve 'AUC = accuracy'.
        auc_score = auc(fpr, tpr)

        prf = precision_recall_fscore_support(y_test, predicted, average='macro')
        pre_score = round(prf[0], 3)
        rec_score = round(prf[1], 3)
        f_score = round(prf[2], 3)

        pl.figure()
        pl.plot(fpr, tpr, '-', color='red', label='AUC = %0.2f' % auc_score, lw=2)
        pl.xlim([0.0, 1.0])
        pl.ylim([0.0, 1.05])
        pl.xlabel('False Positive Rate')
        pl.ylabel('True Positive Rate')
        pl.title('ROC Cureve')
        pl.legend(loc="lower right")
        pl.savefig(os.path.join(Workdirpath, htmlOutDir, "out.png"))

        V_header = ["accuracy", "presision", "recall", "f1", "mean_auc"]
        v_values = [accu_score, pre_score, rec_score, f_score, auc_score]
        df = pd.DataFrame([v_values], columns=V_header)
        df.to_csv(os.path.join(Workdirpath, OutFile), columns=V_header, sep='\t')

        pl.figure()
        pl.bar(V_header, v_values, color=(0.2, 0.4, 0.6, 0.6))
        pl.xlabel('Accuracy Perameters', fontweight='bold', color='orange', fontsize='17', horizontalalignment='center')
        pl.savefig(os.path.join(Workdirpath, htmlOutDir, "2.png"))
        HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname))

    elif Test_Method == "Predict":
        X_train, y_train, X_test, _ = ReturnData(TrainData, Test_Method, TestData)
        x_train, x_test = _scale(X_train, X_test)

        # BUG FIX: the original called `model.fit(...)` here, but no name
        # `model` exists in this scope -- the estimator parameter is `Algo`.
        predicted = Algo.fit(x_train, y_train).predict(x_test)
        return predicted
329 | |
def SVM_Classifier(C, kernel, degree, gamma, coef0, shrinking, probability, tol, cache_size, verbose, max_iter, decision_function_shape, randomState, breakties, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn SVC from Galaxy CLI string arguments and evaluate it.

    All parameters arrive as strings from the command line ('true'/'false'
    booleans, 'none' for None); they are converted here before being handed
    to SVC.  Evaluation/reporting is delegated to Fit_Model.
    """
    # BUG FIX: the original compared against the None object; every sibling
    # wrapper uses the CLI sentinel string 'none'.  Accept both.
    if randomState in (None, 'none'):
        randomState = None
    else:
        randomState = int(randomState)

    if cache_size in (None, 'none'):
        cache_size = None
    else:
        cache_size = float(cache_size)

    # BUG FIX: the original used `if probability or shrinking == 'true'`,
    # which (a) parses as `probability or (shrinking == 'true')` and (b)
    # forced both flags to the same value.  Convert each independently.
    probability = (probability == 'true')
    shrinking = (shrinking == 'true')
    verbose = (verbose == 'true')
    breakties = (breakties == 'true')

    pera = {
        'C': float(C),
        'kernel': kernel,
        'degree': int(degree),                 # default=3
        'gamma': gamma,                        # default='scale'
        'coef0': float(coef0),                 # default=0.0
        'shrinking': shrinking,
        'probability': probability,            # must be True for predict_proba
        'tol': float(tol),                     # default=1e-3
        'cache_size': cache_size,
        'verbose': verbose,
        'max_iter': int(max_iter),             # default=-1
        'decision_function_shape': decision_function_shape,
        'random_state': randomState,
        'break_ties': breakties,
    }

    model = SVC(**pera)

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
386 | |
387 | |
def SGD_Classifier(loss, penalty, alpha, l1_ratio, fit_intercept, max_iter, tol, shuffle, verbose, epsilon, n_jobs, random_state, learning_rate, eta0, power_t, early_stopping, validation_fraction, n_iter_no_change, warm_start, average, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn SGDClassifier from CLI string arguments and run it.

    'true'/'false' strings become booleans, 'none' becomes None, numeric
    strings are cast; evaluation is delegated to Fit_Model.
    """
    n_jobs = None if n_jobs == 'none' else int(n_jobs)
    random_state = None if random_state == 'none' else int(random_state)

    fit_intercept = fit_intercept == 'true'
    shuffle = shuffle == 'true'
    early_stopping = early_stopping == 'true'
    warm_start = warm_start == 'true'
    average = average == 'true'

    model = SGDClassifier(
        loss=loss,
        penalty=penalty,
        alpha=float(alpha),                              # default=0.0001
        l1_ratio=float(l1_ratio),                        # default=0.15
        fit_intercept=fit_intercept,                     # default=True
        max_iter=int(max_iter),                          # default=1000
        tol=float(tol),                                  # default=1e-3
        shuffle=shuffle,
        verbose=int(verbose),                            # default=0
        epsilon=float(epsilon),                          # default=0.1
        n_jobs=n_jobs,                                   # default=None
        random_state=random_state,                       # default=None
        learning_rate=learning_rate,
        eta0=float(eta0),                                # default=0.0
        power_t=float(power_t),                          # default=0.5
        early_stopping=early_stopping,
        validation_fraction=float(validation_fraction),  # default=0.1
        n_iter_no_change=int(n_iter_no_change),          # default=5
        warm_start=warm_start,
        average=average,
    )

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
449 | |
450 | |
def DT_Classifier(criterion, splitter, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, random_state, max_leaf_nodes, min_impurity_decrease, min_impurity_split, presort, ccpalpha, max_features, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn DecisionTreeClassifier from CLI strings and run it.

    A '.' in a numeric argument selects the float (fraction) form, otherwise
    the int form; 'none' maps to None.  Evaluation is via Fit_Model.
    """
    max_depth = None if max_depth == 'none' else int(max_depth)

    min_samples_split = float(min_samples_split) if '.' in min_samples_split else int(min_samples_split)
    # BUG FIX: the original assigned float(min_samples_leaf) to
    # min_samples_split, clobbering it and leaving the leaf value unconverted.
    min_samples_leaf = float(min_samples_leaf) if '.' in min_samples_leaf else int(min_samples_leaf)

    if max_features == 'none':
        max_features = None
    else:
        max_features = float(max_features) if '.' in max_features else int(max_features)

    random_state = None if random_state == 'none' else int(random_state)
    max_leaf_nodes = None if max_leaf_nodes == 'none' else int(max_leaf_nodes)

    pera = {
        "criterion": criterion,
        "splitter": splitter,
        "max_depth": max_depth,                                           # int, default=None
        # BUG FIX: no int() re-cast here -- the original wrapped these in
        # int(), silently truncating the float (fraction) forms parsed above.
        "min_samples_split": min_samples_split,                           # default=2
        "min_samples_leaf": min_samples_leaf,                             # default=1
        "min_weight_fraction_leaf": float(min_weight_fraction_leaf),      # default=0.0
        "random_state": random_state,                                     # default=None
        "max_leaf_nodes": max_leaf_nodes,                                 # default=None
        "min_impurity_decrease": float(min_impurity_decrease),            # default=0.0
        "min_impurity_split": float(min_impurity_split),                  # default=1e-7
        "presort": presort,                                               # deprecated
        'ccp_alpha': float(ccpalpha),                                     # non-negative float, default=0.0
        'max_features': max_features,                                     # int, float or {"auto","sqrt","log2"}, default=None
    }

    model = DecisionTreeClassifier(**pera)

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
506 | |
507 | |
def GB_Classifier(loss, learning_rate, n_estimators, subsample, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, min_impurity_split, init, random_state, verbose, max_leaf_nodes, warm_start, presort, validation_fraction, n_iter_no_change, tol, ccpalpha, max_features, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn GradientBoostingClassifier from CLI strings, run it.

    A '.' in a numeric argument selects the float (fraction) form, otherwise
    int; 'none' maps to None, 'true'/'false' to booleans.  Evaluation is via
    Fit_Model.
    """
    min_samples_split = float(min_samples_split) if '.' in min_samples_split else int(min_samples_split)
    # BUG FIX: the original assigned float(min_samples_leaf) to
    # min_samples_split, clobbering it and leaving the leaf value unconverted.
    min_samples_leaf = float(min_samples_leaf) if '.' in min_samples_leaf else int(min_samples_leaf)

    if max_features == 'none':
        max_features = None
    else:
        max_features = float(max_features) if '.' in max_features else int(max_features)

    random_state = None if random_state == 'none' else int(random_state)
    max_leaf_nodes = None if max_leaf_nodes == 'none' else int(max_leaf_nodes)
    warm_start = warm_start == 'true'
    n_iter_no_change = None if n_iter_no_change == 'none' else int(n_iter_no_change)
    init = None if init == 'none' else init

    pera = {
        "loss": loss,
        "learning_rate": float(learning_rate),
        "n_estimators": int(n_estimators),                            # int (default=100)
        "subsample": float(subsample),                                # float, optional (default=1.0)
        "criterion": criterion,
        "min_samples_split": min_samples_split,                       # int, float, optional (default=2)
        "min_samples_leaf": min_samples_leaf,                         # int, float, optional (default=1)
        "min_weight_fraction_leaf": float(min_weight_fraction_leaf),  # float, optional (default=0.)
        "max_depth": int(max_depth),                                  # integer, optional (default=3)
        "min_impurity_decrease": float(min_impurity_decrease),        # float, optional (default=0.)
        "min_impurity_split": float(min_impurity_split),              # float, (default=1e-7)
        "init": init,                                                 # estimator or zero, optional (default=None)
        "random_state": random_state,                                 # int/RandomState/None (default=None)
        "verbose": int(verbose),                                      # int, default: 0
        "max_features": max_features,                                 # int/float/str/None (default=None)
        "max_leaf_nodes": max_leaf_nodes,                             # int or None, optional (default=None)
        "warm_start": warm_start,                                     # bool, default: False
        "presort": presort,                                           # deprecated
        "validation_fraction": float(validation_fraction),            # float, optional, default 0.1
        "n_iter_no_change": n_iter_no_change,                         # int, default None
        "tol": float(tol),                                            # default 1e-4
        "ccp_alpha": float(ccpalpha),                                 # non-negative float, default=0.0
    }

    model = GradientBoostingClassifier(**pera)

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
584 | |
585 | |
def RF_Classifier(n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, min_impurity_decrease, min_impurity_split, bootstrap, oob_score, n_jobs, random_state, verbose, warm_start, ccp_alpha, max_samples, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn RandomForestClassifier from CLI strings and run it.

    A '.' in a numeric argument selects the float (fraction) form, otherwise
    int; 'none' maps to None, 'true'/'false' to booleans.  Evaluation is via
    Fit_Model.
    """
    max_depth = None if max_depth == 'none' else int(max_depth)

    min_samples_split = float(min_samples_split) if '.' in min_samples_split else int(min_samples_split)
    # BUG FIX: the original assigned float(min_samples_leaf) to
    # min_samples_split, clobbering it and leaving the leaf value unconverted.
    min_samples_leaf = float(min_samples_leaf) if '.' in min_samples_leaf else int(min_samples_leaf)

    if max_features != 'auto':
        max_features = float(max_features) if '.' in max_features else int(max_features)

    max_leaf_nodes = None if max_leaf_nodes == 'none' else int(max_leaf_nodes)
    bootstrap = bootstrap == 'true'
    oob_score = oob_score == 'true'
    n_jobs = None if n_jobs == 'none' else int(n_jobs)
    random_state = None if random_state == 'none' else int(random_state)
    warm_start = warm_start == 'true'

    if max_samples == 'none':
        max_samples = None
    else:
        max_samples = float(max_samples) if '.' in max_samples else int(max_samples)

    pera = {
        "n_estimators": int(n_estimators),                            # integer, optional (default=100)
        "criterion": criterion,                                       # string, optional (default='gini')
        "max_depth": max_depth,                                       # integer or None, optional (default=None)
        "min_samples_split": min_samples_split,                       # int, float, optional (default=2)
        "min_samples_leaf": min_samples_leaf,                         # int, float, optional (default=1)
        "min_weight_fraction_leaf": float(min_weight_fraction_leaf),  # float, optional (default=0.)
        "max_features": max_features,                                 # int/float/str/None, optional (default='auto')
        "max_leaf_nodes": max_leaf_nodes,                             # int or None, optional (default=None)
        "min_impurity_decrease": float(min_impurity_decrease),        # float, optional (default=0.)
        # BUG FIX: the original passed float(min_samples_split) here.
        "min_impurity_split": float(min_impurity_split),              # float, (default=1e-7)
        "bootstrap": bootstrap,                                       # boolean, optional (default=True)
        "oob_score": oob_score,                                       # bool (default=False)
        "n_jobs": n_jobs,                                             # int or None, optional (default=None)
        "random_state": random_state,                                 # int/RandomState/None (default=None)
        "verbose": int(verbose),                                      # int, optional (default=0)
        "warm_start": warm_start,                                     # bool, optional (default=False)
        "ccp_alpha": float(ccp_alpha),                                # non-negative float, optional (default=0.0)
        "max_samples": max_samples,                                   # int or float, default=None
    }

    model = RandomForestClassifier(**pera)

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
674 | |
675 | |
def LR_Classifier(penalty, dual, tol, C, fit_intercept, intercept_scaling, random_state, solver, max_iter, multi_class, verbose, warm_start, n_jobs, l1_ratio, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn LogisticRegression from CLI string arguments and run it.

    'true'/'false' strings become booleans, 'none' becomes None, numeric
    strings are cast; evaluation is delegated to Fit_Model.
    """
    dual = dual == 'true'
    fit_intercept = fit_intercept == "true"
    warm_start = warm_start == 'true'

    random_state = None if random_state == 'none' else int(random_state)
    n_jobs = None if n_jobs == "none" else int(n_jobs)
    l1_ratio = None if l1_ratio == "none" else float(l1_ratio)

    model = LogisticRegression(
        penalty=penalty,                              # default 'l2'
        dual=dual,                                    # default False
        tol=float(tol),                               # default 1e-4
        C=float(C),                                   # default 1.0
        fit_intercept=fit_intercept,                  # default True
        intercept_scaling=float(intercept_scaling),   # default 1
        random_state=random_state,                    # default None
        solver=solver,                                # default 'lbfgs'
        max_iter=int(max_iter),                       # default 100
        multi_class=multi_class,                      # default 'auto'
        verbose=int(verbose),                         # default 0
        warm_start=warm_start,                        # default False
        n_jobs=n_jobs,                                # default None
        l1_ratio=l1_ratio,                            # default None
    )

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
727 | |
728 | |
def KN_Classifier(n_neighbors, weights, algorithm, leaf_size, p, metric, metric_params, n_jobs, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn KNeighborsClassifier from CLI strings and run it.

    NOTE(review): `metric_params` is accepted but never forwarded to the
    estimator (original behavior, preserved) -- confirm intent.
    """
    jobs = None if n_jobs == 'none' else int(n_jobs)

    model = KNeighborsClassifier(
        n_neighbors=int(n_neighbors),  # default 5
        weights=weights,
        algorithm=algorithm,
        leaf_size=int(leaf_size),      # default 30
        p=int(p),                      # default 2
        metric=metric,                 # default 'minkowski'
        n_jobs=jobs,                   # default None
    )

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
749 | |
def GNB_Classifier(var_smoothing, TrainFile, TestMethod, SelectedSclaer, NFolds, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn GaussianNB from a CLI string argument and run it.

    `var_smoothing` arrives as a string and is cast to float; evaluation is
    delegated to Fit_Model.
    """
    model = GaussianNB(var_smoothing=float(var_smoothing))

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=int(NFolds), TestData=TestFile)
759 | |
760 | |
def MLP_Classifier(hidden_layer_sizes, activation, solver, alpha, batch_size, learning_rate, learning_rate_init, power_t, max_iter, shuffle, random_state, tol, verbose, warm_start, momentum, nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2, epsilon, n_iter_no_change, max_fun, TrainFile, TestMethod, SelectedSclaer, NFolds, Testspt, TestFile, OutFile, htmlOutDir, htmlFname, Workdirpath):
    """Build an sklearn MLPClassifier from CLI string arguments and run it.

    BUG FIX: unlike every sibling wrapper, the original passed the numeric
    hyper-parameters (alpha, tol, max_iter, ...) to MLPClassifier as raw CLI
    strings; they are now cast to the proper types.  Evaluation is delegated
    to Fit_Model (which also receives the TestSplit fraction `Testspt`).
    """
    shuffle = shuffle == 'true'
    nesterovs_momentum = nesterovs_momentum == 'true'
    early_stopping = early_stopping == 'true'
    warm_start = warm_start == 'true'
    verbose = verbose != 'false'
    random_state = None if random_state == 'none' else int(random_state)

    # Presumably a comma-separated layer list from the Galaxy form,
    # e.g. "100" or "100,50" -- TODO confirm against the tool XML.
    if isinstance(hidden_layer_sizes, str):
        hidden_layer_sizes = tuple(
            int(v) for v in hidden_layer_sizes.replace('(', '').replace(')', '').split(',') if v.strip()
        )

    # batch_size is either the literal 'auto' or an integer.
    if isinstance(batch_size, str) and batch_size != 'auto':
        batch_size = int(batch_size)

    pera = {
        'hidden_layer_sizes': hidden_layer_sizes,              # default (100,)
        'activation': activation,                              # default 'relu'
        'solver': solver,                                      # default 'adam'
        'alpha': float(alpha),                                 # default 0.0001
        'batch_size': batch_size,                              # default 'auto'
        'learning_rate': learning_rate,                        # default 'constant'
        'learning_rate_init': float(learning_rate_init),       # default 0.001
        'power_t': float(power_t),                             # default 0.5
        'max_iter': int(max_iter),                             # default 200
        'shuffle': shuffle,                                    # default True
        'random_state': random_state,                          # default None
        'tol': float(tol),                                     # default 0.0001
        'verbose': verbose,                                    # default False
        'warm_start': warm_start,                              # default False
        'momentum': float(momentum),                           # default 0.9
        'nesterovs_momentum': nesterovs_momentum,              # default True
        'early_stopping': early_stopping,                      # default False
        'validation_fraction': float(validation_fraction),     # default 0.1
        'beta_1': float(beta_1),                               # default 0.9
        'beta_2': float(beta_2),                               # default 0.999
        'epsilon': float(epsilon),                             # default 1e-08
        'n_iter_no_change': int(n_iter_no_change),             # default 10
        'max_fun': int(max_fun),                               # default 15000
    }

    model = MLPClassifier(**pera)

    Fit_Model(TrainData=TrainFile, Test_Method=TestMethod, Algo=model, Selected_Sclaer=SelectedSclaer, Workdirpath=Workdirpath, htmlOutDir=htmlOutDir, OutFile=OutFile, htmlFname=htmlFname, NoOfFolds=NFolds, TestSize=Testspt, TestData=TestFile)
822 | |
823 | |
824 if __name__=="__main__": | |
825 | |
826 import argparse | |
827 | |
828 parser = argparse.ArgumentParser(description='Deployment tool') | |
829 subparsers = parser.add_subparsers() | |
830 | |
831 svmc = subparsers.add_parser('SVMC') | |
832 svmc.add_argument("--C", required=False, default=1.0, help="Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.") | |
833 svmc.add_argument("--kernel", required=False, default='rbf', help="Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples).") | |
834 svmc.add_argument("--degree", required=False, default=3, help="Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.") | |
835 svmc.add_argument("--gamma", required=False, default='scale', help="Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma, if 'auto', uses 1 / n_features.") | |
836 svmc.add_argument("--coef0", required=False, default=0.0, help="Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.") | |
837 svmc.add_argument("--shrinking", required=False, default=True, help="Whether to use the shrinking heuristic.") | |
838 svmc.add_argument("--probability", required=False, default=True, help="Whether to enable probability estimates. This must be enabled prior to calling fit, will slow down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with predict") | |
839 svmc.add_argument("--tol", required=False, default=0.001, help="Tolerance for stopping criterion.") | |
840 svmc.add_argument("--cache_size", required=False, default=200, help="Specify the size of the kernel cache (in MB).") | |
841 svmc.add_argument("--verbose", required=False, default=False, help="Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context.") | |
842 svmc.add_argument("--max_iter", required=False, default=-1, help="Hard limit on iterations within solver, or -1 for no limit.") | |
843 svmc.add_argument("--decision_function_shape", required=False, default='ovr', help="Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one ('ovo') is always used as multi-class strategy.") | |
844 svmc.add_argument("--randomState", required=False, default=None, help="The seed of the pseudo random number generator used when shuffling the data for probability estimates. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.") | |
845 svmc.add_argument("--breakties", required=False, default=False, help="If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties according to the confidence values of decision_function; otherwise the first class among the tied classes is returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple predict." ) | |
846 svmc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'") | |
847 svmc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'") | |
848 svmc.add_argument("--SelectedSclaer", required=True, help="'Min_Max','Standard_Scaler','No_Scaler'") | |
849 svmc.add_argument("--NFolds", required=False, default=5, help="int, Max=10") | |
850 svmc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'") | |
851 svmc.add_argument("--OutFile", required=False, default='Out.csv', help="Out.csv") | |
852 svmc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir") | |
853 svmc.add_argument("--htmlFname", required=False, default='Out.html', help="HTML out file") | |
854 svmc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path") | |
855 | |
856 sgdc = subparsers.add_parser('SGDC') | |
857 sgdc.add_argument("--loss", required=False, default='log', help="The loss function to be used. Defaults to 'hinge', which gives a linear SVM. The possible options are 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', 'huber', 'epsilon_insensitive', or squared_epsilon_insensitive'.") | |
858 sgdc.add_argument("--penalty", required=False, default='l2', help="The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 'l1' and 'elasticnet' might bring sparsity to the model (feature selection) not achievable with 'l2'.") | |
859 sgdc.add_argument("--alpha", required=False, default=0.0001, help="Constant that multiplies the regularization term. Defaults to 0.0001. Also used to compute learning_rate when set to 'optimal'.") | |
860 sgdc.add_argument("--l1_ratio", required=False, default=0.15, help="The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Defaults to 0.15.") | |
861 sgdc.add_argument("--fit_intercept", required=False, default=True, help="Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True.") | |
862 sgdc.add_argument("--max_iter", required=False, default=1000, help="The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the fit method, and not the partial_fit method.") | |
863 sgdc.add_argument("--tol", required=False, default=0.001, help="The stopping criterion. If it is not None, the iterations will stop when (loss > best_loss - tol) for n_iter_no_change consecutive epochs.") | |
864 sgdc.add_argument("--shuffle", required=False, default=True, help="Whether or not the training data should be shuffled after each epoch. Defaults to True.") | |
865 sgdc.add_argument("--verbose", required=False, default=0, help="The verbosity level.") | |
866 sgdc.add_argument("--epsilon", required=False, default=0.1, help="Epsilon in the epsilon-insensitive loss functions; only if loss is 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. For 'huber', determines the threshold at which it becomes less important to get the prediction exactly right. For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold.") | |
867 sgdc.add_argument("--n_jobs", required=False, default='none', help="The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details.") | |
868 sgdc.add_argument("--random_state", required=False, default='none', help="The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.") | |
869 sgdc.add_argument("--learning_rate", required=False, default='optimal', help="The learning rate schedule:") | |
870 sgdc.add_argument("--eta0", required=False, default=0.0, help="eta = eta0") | |
871 sgdc.add_argument("--power_t", required=False, default=0.5, help="eta = 1.0 / (alpha * (t + t0)) where t0 is chosen by a heuristic proposed by Leon Bottou.") | |
872 sgdc.add_argument("--early_stopping", required=False, default=False, help="MinMaxScaler") | |
873 sgdc.add_argument("--validation_fraction", required=False, default=0.1, help="The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.") | |
874 sgdc.add_argument("--n_iter_no_change", required=False, default=5, help="Number of iterations with no improvement to wait before early stopping.") | |
875 sgdc.add_argument("--warm_start", required=False, default=False, help="When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.") | |
876 sgdc.add_argument("--average", required=False, default=False, help="MinMaxScaler") | |
877 sgdc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'") | |
878 sgdc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'") | |
879 sgdc.add_argument("--SelectedSclaer", required=True, help="'Min_Max','Standard_Scaler','No_Scaler'") | |
880 sgdc.add_argument("--NFolds", required=False, default=5, help="int, Max=10") | |
881 sgdc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'") | |
882 sgdc.add_argument("--OutFile", required=False, default='Out.csv', help="float, Max=1.0") | |
883 sgdc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir") | |
884 sgdc.add_argument("--htmlFname", required=False, default='Out.html', help="") | |
885 sgdc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path") | |
886 | |
887 dtc = subparsers.add_parser('DTC') | |
888 dtc.add_argument("--criterion", required=False, default='gini', help="The function to measure the quality of a split. Supported criteria are 'gini' for the Gini impurity and 'entropy' for the information gain.") | |
889 dtc.add_argument("--splitter", required=False, default='best', help="The strategy used to choose the split at each node. Supported strategies are 'best' to choose the best split and 'random' to choose the best random split." ) | |
890 dtc.add_argument("--max_depth", required=False, default='none', help="The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.") | |
891 dtc.add_argument("--min_samples_split", required=False, default='2', help="The minimum number of samples required to split an internal node: If int, then consider min_samples_split as the minimum number. If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.") | |
892 dtc.add_argument("--min_samples_leaf", required=False, default='1', help="The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.") | |
893 dtc.add_argument("--min_weight_fraction_leaf", required=False, default=0.0, help="The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.") | |
894 dtc.add_argument("--random_state", required=False, default='none', help="If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.") | |
895 dtc.add_argument("--max_leaf_nodes", required=False, default='none', help="A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following") | |
896 dtc.add_argument("--min_impurity_decrease", required=False, default=0.0, help="") | |
897 dtc.add_argument("--min_impurity_split", required=False, default=1e-09, help="Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.") | |
898 dtc.add_argument("--presort", required=False, default='deprecate', help="This parameter is deprecated and will be removed in v0.24.") | |
899 dtc.add_argument("--ccpalpha", required=False, default=0.0, help="Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen. By default, no pruning is performed. See Minimal Cost-Complexity Pruning for details.") | |
900 dtc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'") | |
901 dtc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'") | |
902 dtc.add_argument("--max_features", required=False, default='none') | |
903 dtc.add_argument("--SelectedSclaer", required=True, help="'Min_Max',Standard_Scaler','No_Scaler'") | |
904 dtc.add_argument("--NFolds", required=False, default=5, help="int, Max=10") | |
905 dtc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'") | |
906 dtc.add_argument("--OutFile", required=False, default='Out.csv', help="Out.tsv") | |
907 dtc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir") | |
908 dtc.add_argument("--htmlFname", required=False, default='Out.html', help="") | |
909 dtc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path") | |
910 | |
911 gbc = subparsers.add_parser('GBC') | |
912 gbc.add_argument("--loss", required=False, default='deviance', help="loss function to be optimized. 'deviance' refers to deviance (= logistic regression) for classification with probabilistic outputs. For loss 'exponential' gradient boosting recovers the AdaBoost algorithm.") | |
913 gbc.add_argument("--learning_rate", required=False, default=0.1, help="learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.") | |
914 gbc.add_argument("--n_estimators", required=False, default=100, help="The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.") | |
915 gbc.add_argument("--subsample", required=False, default=1.0, help="The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. subsample interacts with the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias.") | |
916 gbc.add_argument("--criterion", required=False,default='friedman_mse', help="The function to measure the quality of a split. Supported criteria are 'friedman_mse' for the mean squared error with improvement score by Friedman, 'mse' for mean squared error, and 'mae' for the mean absolute error. The default value of 'friedman_mse' is generally the best as it can provide a better approximation in some cases.") | |
917 gbc.add_argument("--min_samples_split", required=False, default='2', help="The minimum number of samples required to split an internal node: If int, then consider min_samples_split as the minimum number. If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.") | |
918 gbc.add_argument("--min_samples_leaf", required=False, default='1', help="The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.If int, then consider min_samples_leaf as the minimum number. If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.") | |
919 gbc.add_argument("--min_weight_fraction_leaf", required=False, default=0, help="The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.") | |
920 gbc.add_argument("--max_depth", required=False, default=3, help="maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables.") | |
921 gbc.add_argument("--min_impurity_decrease", required=False, default=0.0, help="A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following: 'N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity'), where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child. N, N_t, N_t_R and N_t_L all refer to the weighted sum, if sample_weight is passed. New in version 0.19.") | |
922 gbc.add_argument("--min_impurity_split", required=False, default=0.00000007, help="Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.") | |
923 gbc.add_argument("--init", required=False,default='none', help="An estimator object that is used to compute the initial predictions. init has to provide fit and predict_proba. If 'zero', the initial raw predictions are set to zero. By default, a DummyEstimator predicting the classes priors is used.") | |
924 gbc.add_argument("--random_state", required=False, default='none', help="If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.") | |
925 gbc.add_argument("--max_features", required=False, default='none', help="The number of features to consider when looking for the best split: If int, then consider max_features features at each split. If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.If 'auto', then max_features=sqrt(n_features). If 'sqrt', then max_features=sqrt(n_features). If 'log2', then max_features=log2(n_features). If None, then max_features=n_features. Choosing max_features < n_features leads to a reduction of variance and an increase in bias. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.") | |
926 gbc.add_argument("--verbose",required=False, default=0, help="Enable verbose output. If 1 then it prints progress and performance once in a while (the more trees the lower the frequency). If greater than 1 then it prints progress and performance for every tree.") | |
927 gbc.add_argument("--max_leaf_nodes", required=False, default=4, help="Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.") | |
928 gbc.add_argument("--warm_start", required=False, default='false', help="When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just erase the previous solution." ) | |
929 gbc.add_argument("--presort", required=False,default='auto', help="This parameter is deprecated and will be removed in v0.24.") | |
930 gbc.add_argument("--validation_fraction", required=False, default=0.1, help="The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if n_iter_no_change is set to an integer.") | |
931 gbc.add_argument("--n_iter_no_change", required=False, default=10, help="n_iter_no_change is used to decide if early stopping will be used to terminate training when validation score is not improving. By default it is set to None to disable early stopping. If set to a number, it will set aside validation_fraction size of the training data as validation and terminate training when validation score is not improving in all of the previous n_iter_no_change numbers of iterations. The split is stratified.") | |
932 gbc.add_argument("--tol", required=False, default=0.0001, help="Tolerance for the early stopping. When the loss is not improving by at least tol for n_iter_no_change iterations (if set to a number), the training stops.") | |
933 gbc.add_argument("--ccpalpha", required=False, default=0.0, help="Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen. By default, no pruning is performed. See Minimal Cost-Complexity Pruning for details.") | |
934 gbc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'") | |
935 gbc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'") | |
936 gbc.add_argument("--SelectedSclaer", required=True, help="'Min_Max',Standard_Scaler','No_Scaler'") | |
937 gbc.add_argument("--NFolds", required=False, default=5, help="int, Max=10") | |
938 gbc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'") | |
939 gbc.add_argument("--OutFile", required=False, default='Out.csv', help="Out.tsv") | |
940 gbc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir") | |
941 gbc.add_argument("--htmlFname", required=False, default='Out.html', help="") | |
942 gbc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path") | |
943 | |
944 rfc = subparsers.add_parser('RFC') | |
945 rfc.add_argument("--n_estimators", required=False, default=100, help="The number of trees in the forest.") | |
946 rfc.add_argument("--criterion", required=False, default='gini', help="The function to measure the quality of a split. Supported criteria are 'gini' for the Gini impurity and 'entropy' for the information gain. Note: this parameter is tree-specific." ) | |
947 rfc.add_argument("--max_depth", required=False, default='none', help="The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.") | |
948 rfc.add_argument("--min_samples_split", required=False, default='2', help="The minimum number of samples required to split an internal node: If int, then consider min_samples_split as the minimum number. If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.") | |
949 rfc.add_argument("--min_samples_leaf", required=False, default='1', help="The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.") | |
950 rfc.add_argument("--min_weight_fraction_leaf", required=False, default=0.0, help="The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.") | |
951 rfc.add_argument("--max_features", required=False, default='auto', help="The number of features to consider when looking for the best split:") | |
952 rfc.add_argument("--max_leaf_nodes", required=False, default='none', help="Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.") | |
953 rfc.add_argument("--min_impurity_decrease", required=False, default=0.0, help="A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child. N, N_t, N_t_R and N_t_L all refer to the weighted sum, if sample_weight is passed. New in version 0.19.") | |
954 rfc.add_argument("--min_impurity_split", required=False, default=1e-7, help="Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.") | |
955 rfc.add_argument("--bootstrap", required=False, default='true', help="Whether bootstrap samples are used when building trees. If False, the whole datset is used to build each tree.") | |
956 rfc.add_argument("--oob_score", required=False, default='false', help="Whether to use out-of-bag samples to estimate the generalization accuracy.") | |
957 rfc.add_argument("--n_jobs", required=False, default=-1, help="The number of jobs to run in parallel. fit, predict, decision_path and apply are all parallelized over the trees. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details." ) | |
958 rfc.add_argument("--random_state", required=False, default='none', help="Controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True) and the sampling of the features to consider when looking for the best split at each node (if max_features < n_features). See Glossary for details.") | |
959 rfc.add_argument("--verbose", required=False, default=0, help="Controls the verbosity when fitting and predicting." ) | |
960 rfc.add_argument("--max_samples", required=False, default='none', help="") | |
961 rfc.add_argument("--ccp_alpha", required=False, default=0.0, help="") | |
962 rfc.add_argument("--warm_start", required=False, default='false', help="When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See the Glossary.") | |
963 rfc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'") | |
964 rfc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'") | |
965 rfc.add_argument("--SelectedSclaer", required=True, help="'Min_Max',Standard_Scaler','No_Scaler'") | |
966 rfc.add_argument("--NFolds", required=False, default=5, help="int, Max=10") | |
967 rfc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'") | |
968 rfc.add_argument("--OutFile", required=False, default='Out.csv', help="Out.tsv") | |
969 rfc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir") | |
970 rfc.add_argument("--htmlFname", required=False, default='Out.html', help="") | |
971 rfc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path") | |
972 | |
973 lrc = subparsers.add_parser('LRC') | |
974 lrc.add_argument("--penalty", required=False, default='l2', help="Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. If 'none' (not supported by the liblinear solver), no regularization is applied." ) | |
975 lrc.add_argument("--dual", required=False, default='false', help="Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.") | |
976 lrc.add_argument("--tol", required=False, default=0.0001, help="Tolerance for stopping criteria.") | |
# ---------------------------------------------------------------------------
# 'LRC' sub-command (sklearn LogisticRegression), continued: remaining
# hyper-parameters plus the I/O options shared by every sub-command.
# NOTE(review): no type= is declared, so CLI-supplied values arrive as strings
# while the defaults keep their Python types; presumably the LR_Classifier
# driver casts them -- confirm downstream.
lrc.add_argument("--C", required=False, default=1.0, help="Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization." )
lrc.add_argument("--fit_intercept", required=False, default='true', help="Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function." )
lrc.add_argument("--intercept_scaling", required=False, default=1, help="Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], i.e. a 'synthetic' feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight." )
lrc.add_argument("--random_state", required=False, default=10, help="The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. Used when solver == 'sag' or 'liblinear'.")
lrc.add_argument("--solver", required=False, default='lbfgs', help="Algorithm to use in the optimization problem. For small datasets, 'liblinear' is a good choice, whereas 'sag' and 'saga' are faster for large ones. For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle multinomial loss; 'liblinear' is limited to one-versus-rest schemes. 'newton-cg', 'lbfgs', 'sag' and 'saga' handle L2 or no penalty 'liblinear' and 'saga' also handle L1 penalty 'saga' also supports 'elasticnet' penalty 'liblinear' does not support setting penalty='none' Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. New in version 0.17: Stochastic Average Gradient descent solver.")
# Fix: a stray trailing comma previously turned this statement into a
# one-element tuple expression (a harmless but misleading no-op).
lrc.add_argument("--max_iter", required=False, default=100, help="Maximum number of iterations taken for the solvers to converge.")
lrc.add_argument("--multi_class", required=False, default='auto', help="If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, even when the data is binary. 'multinomial' is unavailable when solver='liblinear'. 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', and otherwise selects 'multinomial'. New in version 0.18: Stochastic Average Gradient descent solver for 'multinomial' case.")
lrc.add_argument("--verbose", required=False, default=0, help="For the liblinear and lbfgs solvers set verbose to any positive number for verbosity.")
lrc.add_argument("--warm_start", required=False, default='false', help="When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See the Glossary. New in version 0.17: warm_start to support lbfgs, newton-cg, sag, saga solvers.")
lrc.add_argument("--n_jobs", required=False, default='none', help="Number of CPU cores used when parallelizing over classes if multi_class='ovr'. This parameter is ignored when the solver is set to 'liblinear' regardless of whether 'multi_class' is specified or not. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details." )
lrc.add_argument("--l1_ratio", required=False, default='none', help="The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'. Setting 'l1_ratio=0 is equivalent to using penalty='l2', while setting l1_ratio=1 is equivalent to using penalty='l1'. For 0 < l1_ratio <1, the penalty is a combination of L1 and L2.")
# Shared I/O options (identical flags on every sub-command; '--SelectedSclaer'
# is a historical misspelling of 'scaler' kept for interface stability).
lrc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'")
lrc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'")
lrc.add_argument("--SelectedSclaer", required=True, help="'Min_Max',Standard_Scaler','No_Scaler'")
lrc.add_argument("--NFolds", required=False, default=5, help="int, Max=10")
lrc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'")
lrc.add_argument("--OutFile", required=False, default='Out.csv', help="Out.tsv")
lrc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir")
lrc.add_argument("--htmlFname", required=False, default='Out.html', help="")
lrc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path")
997 | |
# ---------------------------------------------------------------------------
# 'KNC' sub-command (sklearn KNeighborsClassifier): nearest-neighbour
# hyper-parameters plus the I/O options shared by every sub-command.
# NOTE(review): no type= conversions are declared, so CLI-supplied values are
# strings while the defaults keep their Python types -- presumably the
# KN_Classifier driver casts them; confirm downstream.
knc = subparsers.add_parser('KNC')
knc.add_argument("--n_neighbors", required=False, default=5, help="Number of neighbors to use by default for kneighbors queries.")
knc.add_argument("--weights",required=False, default='uniform', help="weight function used in prediction. Possible values: 'uniform' : uniform weights. All points in each neighborhood are weighted equally. 'distance' : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.")
knc.add_argument("--algorithm", required=False, default='auto', help="Algorithm used to compute the nearest neighbors:'ball_tree' will use BallTree 'kd_tree' will use KDTree 'brute' will use a brute-force search. 'auto' will attempt to decide the most appropriate algorithm based on the values passed to fit method. Note: fitting on sparse input will override the setting of this parameter, using brute force." )
knc.add_argument("--leaf_size", required=False, default=30, help="Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.")
knc.add_argument("--p", required=False, default=2, help="Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." )
knc.add_argument("--metric", required=False, default='minkowski', help="the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. If metric is 'precomputed', X is assumed to be a distance matrix and must be square during fit. X may be a Glossary, in which case only 'nonzero' elements may be considered neighbors.")
knc.add_argument("--metric_params", required=False, default=None, help="Additional keyword arguments for the metric function." )
knc.add_argument("--n_jobs", required=False, default='none', help="The number of parallel jobs to run for neighbors search. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. See Glossary for more details. Doesn't affect fit method.")
# Shared I/O options (identical flags on every sub-command; '--SelectedSclaer'
# is a historical misspelling of 'scaler' kept for interface stability).
knc.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'")
knc.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'")
knc.add_argument("--SelectedSclaer", required=True, help="'Min_Max',Standard_Scaler','No_Scaler'")
knc.add_argument("--NFolds", required=False, default=5, help="int, Max=10")
knc.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'")
knc.add_argument("--OutFile", required=False, default='Out.csv', help="Out.tsv")
knc.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir")
knc.add_argument("--htmlFname", required=False, default='Out.html', help="")
knc.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path")
1016 | |
# ---------------------------------------------------------------------------
# 'GNBC' sub-command (sklearn GaussianNB): declared data-driven -- one
# (flag, kwargs) pair per option, registered in order.
gnbc = subparsers.add_parser('GNBC')
# The --priors option is deliberately not exposed:
# gnbc.add_argument("--priors", required=False, default=None, help="Prior probabilities of the classes. If specified the priors are not adjusted according to the data.")
_gnbc_options = [
    ("--var_smoothing", {"required": False, "default": 1e-09, "help": "Portion of the largest variance of all features that is added to variances for calculation stability."}),
    ("--TrainFile", {"required": True, "default": None, "help": "Positive negative dataset Ex. 'Train.csv'"}),
    ("--TestMethod", {"required": True, "default": None, "help": "Internal','CrossVal', 'External', 'Predict'"}),
    ("--SelectedSclaer", {"required": True, "help": "'Min_Max',Standard_Scaler','No_Scaler'"}),
    ("--NFolds", {"required": False, "default": 5, "help": "int, Max=10"}),
    ("--TestFile", {"required": False, "default": None, "help": "Test data, 'Test.csv'"}),
    ("--OutFile", {"required": False, "default": 'Out.csv', "help": "Out.tsv"}),
    ("--htmlOutDir", {"required": False, "default": os.path.join(os.getcwd(),'report_dir'), "help": "HTML Out Dir"}),
    ("--htmlFname", {"required": False, "default": 'Out.html', "help": ""}),
    ("--Workdirpath", {"required": False, "default": os.getcwd(), "help": "Working Directory Path"}),
]
for _flag, _kwargs in _gnbc_options:
    gnbc.add_argument(_flag, **_kwargs)
1029 | |
# ---------------------------------------------------------------------------
# 'MLP' sub-command (sklearn MLPClassifier).
# Fix: every hyper-parameter previously had an empty help="" string, so
# `--help` documented nothing for this sub-command; the texts below summarise
# the corresponding sklearn.neural_network.MLPClassifier parameters.
# NOTE(review): as elsewhere, no type= is declared, so CLI values are strings
# while defaults keep their Python types -- the MLP_Classifier driver
# presumably converts them; confirm downstream.
MLP = subparsers.add_parser('MLP')
MLP.add_argument("--hidden_layer_sizes", required=False, default=(100,), help="The ith element represents the number of neurons in the ith hidden layer.")
MLP.add_argument("--activation", required=False, default='relu', help="Activation function for the hidden layer: 'identity', 'logistic', 'tanh' or 'relu'.")
MLP.add_argument("--solver", required=False, default='adam', help="The solver for weight optimization: 'lbfgs', 'sgd' or 'adam'.")
MLP.add_argument("--alpha", required=False, default=0.0001, help="L2 penalty (regularization term) parameter.")
MLP.add_argument("--batch_size", required=False, default='auto', help="Size of minibatches for stochastic optimizers; 'auto' uses min(200, n_samples).")
MLP.add_argument("--learning_rate", required=False, default='constant', help="Learning rate schedule for weight updates: 'constant', 'invscaling' or 'adaptive'. Only used when solver='sgd'.")
MLP.add_argument("--learning_rate_init", required=False, default=0.001, help="The initial learning rate used. Only used when solver='sgd' or 'adam'.")
MLP.add_argument("--power_t", required=False, default=0.5, help="The exponent for inverse scaling learning rate. Only used when solver='sgd'.")
MLP.add_argument("--max_iter", required=False, default=200, help="Maximum number of iterations.")
MLP.add_argument("--shuffle", required=False, default='true', help="Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'.")
MLP.add_argument("--random_state", required=False, default='none', help="Determines random number generation for weights and bias initialization.")
MLP.add_argument("--tol", required=False, default=0.0001, help="Tolerance for the optimization.")
MLP.add_argument("--verbose", required=False, default='false', help="Whether to print progress messages to stdout.")
MLP.add_argument("--warm_start", required=False, default='false', help="When set to True, reuse the solution of the previous call to fit as initialization.")
MLP.add_argument("--momentum", required=False, default=0.9, help="Momentum for gradient descent update; should be between 0 and 1. Only used when solver='sgd'.")
MLP.add_argument("--nesterovs_momentum", required=False, default='true', help="Whether to use Nesterov's momentum. Only used when solver='sgd' and momentum > 0.")
MLP.add_argument("--early_stopping", required=False, default='false', help="Whether to use early stopping to terminate training when validation score is not improving.")
MLP.add_argument("--validation_fraction", required=False, default=0.1, help="The proportion of training data to set aside as validation set for early stopping.")
MLP.add_argument("--beta_1", required=False, default=0.9, help="Exponential decay rate for estimates of first moment vector in adam; should be in [0, 1).")
MLP.add_argument("--beta_2", required=False, default=0.999, help="Exponential decay rate for estimates of second moment vector in adam; should be in [0, 1).")
MLP.add_argument("--epsilon", required=False, default=1e-08, help="Value for numerical stability in adam.")
MLP.add_argument("--n_iter_no_change", required=False, default=10, help="Maximum number of epochs to not meet tol improvement.")
MLP.add_argument("--max_fun", required=False, default=15000, help="Maximum number of loss function calls. Only used when solver='lbfgs'.")
# Shared I/O options ('--SelectedSclaer' keeps its historical misspelling for
# interface stability; MLP additionally takes --Testspt, a train/test split).
MLP.add_argument("--TrainFile", required=True, default=None, help="Positive negative dataset Ex. 'Train.csv'")
MLP.add_argument("--TestMethod", required=True, default=None, help="Internal','CrossVal', 'External', 'Predict'")
MLP.add_argument("--SelectedSclaer", required=True, help="'Min_Max',Standard_Scaler','No_Scaler'")
MLP.add_argument("--NFolds", required=False, default=5, help="int, Max=10")
MLP.add_argument("--Testspt", required=False, default=0.2, help="float, Max=1.0")
MLP.add_argument("--TestFile", required=False, default=None, help="Test data, 'Test.csv'")
MLP.add_argument("--OutFile", required=False, default='Out.csv', help="Out.tsv")
MLP.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="HTML Out Dir")
# NOTE(review): default "jai.html" differs from the other sub-commands'
# 'Out.html'; left unchanged in case the Galaxy wrapper relies on it.
MLP.add_argument("--htmlFname", required=False, help="HTML out file", default="jai.html")
MLP.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Working Directory Path")
1064 | |
args = parser.parse_args()

# Dispatch to the classifier driver matching the chosen sub-command.
# NOTE(review): this keys off sys.argv[1] rather than the parsed namespace, so
# the sub-command name must be the very first CLI token; wiring
# set_defaults(func=...) on each sub-parser would be sturdier, but the
# sub-parsers are created elsewhere in this file.
if sys.argv[1] == 'SVMC':
    SVM_Classifier(args.C, args.kernel, args.degree, args.gamma, args.coef0, args.shrinking, args.probability, args.tol, args.cache_size, args.verbose, args.max_iter, args.decision_function_shape, args.randomState, args.breakties, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'SGDC':
    SGD_Classifier( args.loss, args.penalty, args.alpha, args.l1_ratio, args.fit_intercept, args.max_iter, args.tol, args.shuffle, args.verbose, args.epsilon, args.n_jobs, args.random_state, args.learning_rate, args.eta0, args.power_t, args.early_stopping, args.validation_fraction, args.n_iter_no_change, args.warm_start, args.average, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'DTC':
    DT_Classifier(args.criterion, args.splitter, args.max_depth, args.min_samples_split, args.min_samples_leaf, args.min_weight_fraction_leaf, args.random_state, args.max_leaf_nodes, args.min_impurity_decrease, args.min_impurity_split, args.presort, args.ccpalpha, args.max_features, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'GBC':
    GB_Classifier(args.loss, args.learning_rate, args.n_estimators, args.subsample, args.criterion, args.min_samples_split, args.min_samples_leaf, args.min_weight_fraction_leaf, args.max_depth, args.min_impurity_decrease, args.min_impurity_split, args.init, args.random_state, args.verbose, args.max_leaf_nodes, args.warm_start, args.presort, args.validation_fraction, args.n_iter_no_change, args.tol, args.ccpalpha, args.max_features, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'RFC':
    RF_Classifier( args.n_estimators, args.criterion, args.max_depth, args.min_samples_split, args.min_samples_leaf, args.min_weight_fraction_leaf, args.max_features, args.max_leaf_nodes, args.min_impurity_decrease, args.min_impurity_split, args.bootstrap, args.oob_score, args.n_jobs, args.random_state, args.verbose, args.warm_start, args.ccp_alpha, args.max_samples, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'LRC':
    LR_Classifier(args.penalty, args.dual, args.tol, args.C, args.fit_intercept, args.intercept_scaling, args.random_state, args.solver, args.max_iter, args.multi_class, args.verbose, args.warm_start, args.n_jobs, args.l1_ratio, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'KNC':
    KN_Classifier(args.n_neighbors, args.weights, args.algorithm, args.leaf_size, args.p, args.metric, args.metric_params, args.n_jobs, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'GNBC':
    GNB_Classifier( args.var_smoothing, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
elif sys.argv[1] == 'MLP' :
    MLP_Classifier(args.hidden_layer_sizes, args.activation, args.solver, args.alpha, args.batch_size, args.learning_rate, args.learning_rate_init, args.power_t, args.max_iter, args.shuffle, args.random_state, args.tol, args.verbose, args.warm_start, args.momentum, args.nesterovs_momentum, args.early_stopping, args.validation_fraction, args.beta_1, args.beta_2, args.epsilon, args.n_iter_no_change, args.max_fun, args.TrainFile, args.TestMethod, args.SelectedSclaer, args.NFolds, args.Testspt, args.TestFile, args.OutFile, args.htmlOutDir, args.htmlFname, args.Workdirpath)
else:
    print ("option not correct")
    # Fix: was a bare exit(), which reported success (status 0) even for an
    # unrecognized sub-command; exit non-zero so callers can detect failure.
    sys.exit(1)
1088 |