Mercurial > repos > goeckslab > pycaret_compare
annotate feature_importance.py @ 0:915447b14520 draft
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
| author | goeckslab | 
|---|---|
| date | Wed, 11 Dec 2024 05:00:00 +0000 | 
| parents | |
| children | 4aa511539199 | 
| rev | line source | 
|---|---|
| 
0
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
1 import base64 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
2 import logging | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
3 import os | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
4 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
5 import matplotlib.pyplot as plt | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
6 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
7 import pandas as pd | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
8 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
9 from pycaret.classification import ClassificationExperiment | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
10 from pycaret.regression import RegressionExperiment | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
11 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)


class FeatureImportanceAnalyzer:
    """Produce feature-importance plots with PyCaret and embed them in HTML.

    The analyzer sets up a PyCaret experiment on a tabular dataset, trains a
    random forest (``'rf'``) and a LightGBM (``'lightgbm'``) model, saves one
    PNG per model into ``output_dir`` and renders an HTML fragment with the
    images inlined as base64.

    Attributes set by ``__init__``:
        data: the DataFrame handed to PyCaret.
        target_col: the 1-based target column index as given by the caller.
        target: name of the target column resolved from ``target_col``.
        task_type: ``'classification'`` selects a ClassificationExperiment,
            any other value a RegressionExperiment.
        exp: the PyCaret experiment instance.
        plots: mapping of plot name -> path of the saved PNG.
        output_dir: directory the PNG files are written to.
    """

    def __init__(
            self,
            task_type,
            output_dir,
            data_path=None,
            data=None,
            target_col=None):
        """Load the data and create the PyCaret experiment object.

        Args:
            task_type: ``'classification'`` or anything else for regression.
            output_dir: directory where plot images are saved.
            data_path: path of a delimited text file; used only when
                ``data`` is None.
            data: an already loaded DataFrame; used as-is (no column
                renaming, no imputation).
            target_col: 1-based position of the target column.

        Raises:
            ValueError: if neither ``data`` nor ``data_path`` is given.
            TypeError: if ``target_col`` cannot be converted by ``int()``
                (e.g. it is None).
            IndexError: if ``target_col`` is smaller than 1 or larger than
                the number of columns.
        """
        if data is not None:
            self.data = data
            LOG.info("Data loaded from memory")
        else:
            if data_path is None:
                raise ValueError(
                    "Either 'data' or 'data_path' must be provided")
            self.data = self._load_table(data_path)
        # Always record the caller's value so the attribute exists on both
        # the in-memory and the file-loading path.
        self.target_col = target_col
        self.task_type = task_type
        self.target = self._resolve_target(self.data.columns, target_col)
        if task_type == 'classification':
            self.exp = ClassificationExperiment()
        else:
            self.exp = RegressionExperiment()
        self.plots = {}
        self.output_dir = output_dir

    @staticmethod
    def _load_table(data_path):
        """Read a delimited file, normalise column names, impute medians."""
        # sep=None lets the python engine sniff the delimiter.
        table = pd.read_csv(data_path, sep=None, engine='python')
        # regex=False: the dot must be a literal dot regardless of the
        # pandas version's default for ``regex``.
        table.columns = table.columns.str.replace('.', '_', regex=False)
        # Missing numeric values are replaced by the column median;
        # non-numeric columns are left untouched.
        return table.fillna(table.median(numeric_only=True))

    @staticmethod
    def _resolve_target(columns, target_col):
        """Translate a 1-based column position into the column label."""
        position = int(target_col)
        if position < 1:
            # Without this guard 0 / negative values would silently pick a
            # column counted from the end.
            raise IndexError(
                f"target_col must be >= 1 (1-based), got {target_col!r}")
        return columns[position - 1]

    def setup_pycaret(self):
        """Run ``setup`` on the experiment with a fixed session id."""
        LOG.info("Initializing PyCaret")
        setup_params = {
            'target': self.target,
            'session_id': 123,
            'html': True,
            'log_experiment': False,
            'system_log': False,
        }
        LOG.info(self.task_type)
        LOG.info(self.exp)
        self.exp.setup(self.data, **setup_params)

    # NOTE: a coefficient table based on a linear model
    # (``save_coefficients``) used to be sketched here as commented-out
    # code; it is not part of the report.

    def save_tree_importance(self):
        """Plot random-forest feature importances to ``tree_importance.png``.

        The saved path is stored under ``self.plots['tree_importance']``.
        """
        model = self.exp.create_model('rf')
        importances = model.feature_importances_
        processed_features = self.exp.get_config('X_transformed').columns
        LOG.debug("Feature importances: %s", importances)
        LOG.debug("Features: %s", processed_features)
        feature_importances = pd.DataFrame({
            'Feature': processed_features,
            'Importance': importances,
        }).sort_values(by='Importance', ascending=False)
        plt.figure(figsize=(10, 6))
        plt.barh(
            feature_importances['Feature'],
            feature_importances['Importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importance (Random Forest)')
        plot_path = os.path.join(
            self.output_dir,
            'tree_importance.png')
        plt.savefig(plot_path)
        plt.close()
        self.plots['tree_importance'] = plot_path

    def save_shap_values(self):
        """Plot a SHAP summary for a LightGBM model to ``shap_summary.png``.

        The saved path is stored under ``self.plots['shap_summary']``.
        """
        # Imported lazily, and before training, so that a missing ``shap``
        # package is reported before any model is fitted.
        import shap

        model = self.exp.create_model('lightgbm')
        transformed = self.exp.get_config('X_transformed')
        explainer = shap.Explainer(model)
        shap_values = explainer.shap_values(transformed)
        # show=False keeps the figure open so it can be titled and saved.
        shap.summary_plot(shap_values, transformed, show=False)
        plt.title('Shap (LightGBM)')
        plot_path = os.path.join(
            self.output_dir, 'shap_summary.png')
        plt.savefig(plot_path)
        plt.close()
        self.plots['shap_summary'] = plot_path

    def generate_feature_importance(self):
        """Create both plots (random forest first, then SHAP)."""
        self.save_tree_importance()
        self.save_shap_values()

    def encode_image_to_base64(self, img_path):
        """Return the content of ``img_path`` as a base64 text string."""
        with open(img_path, 'rb') as img_file:
            return base64.b64encode(img_file.read()).decode('utf-8')

    def generate_html_report(self):
        """Return an HTML fragment with every plot in ``self.plots`` inlined.

        Each plot becomes a ``<div class="plot">`` holding a heading, a
        sub-heading (empty for anything but the tree-importance plot) and
        the PNG as a base64 data URI.
        """
        LOG.info("Generating HTML report")

        # Read and encode plot images
        sections = []
        for plot_name, plot_path in self.plots.items():
            encoded_image = self.encode_image_to_base64(plot_path)
            if plot_name == 'tree_importance':
                heading = ('Feature importance analysis from a '
                           'trained Random Forest')
                note = ('Use gini impurity for '
                        'calculating feature importance for classification '
                        'and Variance Reduction for regression')
            else:
                heading = 'SHAP Summary from a trained lightgbm'
                note = ''
            # Keep the payload directly after "base64," so the data URI
            # contains no embedded whitespace.
            img_src = f"data:image/png;base64,{encoded_image}"
            sections.append(f"""
            <div class="plot" id="{plot_name}">
                <h2>{heading}</h2>
                <h3>{note}</h3>
                <img src="{img_src}" alt="{plot_name}">
            </div>
            """)
        plots_html = "".join(sections)

        # Wrap the plot sections in the report heading
        html_content = f"""
            <h1>PyCaret Feature Importance Report</h1>
            {plots_html}
        """

        return html_content

    def run(self):
        """Set up PyCaret, build all plots and return the HTML fragment."""
        LOG.info("Running feature importance analysis")
        self.setup_pycaret()
        self.generate_feature_importance()
        html_content = self.generate_html_report()
        LOG.info("Feature importance analysis completed")
        return html_content
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
148 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
149 | 
| 
 
915447b14520
planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
 
goeckslab 
parents:  
diff
changeset
 | 
def parse_cli_args(argv=None):
    """Parse the command-line options of the feature importance script.

    All four options are mandatory: a missing one is reported by argparse
    as a usage error instead of being passed on as ``None``.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``data_path`` (str), ``target_col`` (int,
        always >= 1), ``task_type`` ("classification" or "regression")
        and ``output_dir`` (str).

    Raises:
        SystemExit: Raised by argparse (exit status 2) when an option is
            missing or has an invalid value.
    """
    # Imported locally, as the original script section did, so that merely
    # importing this module does not depend on argparse being loaded.
    import argparse

    def one_based_index(text):
        """Convert *text* to an int and reject anything below 1."""
        # A ValueError from int() is turned into a usage error by argparse.
        index = int(text)
        if index < 1:
            raise argparse.ArgumentTypeError(
                f"target column index is 1-based, got {index}")
        return index

    parser = argparse.ArgumentParser(description="Feature Importance Analysis")
    parser.add_argument(
        "--data_path", type=str, required=True,
        help="Path to the dataset")
    parser.add_argument(
        "--target_col", type=one_based_index, required=True,
        help="Index of the target column (1-based)")
    parser.add_argument(
        "--task_type", type=str, required=True,
        choices=["classification", "regression"],
        help="Task type: classification or regression")
    parser.add_argument(
        "--output_dir", type=str, required=True,
        help="Directory to save the outputs")
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_cli_args()

    # NOTE(review): arguments are passed positionally exactly as before;
    # confirm the order still matches FeatureImportanceAnalyzer.__init__.
    analyzer = FeatureImportanceAnalyzer(
        args.data_path, args.target_col,
        args.task_type, args.output_dir)
    analyzer.run()
