Mercurial > repos > jay > feature_selector
comparison feature_selection/featureSelection.py @ 0:76a728a52df6 draft default tip
planemo upload for repository https://github.com/jaidevjoshi83/MicroBiomML commit 5ef78d4decc95ac107c468499328e7f086289ff9-dirty
| author | jay |
|---|---|
| date | Tue, 17 Feb 2026 10:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:76a728a52df6 |
|---|---|
| 1 import pandas as pd | |
| 2 from sklearn.feature_selection import SequentialFeatureSelector | |
| 3 from sklearn.linear_model import LogisticRegression | |
| 4 from sklearn.tree import DecisionTreeClassifier | |
| 5 from sklearn.ensemble import RandomForestClassifier | |
| 6 from sklearn.svm import SVC | |
| 7 import os | |
| 8 import time | |
| 9 import argparse | |
| 10 import json | |
| 11 import subprocess | |
| 12 | |
def retrieve_results_from_hdc_folds(n_folds, text):
    """Parse per-fold performance metrics out of chopin2's stdout text.

    For each of the ``n_folds`` cross-validation folds, find the line
    containing ``"Fold {i}"`` and read the metric values from the fixed
    line offsets below it (``name: value`` lines, as printed by chopin2):

        +2 accuracy, +3 F1, +4 precision, +5 recall, +6 MCC

    AUC and Kappa are not reported by chopin2 and are filled with NaN.

    Parameters
    ----------
    n_folds : int
        Number of folds to look for in the output.
    text : str
        Raw stdout captured from the chopin2 run.

    Returns
    -------
    pandas.DataFrame
        One row per matched fold plus trailing 'Mean' and 'Std' summary
        rows, with columns
        ['Fold', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa', 'MCC'].
    """
    lines = text.splitlines()
    # Real NaN (not the string 'NaN') so every metric column stays numeric
    # and is included by the numeric_only mean/std below.
    nan = float("nan")
    rows = []
    for fold in range(n_folds):
        marker = f"Fold {fold}"
        for pos, line in enumerate(lines):
            if marker in line:
                # Value after the ':' on the line `offset` below the marker.
                def metric(offset):
                    return float(lines[pos + offset].split(":")[1])

                rows.append(
                    [metric(2), nan, metric(5), metric(4), metric(3), nan, metric(6)]
                )

    df = pd.DataFrame(rows, columns=["Accuracy", "AUC", "Recall", "Prec.", "F1", "Kappa", "MCC"])

    # Append per-metric mean/std as labelled summary rows.
    mean_df = df.mean(numeric_only=True).to_frame().T
    mean_df['Fold'] = 'Mean'

    std_df = df.std(numeric_only=True).to_frame().T
    std_df['Fold'] = 'Std'

    df = df.reset_index().rename(columns={'index': 'Fold'})

    return pd.concat([df, mean_df, std_df], ignore_index=True)
| 39 | |
def parse_arguments():
    """Define and parse the command-line interface for this tool."""
    cli = argparse.ArgumentParser(description="Feature selection using SequentialFeatureSelector on a single TSV file")
    # (flag, keyword arguments) pairs, registered in the order they should
    # appear in the --help output.
    option_specs = [
        ('--input', {'required': True, 'help': "Path to count matrix file"}),
        ('--metadata', {'required': True, 'help': "Path to metadata file"}),
        ('--threads', {'type': int, 'required': True, 'help': "Number of threads"}),
        ('--classifier', {'required': True, 'choices': ['lr', 'dt', 'sv', 'rf', 'hdc'], 'help': "Classifier choice"}),
        ('--label', {'required': True, 'help': "Name of the class label column in the dataset"}),
        ('--tol', {'type': float, 'default': 1e-5, 'help': "Tolerance for SequentialFeatureSelector convergence (default: 1e-5)"}),
        ('--index_clm', {'type': str, 'default': 'sample_id', 'help': "Index Column"}),
        ('--feature_out', {'type': str, 'default': 'out.tsv', 'help': "Output file for selected features"}),
        ('--log', {'type': str, 'default': 'out.log', 'help': "Log file"}),
        ('--feature_selection', {'type': str, 'default': None, 'help': "Path to feature selection file"}),
        ('--dp_columns', {'type': str, 'help': 'Columns to drop from training data'}),
    ]
    for flag, kwargs in option_specs:
        cli.add_argument(flag, **kwargs)
    return cli.parse_args()
| 55 | |
def load_and_preprocess_data(args):
    """Load the count matrix and metadata, join them, and optionally subset features.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments; uses ``input``, ``metadata``, ``label``
        (1-based column index into the metadata file), ``index_clm``,
        ``dp_columns`` (comma-separated 1-based column indices to drop)
        and ``feature_selection`` (optional file of feature names, one
        per line).

    Returns
    -------
    tuple[pandas.DataFrame, str]
        The joined dataframe and the name of the target (label) column.

    Side effects
    ------------
    Writes the joined dataframe to ``temp_input.tsv`` in the working
    directory.
    """
    df_counts = pd.read_csv(args.input, sep="\t")

    # Drop user-selected columns; CLI indices are 1-based.
    if args.dp_columns:
        column_names = df_counts.columns.tolist()
        dp_column_list = [column_names[int(i) - 1] for i in args.dp_columns.split(',')]
        df_counts.drop(columns=dp_column_list, inplace=True)

    df_metadata = pd.read_csv(args.metadata, sep="\t")
    # --label is a 1-based column index into the metadata file.
    target_column = df_metadata.columns.to_list()[int(args.label) - 1]

    # Align the two tables on the shared sample-id column when both have it.
    if args.index_clm and args.index_clm in df_counts.columns and args.index_clm in df_metadata.columns:
        df_counts.set_index(args.index_clm, inplace=True)
        df_metadata.set_index(args.index_clm, inplace=True)

    df = pd.concat([df_counts, df_metadata[target_column]], axis=1)

    # NOTE(review): index=False discards the sample-id index in this dump;
    # kept as-is because downstream tooling may rely on the current layout.
    df.to_csv("temp_input.tsv", sep='\t', index=False)

    if args.feature_selection:
        with open(args.feature_selection) as f:
            # Skip blank lines: a trailing newline previously yielded an
            # empty-string feature name and a KeyError on the subset below.
            features = [line.strip() for line in f if line.strip()]
        # presumably the feature file also lists the target column when the
        # label must survive this subset -- TODO confirm with callers.
        df = df[features]
    return df, target_column
| 80 | |
def run_hdc_classification(args, df, dir_name, start_time):
    """Run hyperdimensional-computing (HDC) feature selection via the external
    ``chopin2.py`` tool and record its selected features and fold scores.

    The joined dataframe is written to a temp TSV inside ``dir_name`` and fed
    to chopin2 with 5-fold CV, 100 levels and backward feature selection. On
    success, the feature list is scraped from stdout into
    ``selected_features.tsv`` and per-fold scores are appended to
    ``hdc_performance.log``; a timing line is always written to ``args.log``.

    NOTE(review): the output filenames ``selected_features.tsv`` and
    ``hdc_performance.log`` are hard-coded rather than using
    ``args.feature_out`` / ``args.log`` as the sklearn path does -- presumably
    the tool wrapper expects these exact names; confirm before unifying.
    """
    temp_input_file = f"./{dir_name}/temp_hdc_input.tsv"
    df.to_csv(temp_input_file, sep='\t', index=False)

    # External HDC tool; assumed to be on PATH -- TODO confirm deployment.
    command = ["chopin2.py", "--input", temp_input_file, "--kfolds", "5", "--levels", "100", "--feature-selection", "backward"]
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode == 0:
        text = result.stdout

        # Extract and save features: the feature block sits between the
        # "Features" banner and the "Total ML models:" summary in stdout.
        try:
            features_start = text.index("Features\n----------") + len("Features\n----------\n")
            features_end = text.index("\n\nTotal ML models:")
            selected_features_str = text[features_start:features_end]
            selected_features = selected_features_str.strip().split('\n')

            print("Selected Features:", selected_features)

            out_df = pd.DataFrame(selected_features, columns=['feature_name'])

            out_df.to_csv("selected_features.tsv", sep="\t", index=False)

        except ValueError:
            # str.index raises ValueError when either banner is missing.
            print("Could not find the feature list in the HDC output.")

        # Parse and append the per-fold scores to the performance log.
        try:
            df_scores = retrieve_results_from_hdc_folds(5, text)
            with open("hdc_performance.log", 'a') as log_file:
                log_file.write("\n--- HDC Performance Scores ---\n")
                # DataFrame.to_string renders directly into the open file.
                df_scores.to_string(log_file)
                log_file.write("\n")
        except Exception as e:
            print(f"\nCould not parse performance scores: {e}")

    else:
        print("Command failed:", result.stderr)

    # Timing log is written regardless of whether chopin2 succeeded.
    elapsed_time = round(time.time() - start_time, 3)
    with open(args.log, 'w') as log_file:
        log_file.write(f"{'time in seconds'}\t{'algorithm'}\n")
        log_file.write(f"{elapsed_time}\t{args.classifier}\n")
| 125 | |
def run_sequential_feature_selection(args, df, start_time, target_label):
    """Run backward sequential feature selection with a sklearn classifier.

    Parameters
    ----------
    args : argparse.Namespace
        Uses ``classifier`` ('lr'/'dt'/'sv'/'rf'), ``tol``, ``threads``,
        ``feature_out`` and ``log``.
    df : pandas.DataFrame
        Joined feature/label table.
    start_time : float
        ``time.time()`` value used for the timing log.
    target_label : str
        Name of the binary class column in ``df``.

    Raises
    ------
    ValueError
        If ``target_label`` is missing from ``df`` or the label column does
        not contain exactly two classes.

    Side effects: writes the selected feature names to ``args.feature_out``
    and a timing line to ``args.log``.
    """
    classifiers = {
        'lr': LogisticRegression(),
        'dt': DecisionTreeClassifier(),
        'sv': SVC(),
        'rf': RandomForestClassifier()
    }
    classifier = classifiers[args.classifier]

    if target_label not in df.columns:
        raise ValueError(f"Label column '{target_label}' not found in the data.")

    X = df.drop(columns=[target_label])

    # Sort the unique labels so the 0/1 encoding is deterministic across
    # runs: bare set iteration order varies with string hash randomization,
    # which previously made the class encoding (and results) irreproducible.
    labels = sorted(set(df[target_label].to_list()))

    if len(labels) != 2:
        raise ValueError(f"Expected exactly 2 class labels, found {len(labels)}: {labels}")

    label_mapping = {labels[0]: 0, labels[1]: 1}

    y = df[target_label].map(label_mapping).tolist()

    sfs = SequentialFeatureSelector(
        classifier,
        n_features_to_select='auto',
        direction='backward',
        tol=args.tol,
        n_jobs=args.threads
    )
    sfs.fit(X, y)

    # Boolean support mask maps back to the original column names.
    selected_feature_names = X.columns[sfs.get_support()]

    out_df = pd.DataFrame(selected_feature_names, columns=['feature_name'])
    out_df.to_csv(args.feature_out, sep="\t", index=False)

    elapsed_time = round(time.time() - start_time, 3)
    with open(args.log, 'w') as log_file:
        log_file.write(f"{'time in seconds'}\t{'algorithm'}\n")
        log_file.write(f"{elapsed_time}\t{args.classifier}\n")
| 172 | |
def main():
    """Entry point: parse the CLI, prepare the data and dispatch to the
    selected feature-selection backend (HDC or sklearn sequential)."""
    args = parse_arguments()

    # Per-run working directory named after the input file and classifier.
    base = os.path.splitext(os.path.basename(args.input))[0]
    dir_name = f"{base}_{args.classifier}"
    os.makedirs(f"./{dir_name}", exist_ok=True)

    start_time = time.time()

    df, target_label = load_and_preprocess_data(args)

    if args.classifier != 'hdc':
        run_sequential_feature_selection(args, df, start_time, target_label)
    else:
        run_hdc_classification(args, df, dir_name, start_time)

if __name__ == "__main__":
    main()
| 191 |
