Mercurial > repos > recetox > table_pandas_rename_columns_regex
comparison table_pandas_rename_columns_regex.py @ 0:505a8e975968 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/tables commit d0ff40eb2b536fec6c973c3a9ea8e7f31cd9a0d6
author | recetox |
---|---|
date | Wed, 29 Jan 2025 15:35:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:505a8e975968 |
---|---|
1 import argparse | |
2 import logging | |
3 import re | |
4 from typing import List, Tuple | |
5 | |
6 | |
7 import pandas as pd | |
8 from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction | |
9 | |
10 | |
def rename_columns(
    df: pd.DataFrame, columns: List[int], regex_check: str, regex_replace: str
) -> pd.DataFrame:
    """
    Rename the selected columns of a dataframe using a regex substitution.

    The columns are addressed strictly by position: only the columns whose
    indices are listed in `columns` are renamed, even if other columns share
    the same name or if a freshly produced name collides with another
    selected column. The dataframe is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): The input dataframe (modified in place).
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to search for in column names.
    regex_replace (str): The replacement template applied to each match.

    Returns:
    pd.DataFrame: The same dataframe object with renamed columns.

    Raises:
    IndexError: If an index in `columns` is out of range.
    re.error: If the pattern or the replacement template is invalid.
    """
    try:
        # Work on a positional copy of the labels so that each new name is
        # derived from the ORIGINAL label at that position. Renaming by label
        # (df.rename) would also hit duplicate labels and labels produced by
        # an earlier rename in the same call.
        new_names = df.columns.tolist()
        for idx in columns:
            new_names[idx] = re.sub(regex_check, regex_replace, df.columns[idx])

        # Single assignment keeps the operation in place; carry over the
        # name of the column index, which a plain list would drop.
        df.columns = pd.Index(new_names, name=df.columns.name)
        return df
    except IndexError as e:
        logging.error(f"Invalid column index: {e}")
        raise
    except re.error as e:
        logging.error(f"Invalid regex pattern: {e}")
        raise
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise
45 | |
46 | |
def main(
    input_dataset: pd.DataFrame,
    columns: List[int],
    regex_check: str,
    regex_replace: str,
    output_dataset: Tuple[callable, str],
) -> None:
    """
    Rename the selected columns of the dataset and write the result out.

    Parameters:
    input_dataset (pd.DataFrame): The already loaded input dataframe.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to check for in column names.
    regex_replace (str): The regex pattern to replace with in column names.
    output_dataset (Tuple[callable, str]): A pair of the writer function and
        the path it should write to; the writer is called as
        writer(dataframe, path).

    Any exception is logged and re-raised unchanged.
    """
    try:
        writer, destination = output_dataset
        renamed = rename_columns(input_dataset, columns, regex_check, regex_replace)
        writer(renamed, destination)
    except Exception as e:
        logging.error(f"Error in main function: {e}")
        raise
70 | |
71 | |
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and delegate to main().
    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(
        description="Apply regex-based transformations on multiple dataframe columns."
    )

    # Input table; the custom action receives the path and the file extension.
    cli.add_argument(
        "--input_dataset",
        nargs=2,
        action=LoadDataAction,
        required=True,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )

    # Column selection, given on the command line as a comma-separated list.
    cli.add_argument(
        "--columns",
        action=SplitColumnIndicesAction,
        required=True,
        help="Comma-separated list of 1-based indices of the columns to apply the transformation on",
    )

    # Pattern and replacement applied to the selected column names.
    cli.add_argument(
        "--regex_check",
        type=str,
        required=True,
        help="Regex pattern to check for in column names",
    )
    cli.add_argument(
        "--regex_replace",
        type=str,
        required=True,
        help="Regex pattern to replace with in column names",
    )

    # Output table; the custom action receives the path and the file extension.
    cli.add_argument(
        "--output_dataset",
        nargs=2,
        action=StoreOutputAction,
        required=True,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    options = cli.parse_args()
    main(
        options.input_dataset,
        options.columns,
        options.regex_check,
        options.regex_replace,
        options.output_dataset,
    )