comparison table_pandas_rename_columns_regex.py @ 0:505a8e975968 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/tables commit d0ff40eb2b536fec6c973c3a9ea8e7f31cd9a0d6
author recetox
date Wed, 29 Jan 2025 15:35:08 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:505a8e975968
1 import argparse
2 import logging
3 import re
4 from typing import List, Tuple
5
6
7 import pandas as pd
8 from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction
9
10
def rename_columns(
    df: pd.DataFrame, columns: List[int], regex_check: str, regex_replace: str
) -> pd.DataFrame:
    """
    Rename selected columns of a dataframe with a regex substitution.

    Columns are addressed by position: only the requested positions are
    renamed, even when several columns share the same label, and every new
    label is derived from the original header, so one rename never feeds
    into another.

    Parameters:
    df (pd.DataFrame): The input dataframe; its column labels are updated in place.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to search for in column names.
    regex_replace (str): The replacement applied to each match of regex_check.

    Returns:
    pd.DataFrame: The same dataframe object, with the selected columns renamed.

    Raises:
    IndexError: If an index in columns is out of range for the dataframe.
    re.error: If the pattern or the replacement template is invalid.
    """
    try:
        # Snapshot the header once so each new label is computed from the
        # original name rather than from the result of an earlier rename.
        original_names = list(df.columns)
        new_names = list(original_names)

        # Rename by position instead of by label: a label-based rename would
        # also hit unselected columns that happen to share the same name.
        for index in columns:
            new_names[index] = re.sub(
                regex_check, regex_replace, original_names[index]
            )

        # Assigning the header mutates df in place; keep the columns' name.
        df.columns = pd.Index(new_names, name=df.columns.name)
        return df
    except IndexError as e:
        logging.error(f"Invalid column index: {e}")
        raise
    except re.error as e:
        logging.error(f"Invalid regex pattern: {e}")
        raise
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise
45
46
def main(
    input_dataset: pd.DataFrame,
    columns: List[int],
    regex_check: str,
    regex_replace: str,
    output_dataset: Tuple[callable, str],
) -> None:
    """
    Rename the selected columns of a dataframe and write the result out.

    Parameters:
    input_dataset (pd.DataFrame): The dataframe whose columns are renamed.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to check for in column names.
    regex_replace (str): The replacement applied to matches in column names.
    output_dataset (Tuple[callable, str]): A (writer, path) pair; the writer
        is called as writer(dataframe, path).

    Any exception is logged and then re-raised unchanged.
    """
    try:
        # Unpack first so a malformed output_dataset fails before any renaming.
        writer, destination = output_dataset
        renamed = rename_columns(input_dataset, columns, regex_check, regex_replace)
        writer(renamed, destination)
    except Exception as err:
        logging.error(f"Error in main function: {err}")
        raise
70
71
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(
        description="Apply regex-based transformations on multiple dataframe columns."
    )

    # The custom actions come from the project's utils module; presumably they
    # turn the raw arguments into the objects main() expects — confirm there.
    cli.add_argument(
        "--input_dataset",
        required=True,
        nargs=2,
        action=LoadDataAction,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )
    # NOTE(review): the help text advertises 1-based indices while
    # rename_columns documents 0-based ones; presumably
    # SplitColumnIndicesAction converts between them — confirm in utils.
    cli.add_argument(
        "--columns",
        required=True,
        action=SplitColumnIndicesAction,
        help="Comma-separated list of 1-based indices of the columns to apply the transformation on",
    )
    cli.add_argument(
        "--regex_check",
        required=True,
        type=str,
        help="Regex pattern to check for in column names",
    )
    cli.add_argument(
        "--regex_replace",
        required=True,
        type=str,
        help="Regex pattern to replace with in column names",
    )
    cli.add_argument(
        "--output_dataset",
        required=True,
        nargs=2,
        action=StoreOutputAction,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    options = cli.parse_args()
    main(
        input_dataset=options.input_dataset,
        columns=options.columns,
        regex_check=options.regex_check,
        regex_replace=options.regex_replace,
        output_dataset=options.output_dataset,
    )