Mercurial > repos > recetox > table_pandas_rename_columns_regex
view table_pandas_rename_columns_regex.py @ 0:505a8e975968 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/tables commit d0ff40eb2b536fec6c973c3a9ea8e7f31cd9a0d6
author | recetox |
---|---|
date | Wed, 29 Jan 2025 15:35:08 +0000 |
parents | |
children |
line wrap: on
line source
import argparse
import logging
import re
from typing import Any, Callable, List, Tuple

import pandas as pd
from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction


def rename_columns(
    df: pd.DataFrame, columns: List[int], regex_check: str, regex_replace: str
) -> pd.DataFrame:
    """
    Rename columns in the dataframe based on regex patterns.

    Columns are selected and renamed strictly by position: the new label of
    each selected column is ``re.sub(regex_check, regex_replace, old_label)``
    computed from its original label. Columns that were not selected are never
    touched, even if they share a label with a selected column or with the
    result of another rename.

    The dataframe is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): The input dataframe.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to check for in column names.
    regex_replace (str): The regex pattern to replace with in column names.

    Returns:
    pd.DataFrame: The dataframe with renamed columns.

    Raises:
    IndexError: If an index in ``columns`` is out of range for the dataframe.
    re.error: If ``regex_check`` or ``regex_replace`` is not a valid pattern.
    """
    try:
        original_names = df.columns.tolist()
        new_names = list(original_names)

        # Compute every new label from the ORIGINAL label at that position.
        # Renaming by name (df.rename) would also hit unselected columns with
        # the same label and would re-apply the regex to labels produced by an
        # earlier rename in the same call.
        for index in columns:
            new_names[index] = re.sub(
                regex_check, regex_replace, original_names[index]
            )

        # Only rebuild the columns index when something actually changed, so
        # an untouched index keeps its exact type and metadata.
        if new_names != original_names:
            df.columns = pd.Index(new_names, name=df.columns.name)

        return df
    except IndexError as e:
        logging.error(f"Invalid column index: {e}")
        raise
    except re.error as e:
        logging.error(f"Invalid regex pattern: {e}")
        raise
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise


def main(
    input_dataset: pd.DataFrame,
    columns: List[int],
    regex_check: str,
    regex_replace: str,
    output_dataset: Tuple[Callable[..., Any], str],
) -> None:
    """
    Main function to rename columns of the loaded dataset and save the result.

    Parameters:
    input_dataset (pd.DataFrame): The already loaded input dataframe.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to check for in column names.
    regex_replace (str): The regex pattern to replace with in column names.
    output_dataset (Tuple[Callable, str]): A ``(write_func, file_path)`` pair;
        ``write_func`` is called as ``write_func(dataframe, file_path)``.
    """
    try:
        write_func, file_path = output_dataset
        renamed = rename_columns(input_dataset, columns, regex_check, regex_replace)
        write_func(renamed, file_path)
    except Exception as e:
        logging.error(f"Error in main function: {e}")
        raise


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="Apply regex-based transformations on multiple dataframe columns."
    )
    parser.add_argument(
        "--input_dataset",
        nargs=2,
        action=LoadDataAction,
        required=True,
        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
    )
    # NOTE(review): the help text promises 1-based indices while rename_columns
    # expects 0-based ones; presumably SplitColumnIndicesAction converts them -
    # confirm against utils.
    parser.add_argument(
        "--columns",
        action=SplitColumnIndicesAction,
        required=True,
        help="Comma-separated list of 1-based indices of the columns to apply the transformation on",
    )
    parser.add_argument(
        "--regex_check",
        type=str,
        required=True,
        help="Regex pattern to check for in column names",
    )
    parser.add_argument(
        "--regex_replace",
        type=str,
        required=True,
        help="Regex pattern to replace with in column names",
    )
    parser.add_argument(
        "--output_dataset",
        nargs=2,
        action=StoreOutputAction,
        required=True,
        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
    )

    args = parser.parse_args()
    main(
        args.input_dataset,
        args.columns,
        args.regex_check,
        args.regex_replace,
        args.output_dataset,
    )