Mercurial > repos > recetox > rename_annotated_feature
diff rename_annotated_feature.py @ 0:268fcec93d9c draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/rename_annotated_feature commit 7948bcdd36cec524d201712dc20c438973b4cc28
author | recetox |
---|---|
date | Tue, 21 May 2024 07:44:25 +0000 |
parents | |
children |
line wrap: on
line diff
"""Rename abundance-table columns after the references they were annotated with.

The script reads two tab-separated tables: an annotations table with at least
the columns ``query`` and ``reference`` (its last column is used as the match
score in ``single`` mode), and an abundance table whose column labels are
matched against the ``query`` values. Matching columns are renamed and the
abundance table is written back out tab-separated.
"""
import argparse
from collections import defaultdict
from typing import Optional, Sequence, Tuple

import pandas as pd


def parse_arguments(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parses command-line arguments.

    Args:
        argv (Optional[Sequence[str]]): Argument strings to parse. When None
            (the default) argparse falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: Namespace with argument values as attributes.
    """
    parser = argparse.ArgumentParser(description='Rename annotated feature.')
    parser.add_argument('--annotations_table_path', type=str, required=True, help='Path to the annotations table file.')
    parser.add_argument('--abundance_table_path', type=str, required=True, help='Path to the abundance table file.')
    parser.add_argument('--mode', type=str, choices=['single', 'multiple'], default='single', help='Mode to use for renaming. Can be "single" or "multiple".')
    # The file is written with sep="\t" in main(), so the help must not promise CSV.
    parser.add_argument('--output_path', type=str, default='output.csv', help='Path to the output file (written tab-separated).')
    return parser.parse_args(argv)


def load_tables(annotations_table_path: str, abundance_table_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Loads annotation and abundance tables from files.

    Both files are read with ``pandas.read_table`` and surrounding whitespace
    is stripped from their column labels.

    Args:
        annotations_table_path (str): Path to the annotations table file.
        abundance_table_path (str): Path to the abundance table file.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Tuple of DataFrames for annotations and abundance tables.
    """
    annotations_table = pd.read_table(annotations_table_path)
    abundance_table = pd.read_table(abundance_table_path)

    annotations_table.columns = annotations_table.columns.str.strip()
    abundance_table.columns = abundance_table.columns.str.strip()

    return annotations_table, abundance_table


def _rename_columns(abundance_table: pd.DataFrame, mapping: dict) -> None:
    """Renames abundance columns in place using a query -> new name mapping.

    Labels are compared by their text form: header labels read from a file are
    strings, while the ``query`` values may have been parsed as numbers.
    All renames happen in one pass, so a new name that equals another query
    is never renamed a second time.
    """
    by_text = {str(query): new_name for query, new_name in mapping.items()}
    abundance_table.rename(columns=lambda label: by_text.get(str(label), label), inplace=True)


def rename_single(annotations_table: pd.DataFrame, abundance_table: pd.DataFrame) -> None:
    """Renames columns in abundance table based on single best match in annotations table.

    For every ``query`` the row with the highest value in the last column of
    the annotations table is selected and the abundance column named like the
    query is renamed to that row's ``reference``. Rows without a query or a
    score are ignored, and a best match without a reference leaves the column
    untouched. The abundance table is modified in place.

    Args:
        annotations_table (pd.DataFrame): DataFrame of annotations.
        abundance_table (pd.DataFrame): DataFrame of abundance data.
    """
    scores_col = annotations_table.columns[-1]

    # Rows lacking a score cannot be a "best" match. Dropping them up front
    # also prevents idxmax from producing a NaN label for queries whose scores
    # are all missing, which would make the .loc lookup below fail.
    scored = annotations_table.dropna(subset=["query", scores_col])
    if scored.empty:
        return

    best_idxs = scored.groupby("query")[scores_col].idxmax()
    best_rows = scored.loc[best_idxs]

    mapping = {
        query: ref
        for query, ref in zip(best_rows["query"], best_rows["reference"])
        if pd.notna(ref)  # never rename a column to NaN
    }
    _rename_columns(abundance_table, mapping)


def rename_multiple(annotations_table: pd.DataFrame, abundance_table: pd.DataFrame) -> None:
    """Renames columns in abundance table based on multiple matches in annotations table.

    All ``reference`` values listed for a ``query`` are joined with ``', '``
    (in table order) and used as the new label of the abundance column named
    like the query. Rows with a missing query or reference are ignored. The
    abundance table is modified in place.

    Args:
        annotations_table (pd.DataFrame): DataFrame of annotations.
        abundance_table (pd.DataFrame): DataFrame of abundance data.
    """
    matches = annotations_table.dropna(subset=["query", "reference"])

    refs_by_query = defaultdict(list)
    for query, ref in zip(matches["query"], matches["reference"]):
        # str() so that numeric references can be joined as well.
        refs_by_query[query].append(str(ref))

    mapping = {query: ', '.join(refs) for query, refs in refs_by_query.items()}
    _rename_columns(abundance_table, mapping)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Main function to parse arguments, load tables, rename columns, and save output.

    Args:
        argv (Optional[Sequence[str]]): Argument strings forwarded to
            ``parse_arguments``; None means use the process command line.
    """
    args = parse_arguments(argv)

    annotations_table, abundance_table = load_tables(args.annotations_table_path, args.abundance_table_path)

    if args.mode == "single":
        rename_single(annotations_table, abundance_table)
    else:
        rename_multiple(annotations_table, abundance_table)

    # Output is tab-separated regardless of the file extension chosen.
    abundance_table.to_csv(args.output_path, sep="\t", index=False)


if __name__ == "__main__":
    main()