Mercurial > repos > bgruening > flexynesis
diff flexynesis_utils.py @ 8:9c91d13827ef draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 6b520305ec30e6dc37eba92c67a5368cea0fc5ad
author | bgruening |
---|---|
date | Wed, 23 Jul 2025 07:50:31 +0000 |
parents | 9450286c42ab |
children |
line wrap: on
line diff
def validate_numeric_column(df, column_names, require_integer=False):
    """Validate that one or more DataFrame columns contain numeric values.

    Prints a confirmation line for every column that passes.

    Args:
        df: pandas DataFrame holding the columns to check.
        column_names: A single column name, a comma-separated string of
            names (``"col1, col2"``), or an iterable of names. A string that
            exactly matches an existing column is always treated as one name,
            so column names that themselves contain a comma are supported.
        require_integer: When True, every non-null value must additionally
            be integer-valued (``3`` and ``3.0`` pass, ``3.5`` and ``inf``
            do not).

    Raises:
        ValueError: If no column name is given, a column is missing, a
            column holds non-numeric values, or ``require_integer`` is set
            and a value is not integer-valued.
    """
    if column_names is None:
        names = []
    elif isinstance(column_names, str):
        if column_names in df.columns:
            # Exact match wins: keeps single names working unchanged and
            # allows column names that contain a comma.
            names = [column_names]
        else:
            # Comma-separated list; ignore blank entries such as the one
            # produced by a trailing comma ("a,b,").
            stripped = (part.strip() for part in column_names.split(","))
            names = [part for part in stripped if part]
    else:
        names = list(column_names)

    if not names:
        # Without this guard an empty selection would "pass" validation
        # without a single column having been checked.
        raise ValueError("No column names provided for validation.")

    for name in names:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame.")

        try:
            numeric_col = pd.to_numeric(df[name], errors="raise")
        except (ValueError, TypeError) as e:
            # Chain the original error so the traceback shows the bad value.
            raise ValueError(f"Non-numeric values found in column '{name}': {e}") from e

        if not require_integer:
            print(f"Column '{name}': All values are numeric (integers and floats accepted).")
            continue

        # Missing values are allowed; only the values present are checked.
        non_null_values = numeric_col.dropna()
        # `x % 1 == 0` holds only for finite integer-valued numbers: for
        # +/-inf the remainder is NaN, so infinities are rejected (a plain
        # `x == x.round()` comparison would accept them).
        if not (non_null_values % 1 == 0).all():
            raise ValueError(f"Column '{name}' contains non-integer numeric values.")
        print(f"Column '{name}': All values are integers or integer-equivalent floats.")


def validate_survival(df, column_names):
    """Validate survival column(s): numeric and integer-valued.

    Delegates to ``validate_numeric_column`` with ``require_integer=True``
    and raises ``ValueError`` under the same conditions.
    """
    validate_numeric_column(df, column_names, require_integer=True)


def validate_covariate(df, column_names):
    """Validate covariate column(s): any numeric values are accepted.

    Delegates to ``validate_numeric_column`` with ``require_integer=False``
    and raises ``ValueError`` under the same conditions.
    """
    validate_numeric_column(df, column_names, require_integer=False)
'binarize', 'validate_survival', 'validate_covariate'], + help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data, 'validate_survival' for validating survival data.") - # Arguments for split + # Arguments for split (clin also for validate_survival and validate_covariate) parser.add_argument('--clin', required=False, help='Path to clinical data CSV file (samples in rows)') parser.add_argument('--omics', required=False, @@ -186,7 +226,11 @@ parser.add_argument('--sample_idx', type=int, default=1, help='Column index for samples in mutation data (default: 1)') - # common arguments + # Arguments for validate_survival and validate_covariate + parser.add_argument('--clin_variable', type=str, required=False, + help='Column name for clinical variable (e.g., death, SEX, ...)') + + # common arguments (binarize and split) parser.add_argument('--out', default='.', help='Output directory (default: current directory)') @@ -196,7 +240,7 @@ # validate utility function if not args.util: raise ValueError("Utility function must be specified") - if args.util not in ['split', 'binarize']: + if args.util not in ['split', 'binarize', 'validate_survival', 'validate_covariate']: raise ValueError(f"Invalid utility function: {args.util}") if args.util == 'split': @@ -221,6 +265,16 @@ if args.gene_idx < 0 or args.sample_idx < 0: raise ValueError("Gene and sample indices must be non-negative integers") + elif args.util == 'validate_survival' or args.util == 'validate_covariate': + # Validate clinical data file + if not args.clin: + raise ValueError("Clinical data file must be provided") + if not os.path.isfile(args.clin): + raise FileNotFoundError(f"Clinical file not found: {args.clin}") + # Validate survival event variable + if not args.clin_variable: + raise ValueError("Survival event variable must be specified") + # Create output directory if it doesn't exist if not os.path.exists(args.out): os.makedirs(args.out) @@ 
-248,6 +302,22 @@ binarized_matrix.to_csv(output_file, sep='\t') print(f"Binarized mutation matrix saved to {output_file}") + elif args.util == 'validate_survival': + clin_df = read_data(args.clin, index=False) + if clin_df.empty: + raise ValueError("Clinical data file is empty") + + # Validate survival event variable + validate_survival(clin_df, args.clin_variable) + + elif args.util == 'validate_covariate': + clin_df = read_data(args.clin, index=False) + if clin_df.empty: + raise ValueError("Clinical data file is empty") + + # Validate clinical variable + validate_covariate(clin_df, args.clin_variable) + except Exception as e: print(f"Error: {e}", file=sys.stderr) sys.exit(1)