Mercurial > repos > goeckslab > multimodal_learner
comparison utils.py @ 0:375c36923da1 draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
| author | goeckslab |
|---|---|
| date | Tue, 09 Dec 2025 23:49:47 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:375c36923da1 |
|---|---|
| 1 import json | |
| 2 import logging | |
| 3 import os | |
| 4 import random | |
| 5 import sys | |
| 6 import tempfile | |
| 7 import zipfile | |
| 8 from pathlib import Path | |
| 9 from typing import List, Optional | |
| 10 | |
| 11 import numpy as np | |
| 12 import pandas as pd | |
| 13 import torch | |
| 14 | |
| 15 LOG = logging.getLogger(__name__) | |
| 16 | |
| 17 | |
def str2bool(val) -> bool:
    """Interpret *val* as a boolean flag.

    The value is stringified, stripped of surrounding whitespace and
    lower-cased; it counts as true only when the result is one of
    ``1``, ``true``, ``yes`` or ``y``. Anything else is false.
    """
    truthy_tokens = ("1", "true", "yes", "y")
    normalized = str(val).strip().lower()
    return normalized in truthy_tokens
| 21 | |
| 22 | |
def load_user_hparams(hp_arg: Optional[str]) -> dict:
    """Parse --hyperparameters (inline JSON or path to .json).

    Args:
        hp_arg: Either an inline JSON object (a string starting with ``{``
            after stripping whitespace) or a path to a JSON file. Falsy
            values mean "no user hyperparameters".

    Returns:
        The parsed hyperparameters. An empty dict is returned when nothing
        was supplied, when parsing/reading fails, or when the JSON document
        is not an object; the last two cases log a warning.
    """
    if not hp_arg:
        return {}
    try:
        s = hp_arg.strip()
        if s.startswith("{"):
            parsed = json.loads(s)
        else:
            # JSON files are UTF-8 by specification; do not depend on the
            # platform's locale encoding.
            with open(s, "r", encoding="utf-8") as f:
                parsed = json.load(f)
    except Exception as e:
        # Deliberately best-effort: bad user input must not abort the run.
        LOG.warning(f"Could not parse --hyperparameters: {e}. Ignoring.")
        return {}
    if not isinstance(parsed, dict):
        # A file may hold any JSON value (list, number, ...); callers are
        # promised a dict, so reject everything else.
        LOG.warning(
            "Could not parse --hyperparameters: expected a JSON object, "
            f"got {type(parsed).__name__}. Ignoring."
        )
        return {}
    return parsed
| 36 | |
| 37 | |
def set_seeds(seed: int = 42):
    """Seed the Python ``random``, NumPy and PyTorch generators with *seed*.

    When CUDA is available, all CUDA device generators are seeded too.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
| 44 | |
| 45 | |
def ensure_local_tmp():
    """Create the ``/tmp`` directory if it does not already exist."""
    tmp_dir = Path("/tmp")
    tmp_dir.mkdir(parents=True, exist_ok=True)
| 48 | |
| 49 | |
def enable_tensor_cores_if_available():
    """Set float32 matmul precision to ``"high"`` when CUDA is available.

    Does nothing when no CUDA device is available.
    """
    if not torch.cuda.is_available():
        return
    torch.set_float32_matmul_precision("high")
| 53 | |
| 54 | |
def enable_deterministic_mode(seed: Optional[int] = None):
    """
    Force deterministic algorithms where possible to reduce run-to-run variance.

    Every step is best-effort: a failing step is logged and the remaining
    steps still run, so this function itself does not raise for torch
    configuration problems.

    Args:
        seed: When given, all RNGs are seeded via ``set_seeds`` and
            ``PYTHONHASHSEED`` is exported (unless already set).
    """
    if seed is not None:
        set_seeds(seed)
        # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
        # cannot change hashing in the current process; it is only visible
        # to processes that inherit this environment.
        os.environ.setdefault("PYTHONHASHSEED", str(int(seed)))
    # cuBLAS determinism (setdefault keeps any value the user exported)
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    try:
        torch.use_deterministic_algorithms(True)
    except Exception as e:
        LOG.warning(f"Could not enable torch deterministic algorithms: {e}")
    try:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception as e:
        LOG.warning(f"Could not enforce deterministic cuDNN settings: {e}")
    # Turning off TF32 is optional; record failures at debug level instead of
    # silently discarding them so they can still be diagnosed.
    try:
        torch.backends.cuda.matmul.allow_tf32 = False
    except Exception as e:
        LOG.debug(f"Could not disable TF32 for CUDA matmul: {e}")
    try:
        torch.backends.cudnn.allow_tf32 = False
    except Exception as e:
        LOG.debug(f"Could not disable TF32 for cuDNN: {e}")
| 81 | |
| 82 | |
def load_file(path: str) -> Optional[pd.DataFrame]:
    """Read a delimited text file into a DataFrame.

    The delimiter is not fixed: ``sep=None`` with the python engine lets
    pandas detect it from the file content.

    Args:
        path: Location of the dataset. A falsy value (``None``, ``""``)
            means "no dataset".

    Returns:
        The loaded DataFrame, or ``None`` when *path* is falsy.

    Raises:
        FileNotFoundError: If *path* is given but does not exist.
    """
    if not path:
        return None
    file_path = Path(path)
    if not file_path.exists():
        raise FileNotFoundError(f"Dataset not found: {file_path}")
    return pd.read_csv(file_path, sep=None, engine="python")
| 90 | |
| 91 | |
def prepare_image_search_dirs(args) -> Optional[Path]:
    """Extract all image ZIP archives into one fresh temporary directory.

    Args:
        args: Object with an ``images_zip`` attribute holding an iterable of
            ZIP file paths (or a falsy value when there are no archives).

    Returns:
        The directory containing the extracted files, or ``None`` when
        ``args.images_zip`` is falsy.

    Raises:
        FileNotFoundError: If any listed archive does not exist. All paths
            are checked before anything is created on disk.
        Exception: Any error raised while extracting is propagated after the
            temporary directory has been removed.
    """
    if not args.images_zip:
        return None

    import shutil  # local import: only needed to clean up after a failure

    zip_args = list(args.images_zip)

    # Validate up front so a missing archive does not leave an orphaned,
    # partially populated temporary directory behind.
    for zip_path in zip_args:
        if not Path(zip_path).exists():
            raise FileNotFoundError(f"Image ZIP not found: {zip_path}")

    root = Path(tempfile.mkdtemp(prefix="autogluon_images_"))
    LOG.info(f"Extracting {len(zip_args)} image ZIP(s) to {root}")

    try:
        for zip_path in zip_args:
            path = Path(zip_path)
            with zipfile.ZipFile(path, 'r') as z:
                z.extractall(root)
            LOG.info(f"Extracted {path.name}")
    except Exception:
        # The caller never receives `root` on failure, so nobody else could
        # clean it up.
        shutil.rmtree(root, ignore_errors=True)
        raise

    return root
| 108 | |
| 109 | |
def absolute_path_expander(df: pd.DataFrame, extracted_root: Optional[Path], image_columns: Optional[List[str]]) -> List[str]:
    """
    Resolve image paths to absolute paths. If no image_columns are provided,
    infers candidate columns whose values resolve to existing files (checking
    absolute paths first, then paths relative to the extracted_root).

    The DataFrame is modified in place: every value in an image column is
    replaced by its resolved absolute path, or by ``None`` when no matching
    file is found.

    Args:
        df: Table whose image columns should be rewritten.
        extracted_root: Directory searched for relative paths; ``None``
            means only absolute paths can resolve.
        image_columns: Explicit image column names. Names missing from
            ``df`` are dropped; if none remain, columns are inferred from
            the first 50 non-null values of each object/string column.

    Returns:
        The image column names that were processed (empty when ``df`` is
        ``None`` or empty).
    """
    if df is None or df.empty:
        return []

    image_columns = [c for c in (image_columns or []) if c in df.columns]
    root = Path(extracted_root) if extracted_root is not None else None

    def resolve(p):
        if pd.isna(p):
            return None
        orig = Path(str(p).strip())
        candidates = []
        if orig.is_absolute():
            candidates.append(orig)
        if root is not None:
            # Try the path as given under the root, then just its file name
            # directly under the root.
            candidates.extend([root / orig, root / orig.name])
        for cand in candidates:
            # is_file(), not exists(): a directory must never count as an
            # image. Otherwise an empty string (-> the root itself) or a
            # label equal to an extracted folder name would "resolve", and
            # that column would be mistaken for an image column.
            if cand.is_file():
                return str(cand.resolve())
        return None

    def is_text_column(col):
        # Accept both classic object columns and pandas string dtypes.
        dtype = df[col].dtype
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    # Infer image columns if none were provided
    if not image_columns:
        inferred = []
        for col in (c for c in df.columns if is_text_column(c)):
            sample = df[col].dropna().head(50)
            if sample.empty:
                continue
            if any(resolve(value) is not None for value in sample):
                inferred.append(col)
        image_columns = inferred
        if image_columns:
            LOG.info(f"Inferred image columns: {image_columns}")

    for col in image_columns:
        df[col] = df[col].apply(resolve)

    return image_columns
| 154 | |
| 155 | |
def verify_outputs(paths):
    """Check that every expected output exists and exit if any is missing.

    ``paths`` is an iterable of ``(path, description)`` pairs. Each existing
    output is logged with its size and has its mode set to ``0o644``; each
    missing one is logged as an error. After all pairs were checked, the
    process exits with status 1 when at least one output was missing.
    """
    all_present = True
    for out_path, label in paths:
        if not os.path.exists(out_path):
            LOG.error(f"✗ Output {label} MISSING: {out_path}")
            all_present = False
            continue
        n_bytes = os.path.getsize(out_path)
        LOG.info(f"✓ Output {label}: {out_path} ({n_bytes:,} bytes)")
        os.chmod(out_path, 0o644)
    if all_present:
        return
    LOG.error("Some outputs are missing!")
    sys.exit(1)
