diff utils.py @ 0:375c36923da1 draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
author goeckslab
date Tue, 09 Dec 2025 23:49:47 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Tue Dec 09 23:49:47 2025 +0000
@@ -0,0 +1,168 @@
+import json
+import logging
+import os
+import random
+import sys
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import pandas as pd
+import torch
+
+# Module-level logger, named after this module.
+LOG = logging.getLogger(__name__)
+
+
+def str2bool(val) -> bool:
+    """Parse common truthy strings to bool."""
+    return str(val).strip().lower() in ("1", "true", "yes", "y")
+
+
+def load_user_hparams(hp_arg: Optional[str]) -> dict:
+    """Parse --hyperparameters (inline JSON or path to .json)."""
+    if not hp_arg:
+        return {}
+    try:
+        s = hp_arg.strip()
+        if s.startswith("{"):
+            return json.loads(s)
+        with open(s, "r") as f:
+            return json.load(f)
+    except Exception as e:
+        LOG.warning(f"Could not parse --hyperparameters: {e}. Ignoring.")
+        return {}
+
+
+def set_seeds(seed: int = 42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+def ensure_local_tmp():
+    os.makedirs("/tmp", exist_ok=True)
+
+
+def enable_tensor_cores_if_available():
+    if torch.cuda.is_available():
+        torch.set_float32_matmul_precision("high")
+
+
+def enable_deterministic_mode(seed: Optional[int] = None):
+    """
+    Force deterministic algorithms where possible to reduce run-to-run variance.
+    """
+    if seed is not None:
+        set_seeds(seed)
+        os.environ.setdefault("PYTHONHASHSEED", str(int(seed)))
+    # cuBLAS determinism
+    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
+    try:
+        torch.use_deterministic_algorithms(True)
+    except Exception as e:
+        LOG.warning(f"Could not enable torch deterministic algorithms: {e}")
+    try:
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    except Exception as e:
+        LOG.warning(f"Could not enforce deterministic cuDNN settings: {e}")
+    try:
+        torch.backends.cuda.matmul.allow_tf32 = False
+    except Exception:
+        pass
+    try:
+        torch.backends.cudnn.allow_tf32 = False
+    except Exception:
+        pass
+
+
+def load_file(path: str) -> pd.DataFrame:
+    if not path:
+        return None
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"Dataset not found: {path}")
+    return pd.read_csv(path, sep=None, engine="python")
+
+
+def prepare_image_search_dirs(args) -> Optional[Path]:
+    if not args.images_zip:
+        return None
+
+    root = Path(tempfile.mkdtemp(prefix="autogluon_images_"))
+    LOG.info(f"Extracting {len(args.images_zip)} image ZIP(s) to {root}")
+
+    for zip_path in args.images_zip:
+        path = Path(zip_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Image ZIP not found: {zip_path}")
+        with zipfile.ZipFile(path, 'r') as z:
+            z.extractall(root)
+        LOG.info(f"Extracted {path.name}")
+
+    return root
+
+
+def absolute_path_expander(df: pd.DataFrame, extracted_root: Optional[Path], image_columns: Optional[List[str]]) -> List[str]:
+    """
+    Resolve image paths to absolute paths. If no image_columns are provided,
+    infers candidate columns whose values resolve to existing files (checking
+    absolute paths first, then paths relative to the extracted_root).
+    """
+    if df is None or df.empty:
+        return []
+
+    image_columns = [c for c in (image_columns or []) if c in df.columns]
+
+    def resolve(p):
+        if pd.isna(p):
+            return None
+        orig = Path(str(p).strip())
+        candidates = []
+        if orig.is_absolute():
+            candidates.append(orig)
+        if extracted_root is not None:
+            candidates.extend([extracted_root / orig, extracted_root / orig.name])
+        for cand in candidates:
+            if cand.exists():
+                return str(cand.resolve())
+        return None
+
+    # Infer image columns if none were provided
+    if not image_columns:
+        obj_cols = [c for c in df.columns if str(df[c].dtype) == "object"]
+        inferred = []
+        for col in obj_cols:
+            sample = df[col].dropna().head(50)
+            if sample.empty:
+                continue
+            resolved_sample = sample.apply(resolve)
+            if resolved_sample.notna().any():
+                inferred.append(col)
+        image_columns = inferred
+        if image_columns:
+            LOG.info(f"Inferred image columns: {image_columns}")
+
+    for col in image_columns:
+        df[col] = df[col].apply(resolve)
+
+    return image_columns
+
+
+def verify_outputs(paths):
+    ok = True
+    for p, desc in paths:
+        if os.path.exists(p):
+            size = os.path.getsize(p)
+            LOG.info(f"✓ Output {desc}: {p} ({size:,} bytes)")
+            os.chmod(p, 0o644)
+        else:
+            LOG.error(f"✗ Output {desc} MISSING: {p}")
+            ok = False
+    if not ok:
+        LOG.error("Some outputs are missing!")
+        sys.exit(1)