utils.py @ 0:375c36923da1 draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
author goeckslab
date Tue, 09 Dec 2025 23:49:47 +0000
import json
import logging
import os
import random
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
import torch

LOG = logging.getLogger(__name__)


def str2bool(val) -> bool:
    """Parse common truthy strings to bool."""
    return str(val).strip().lower() in ("1", "true", "yes", "y")


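# Illustrative only (not part of the original module): anything outside the
# accepted truthy set parses as False.
#   str2bool("True")   # -> True
#   str2bool(" yes ")  # -> True
#   str2bool("0")      # -> False
#   str2bool(None)     # -> False

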
def load_user_hparams(hp_arg: Optional[str]) -> dict:
    """Parse --hyperparameters (inline JSON or path to .json)."""
    if not hp_arg:
        return {}
    try:
        s = hp_arg.strip()
        if s.startswith("{"):
            return json.loads(s)
        with open(s, "r") as f:
            return json.load(f)
    except Exception as e:
        LOG.warning(f"Could not parse --hyperparameters: {e}. Ignoring.")
        return {}


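# Usage sketch (the values below are illustrative, not from the original tool):
#   load_user_hparams('{"epochs": 10, "lr": 0.001}')  # inline JSON string
#   load_user_hparams("hparams.json")                 # path to a JSON file
#   load_user_hparams(None)                           # -> {}
# Malformed input logs a warning and falls back to {} rather than raising.

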
def set_seeds(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def ensure_local_tmp():
    os.makedirs("/tmp", exist_ok=True)


def enable_tensor_cores_if_available():
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")


def enable_deterministic_mode(seed: Optional[int] = None):
    """
    Force deterministic algorithms where possible to reduce run-to-run variance.
    """
    if seed is not None:
        set_seeds(seed)
        os.environ.setdefault("PYTHONHASHSEED", str(int(seed)))
    # cuBLAS determinism
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    try:
        torch.use_deterministic_algorithms(True)
    except Exception as e:
        LOG.warning(f"Could not enable torch deterministic algorithms: {e}")
    try:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception as e:
        LOG.warning(f"Could not enforce deterministic cuDNN settings: {e}")
    try:
        torch.backends.cuda.matmul.allow_tf32 = False
    except Exception:
        pass
    try:
        torch.backends.cudnn.allow_tf32 = False
    except Exception:
        pass


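# Usage sketch (assumed call site, not shown in this module): call once at
# startup, before any model or dataloader is constructed, e.g.
#   enable_deterministic_mode(seed=42)
# Note that torch.use_deterministic_algorithms(True) does not fail here for
# non-deterministic ops; such ops raise later, when they are actually called.

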
def load_file(path: str) -> Optional[pd.DataFrame]:
    if not path:
        return None
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Dataset not found: {path}")
    # sep=None with the python engine lets pandas sniff the delimiter
    # (comma, tab, etc.) instead of assuming plain CSV.
    return pd.read_csv(path, sep=None, engine="python")


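# Illustrative usage (file names are hypothetical):
#   df = load_file("train.tsv")   # tab- or comma-delimited both work
#   load_file("")                 # -> None
#   load_file("missing.csv")      # raises FileNotFoundError

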
def prepare_image_search_dirs(args) -> Optional[Path]:
    if not args.images_zip:
        return None

    root = Path(tempfile.mkdtemp(prefix="autogluon_images_"))
    LOG.info(f"Extracting {len(args.images_zip)} image ZIP(s) to {root}")

    for zip_path in args.images_zip:
        path = Path(zip_path)
        if not path.exists():
            raise FileNotFoundError(f"Image ZIP not found: {zip_path}")
        with zipfile.ZipFile(path, 'r') as z:
            z.extractall(root)
        LOG.info(f"Extracted {path.name}")

    return root


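# Usage sketch (the argparse wiring is assumed, not defined in this module):
#   args = argparse.Namespace(images_zip=["images_a.zip", "images_b.zip"])
#   extracted_root = prepare_image_search_dirs(args)
# All archives are unpacked into one shared temporary directory, so entries
# with identical relative paths in different ZIPs overwrite each other.

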
def absolute_path_expander(
    df: pd.DataFrame,
    extracted_root: Optional[Path],
    image_columns: Optional[List[str]],
) -> List[str]:
    """
    Resolve image paths to absolute paths. If no image_columns are provided,
    infers candidate columns whose values resolve to existing files (checking
    absolute paths first, then paths relative to the extracted_root).
    """
    if df is None or df.empty:
        return []

    image_columns = [c for c in (image_columns or []) if c in df.columns]

    def resolve(p):
        if pd.isna(p):
            return None
        orig = Path(str(p).strip())
        candidates = []
        if orig.is_absolute():
            candidates.append(orig)
        if extracted_root is not None:
            candidates.extend([extracted_root / orig, extracted_root / orig.name])
        for cand in candidates:
            if cand.exists():
                return str(cand.resolve())
        return None

    # Infer image columns if none were provided
    if not image_columns:
        obj_cols = [c for c in df.columns if str(df[c].dtype) == "object"]
        inferred = []
        for col in obj_cols:
            sample = df[col].dropna().head(50)
            if sample.empty:
                continue
            resolved_sample = sample.apply(resolve)
            if resolved_sample.notna().any():
                inferred.append(col)
        image_columns = inferred
        if image_columns:
            LOG.info(f"Inferred image columns: {image_columns}")

    for col in image_columns:
        df[col] = df[col].apply(resolve)

    return image_columns


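# Usage sketch (column names are hypothetical): mutates df in place, rewriting
# each image column to absolute paths (or None where no file can be found),
# and returns the list of columns it touched.
#   cols = absolute_path_expander(df, extracted_root, image_columns=["image"])
#   cols = absolute_path_expander(df, extracted_root, image_columns=None)  # infer

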
def verify_outputs(paths):
    ok = True
    for p, desc in paths:
        if os.path.exists(p):
            size = os.path.getsize(p)
            LOG.info(f"✓ Output {desc}: {p} ({size:,} bytes)")
            os.chmod(p, 0o644)
        else:
            LOG.error(f"✗ Output {desc} MISSING: {p}")
            ok = False
    if not ok:
        LOG.error("Some outputs are missing!")
        sys.exit(1)
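

# Usage sketch (paths are illustrative): expects (path, description) pairs,
# marks existing outputs world-readable, and exits with status 1 if any are
# missing.
#   verify_outputs([
#       ("results.html", "HTML report"),
#       ("model.zip", "trained model archive"),
#   ])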