Mercurial > repos > goeckslab > multimodal_learner
annotate utils.py @ 2:b708d0e210e6 draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
| author | goeckslab |
|---|---|
| date | Sat, 10 Jan 2026 16:13:19 +0000 |
| parents | 375c36923da1 |
| children |
| rev | line source |
|---|---|
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
1 import errno |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
2 import json |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
3 import logging |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
4 import os |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
5 import random |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
6 import sys |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
7 import tempfile |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
8 import zipfile |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
9 from collections import OrderedDict |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
10 from pathlib import Path |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
11 from typing import List, Optional |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
12 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
13 import numpy as np |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
14 import pandas as pd |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
15 import torch |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
16 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
# Lower-case file suffixes that count as images when an extracted tree is indexed.
_IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".gif",
    ".tif",
    ".tiff",
    ".webp",
    ".svs",
}
# A single path component longer than this many characters triggers a warning.
_MAX_PATH_COMPONENT = 255
# NOTE(review): presumably the maximum number of roots kept in the index cache;
# the code that enforces it is not visible in this part of the file - confirm.
_MAX_EXTRACTED_INDEX_CACHE_SIZE = 2
# An index with more entries than this is not stored in the cache.
_MAX_EXTRACTED_INDEX_FILES = 100000
# Keyed by the stringified extracted root; each value is an (mtime_ns, index) pair.
_EXTRACTED_INDEX_CACHE = OrderedDict()
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
33 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
34 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def str2bool(val) -> bool:
    """Return True when ``val`` spells a common affirmative string.

    The value is converted with ``str()``, trimmed and lower-cased, then
    compared against ``1``, ``true``, ``yes`` and ``y``. Anything else
    yields False.
    """
    affirmative = ("1", "true", "yes", "y")
    normalized = str(val).strip().lower()
    return normalized in affirmative
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
38 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
39 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def load_user_hparams(hp_arg: Optional[str]) -> dict:
    """Parse --hyperparameters (inline JSON or path to .json).

    Args:
        hp_arg: Either an inline JSON object (text starting with ``{`` after
            trimming) or the path of a JSON file. A falsy value means that
            no hyperparameters were supplied.

    Returns:
        The parsed hyperparameter dict. An empty dict is returned when
        nothing was supplied, when reading/parsing fails, or when the parsed
        JSON is not an object. Failures are logged as warnings, never raised.
    """
    if not hp_arg:
        return {}
    try:
        s = hp_arg.strip()
        if s.startswith("{"):
            parsed = json.loads(s)
        else:
            # JSON text is UTF-8; do not depend on the locale's default encoding.
            with open(s, "r", encoding="utf-8") as f:
                parsed = json.load(f)
    except Exception as e:
        # Deliberately broad: this is a best-effort option and must not abort the run.
        LOG.warning("Could not parse --hyperparameters: %s. Ignoring.", e)
        return {}
    if not isinstance(parsed, dict):
        # A JSON file may hold a list or scalar; callers expect a mapping.
        LOG.warning(
            "Could not parse --hyperparameters: expected a JSON object, got %s. Ignoring.",
            type(parsed).__name__,
        )
        return {}
    return parsed
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
53 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
54 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def set_seeds(seed: int = 42):
    """Seed the Python ``random``, NumPy and PyTorch generators with ``seed``.

    When CUDA is available, ``torch.cuda.manual_seed_all`` is called with the
    same seed as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
61 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
62 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def ensure_local_tmp():
    """Create the ``/tmp`` directory if it does not already exist."""
    tmp_dir = "/tmp"
    os.makedirs(tmp_dir, exist_ok=True)
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
65 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
66 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def enable_tensor_cores_if_available():
    """Set float32 matmul precision to ``"high"`` when CUDA is available.

    Does nothing when ``torch.cuda.is_available()`` is False.
    """
    if not torch.cuda.is_available():
        return
    torch.set_float32_matmul_precision("high")
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
70 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
71 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def enable_deterministic_mode(seed: Optional[int] = None):
    """
    Force deterministic algorithms where possible to reduce run-to-run variance.

    Every step is best-effort: a failure is logged and the remaining steps
    still run, so this function does not raise for unsupported torch settings.

    Args:
        seed: When given, all RNGs are seeded through ``set_seeds`` and
            ``PYTHONHASHSEED`` is exported unless it is already set.
    """
    if seed is not None:
        set_seeds(seed)
        # PYTHONHASHSEED is read at interpreter start-up, so this only affects
        # processes launched afterwards, not the hashing of the current one.
        os.environ.setdefault("PYTHONHASHSEED", str(int(seed)))
    # cuBLAS determinism
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    try:
        torch.use_deterministic_algorithms(True)
    except Exception as e:
        LOG.warning(f"Could not enable torch deterministic algorithms: {e}")
    try:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception as e:
        LOG.warning(f"Could not enforce deterministic cuDNN settings: {e}")
    # The TF32 switches may be missing on some torch builds; record the failure
    # at debug level instead of swallowing it silently.
    try:
        torch.backends.cuda.matmul.allow_tf32 = False
    except Exception as e:
        LOG.debug("Could not disable TF32 for CUDA matmul: %s", e)
    try:
        torch.backends.cudnn.allow_tf32 = False
    except Exception as e:
        LOG.debug("Could not disable TF32 for cuDNN: %s", e)
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
98 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
99 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def load_file(path: str) -> Optional[pd.DataFrame]:
    """Load a delimited text dataset into a DataFrame.

    The file is read with ``sep=None`` and pandas' Python engine, which lets
    pandas detect the delimiter.

    Args:
        path: Location of the dataset. A falsy value means "no dataset".

    Returns:
        The parsed DataFrame, or ``None`` when ``path`` is falsy.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path:
        return None
    dataset_path = Path(path)
    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset not found: {dataset_path}")
    return pd.read_csv(dataset_path, sep=None, engine="python")
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
107 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
108 |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
109 def _normalize_path_value(val: object) -> Optional[str]: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
110 if val is None: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
111 return None |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
112 s = str(val).strip().strip('"').strip("'") |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
113 return s if s else None |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
114 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
115 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
def _warn_if_long_component(path_str: str) -> None:
    """Log one warning if any component of ``path_str`` is overly long.

    Backslashes are treated like forward slashes when splitting the path.
    A component is too long when it has more than ``_MAX_PATH_COMPONENT``
    characters. At most one warning is emitted per call.
    """
    components = path_str.replace("\\", "/").split("/")
    if any(len(component) > _MAX_PATH_COMPONENT for component in components):
        LOG.warning(
            "Path component exceeds %d chars; resolution may fail: %s",
            _MAX_PATH_COMPONENT,
            path_str,
        )
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
125 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
126 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
127 def _build_extracted_index(extracted_root: Optional[Path]) -> set: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
128 if extracted_root is None: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
129 return set() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
130 index = set() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
131 for root, _dirs, files in os.walk(extracted_root): |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
132 rel_root = os.path.relpath(root, extracted_root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
133 for fname in files: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
134 ext = os.path.splitext(fname)[1].lower() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
135 if ext not in _IMAGE_EXTENSIONS: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
136 continue |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
137 rel_path = fname if rel_root == "." else os.path.join(rel_root, fname) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
138 index.add(rel_path.replace("\\", "/")) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
139 index.add(fname) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
140 return index |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
141 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
142 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
143 def _get_cached_extracted_index(extracted_root: Optional[Path]) -> set: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
144 if extracted_root is None: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
145 return set() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
146 try: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
147 root = extracted_root.resolve() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
148 except Exception: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
149 root = extracted_root |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
150 cache_key = str(root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
151 try: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
152 mtime_ns = root.stat().st_mtime_ns |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
153 except OSError: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
154 _EXTRACTED_INDEX_CACHE.pop(cache_key, None) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
155 return _build_extracted_index(root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
156 cached = _EXTRACTED_INDEX_CACHE.get(cache_key) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
157 if cached: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
158 cached_mtime, cached_index = cached |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
159 if cached_mtime == mtime_ns: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
160 _EXTRACTED_INDEX_CACHE.move_to_end(cache_key) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
161 LOG.debug("Using cached extracted index for %s (%d entries)", root, len(cached_index)) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
162 return cached_index |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
163 _EXTRACTED_INDEX_CACHE.pop(cache_key, None) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
164 LOG.debug("Invalidated extracted index cache for %s (mtime changed)", root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
165 else: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
166 LOG.debug("No extracted index cache for %s; building", root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
167 index = _build_extracted_index(root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
168 if len(index) <= _MAX_EXTRACTED_INDEX_FILES: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
169 _EXTRACTED_INDEX_CACHE[cache_key] = (mtime_ns, index) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
170 _EXTRACTED_INDEX_CACHE.move_to_end(cache_key) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
171 while len(_EXTRACTED_INDEX_CACHE) > _MAX_EXTRACTED_INDEX_CACHE_SIZE: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
172 _EXTRACTED_INDEX_CACHE.popitem(last=False) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
173 else: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
174 LOG.debug("Extracted index has %d entries; skipping cache for %s", len(index), root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
175 return index |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
176 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
177 |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def prepare_image_search_dirs(args) -> Optional[Path]:
    """Extract every image ZIP listed in ``args.images_zip`` into one temp dir.

    Args:
        args: Object exposing ``images_zip``, a sized iterable of ZIP file
            paths (falsy when no archives were supplied).

    Returns:
        The newly created extraction directory, or ``None`` when
        ``args.images_zip`` is falsy.

    Raises:
        FileNotFoundError: If one of the listed ZIP paths does not exist.

    Any exception raised while extracting (missing archive, corrupt ZIP,
    I/O error) removes the partially populated temporary directory before
    propagating, so a failed call does not leave files behind.
    """
    if not args.images_zip:
        return None

    import shutil  # local import: only needed to clean up on failure

    root = Path(tempfile.mkdtemp(prefix="autogluon_images_"))
    LOG.info("Extracting %d image ZIP(s) to %s", len(args.images_zip), root)

    try:
        for zip_path in args.images_zip:
            path = Path(zip_path)
            if not path.exists():
                raise FileNotFoundError(f"Image ZIP not found: {zip_path}")
            with zipfile.ZipFile(path, "r") as z:
                z.extractall(root)
            LOG.info("Extracted %s", path.name)
    except BaseException:
        # Do not leak the temp directory when the caller never receives it.
        shutil.rmtree(root, ignore_errors=True)
        raise

    return root
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
194 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
195 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def absolute_path_expander(df: pd.DataFrame, extracted_root: Optional[Path], image_columns: Optional[List[str]]) -> List[str]:
    """
    Resolve image paths in ``df`` to absolute paths, modifying ``df`` in place.

    Every processed column is rewritten so that each value becomes the
    absolute path of an existing file, or ``None`` when no candidate exists.
    Candidates are the value itself when it is absolute, then the value
    joined to ``extracted_root``, then its basename joined to
    ``extracted_root``.

    If none of the requested ``image_columns`` exist in ``df`` (or none were
    given), candidate columns are inferred from the object-dtype columns: a
    column qualifies when any of its first 50 non-null values is found in the
    index returned by ``_get_cached_extracted_index(extracted_root)`` or
    resolves to an existing file.

    Args:
        df: Frame whose image columns are rewritten in place.
        extracted_root: Directory holding extracted images, or ``None``.
        image_columns: Explicit image column names, or ``None`` to infer them.

    Returns:
        The list of image columns that were processed; ``[]`` when ``df`` is
        ``None`` or empty.
    """
    if df is None or df.empty:
        return []

    image_columns = [c for c in (image_columns or []) if c in df.columns]
    extracted_index = None

    def get_extracted_index() -> set:
        # Fetch the extracted-file index lazily, at most once per call.
        nonlocal extracted_index
        if extracted_index is None:
            extracted_index = _get_cached_extracted_index(extracted_root)
        return extracted_index

    def resolve(p):
        # Map one cell value to the absolute path of an existing file, or None.
        if pd.isna(p):
            return None
        raw = _normalize_path_value(p)
        if not raw:
            return None
        _warn_if_long_component(raw)
        orig = Path(raw)
        candidates = []
        if orig.is_absolute():
            candidates.append(orig)
        if extracted_root is not None:
            candidates.extend([extracted_root / orig, extracted_root / orig.name])
        for cand in candidates:
            try:
                if cand.exists():
                    return str(cand.resolve())
            except OSError as e:
                # Best effort: an unusable candidate is skipped; only the
                # "name too long" case is worth reporting.
                if e.errno == errno.ENAMETOOLONG:
                    LOG.warning("Path too long for filesystem: %s", cand)
                continue
        return None

    def matches_extracted(p) -> bool:
        # Cheap membership test against the extracted-file index.
        if pd.isna(p):
            return False
        raw = _normalize_path_value(p)
        if not raw:
            return False
        _warn_if_long_component(raw)
        index = get_extracted_index()
        if not index:
            return False
        norm = raw.replace("\\", "/")
        # Remove only leading "./" segments and root slashes. The previous
        # str.lstrip("./") stripped a *character set*, which also removed the
        # dot of hidden names (".cache/a.png" -> "cache/a.png") and of "../".
        while True:
            if norm.startswith("./"):
                norm = norm[2:]
            elif norm.startswith("/"):
                norm = norm[1:]
            else:
                break
        return norm in index

    # Infer image columns if none were provided
    if not image_columns:
        obj_cols = [c for c in df.columns if str(df[c].dtype) == "object"]
        inferred = []
        for col in obj_cols:
            sample = df[col].dropna().head(50)
            if sample.empty:
                continue
            index = get_extracted_index() if extracted_root is not None else set()
            # Fast path: index lookup avoids touching the filesystem.
            if index and sample.apply(matches_extracted).any():
                inferred.append(col)
                continue
            # Fallback: probe the filesystem for each sampled value.
            if sample.apply(resolve).notna().any():
                inferred.append(col)
        image_columns = inferred
        if image_columns:
            LOG.info("Inferred image columns: %s", image_columns)

    for col in image_columns:
        df[col] = df[col].apply(resolve)

    return image_columns
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
278 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
279 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def verify_outputs(paths):
    """Check that every expected output exists and make it world-readable.

    Args:
        paths: Iterable of ``(path, description)`` pairs.

    Each existing path is logged with its size and chmod-ed to ``0o644``.
    Each missing path is logged as an error; after all pairs have been
    checked, the process exits with status 1 if any were missing.
    """
    missing_any = False
    for out_path, label in paths:
        if not os.path.exists(out_path):
            LOG.error(f"✗ Output {label} MISSING: {out_path}")
            missing_any = True
            continue
        n_bytes = os.path.getsize(out_path)
        LOG.info(f"✓ Output {label}: {out_path} ({n_bytes:,} bytes)")
        os.chmod(out_path, 0o644)

    if missing_any:
        LOG.error("Some outputs are missing!")
        sys.exit(1)
