Mercurial > repos > goeckslab > tiling_pyhist
changeset 0:c051e9688932 draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 11356473f09dd54d86af28b74bd9ed097d07ca04
author | goeckslab |
---|---|
date | Thu, 03 Jul 2025 23:48:01 +0000 |
parents | |
children | |
files | Docker/Dockerfile README.md test-data/CMU-1-Small-Region.svs test-data/expected_output_CMU-1-Small-Region.zip tiling_pyhist.py tiling_pyhist.xml |
diffstat | 6 files changed, 354 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
FROM debian:stable

LABEL maintainer="Paulo Lyra" \
      version="1.0.0" \
      description="Docker image for PyHIST Galaxy tool"

# Install build tools, Python, and the OpenSlide system libraries needed by
# openslide-python. --no-install-recommends keeps the image slim; the apt
# cache is purged in the same layer so it never lands in the image.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update --fix-missing -qq && \
    apt-get install -y -q --no-install-recommends \
        build-essential \
        libgl1-mesa-glx \
        python3 \
        python3-pip \
        python3-venv \
        openslide-tools \
        python3-openslide \
        libmagic-dev \
        git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /pyhist

# Clone PyHIST repository.
# NOTE(review): cloning an unpinned branch is not reproducible — consider
# checking out a fixed tag or commit hash instead of master.
RUN git clone https://github.com/manuel-munoz-aguirre/PyHIST.git . && \
    git checkout master

# Create a virtual environment and install the Python dependencies.
# --no-cache-dir avoids baking the pip download cache into the layer.
RUN python3 -m venv /pyhist/venv && \
    /pyhist/venv/bin/pip install --no-cache-dir --upgrade pip && \
    /pyhist/venv/bin/pip install --no-cache-dir \
        pandas \
        opencv-python \
        numpy \
        Pillow \
        python-magic \
        openslide-python \
        psutil

# Compile PyHIST's graph-based segmentation binary (used instead of Otsu
# thresholding when present — see tiling_pyhist.py).
RUN cd src/graph_segmentation/ && \
    make && \
    chmod 755 segment

# Add venv binaries to PATH so "python"/"pip" resolve to the venv.
ENV PATH="/pyhist/venv/bin:$PATH"

# Make pyhist.py globally executable and accessible
RUN chmod +x /pyhist/pyhist.py && \
    ln -s /pyhist/pyhist.py /usr/local/bin/pyhist

# Ensure src directory is in PYTHONPATH for module imports
ENV PYTHONPATH="/pyhist:$PYTHONPATH"

# Set default command (optional, for debugging)
CMD ["/bin/bash"]
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Jul 03 23:48:01 2025 +0000 @@ -0,0 +1,2 @@ +# Galaxy-tiles +Galaxy tool to generate tiles — using PyHIST — from Whole Slide Images (WSIs)
# tiling_pyhist.py — tile Whole Slide Images with PyHIST and package the
# resulting PNG tiles into a single ZIP archive for Galaxy.
import argparse
import logging
import os
import subprocess
import sys
import tempfile
import zipfile
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Tuple

import openslide
import psutil
from pyhist import PySlide, TileGenerator
from src import utility_functions

# Configure logging to stdout so messages show up in Galaxy's tool output.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Constants
SEGMENT_BINARY_PATH = "/pyhist/src/graph_segmentation/segment"
DEFAULT_PATCH_SIZE = 256
DEFAULT_DOWNSCALE_FACTOR = 8
TILE_FORMAT = "png"
MEMORY_PER_WORKER = 1  # GB, estimated memory per worker process


def log_memory_usage() -> None:
    """Log the current memory usage of the process in megabytes."""
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    logging.info(
        "Memory usage: RSS=%.2f MB, VMS=%.2f MB",
        mem_info.rss / 1024 / 1024,
        mem_info.vms / 1024 / 1024
    )


def validate_slide(image_path: Path) -> None:
    """Validate the input image using OpenSlide.

    Raises:
        RuntimeError: if OpenSlide cannot open the file.
    """
    try:
        with openslide.OpenSlide(str(image_path)):
            logging.info("Validated input file with OpenSlide: %s", image_path)
    except openslide.OpenSlideError as error:
        # BUG FIX: RuntimeError("…: %s", error) stored the tuple unformatted;
        # format the message explicitly so the cause is readable.
        raise RuntimeError(f"Invalid input file: {error}") from error


def check_segmentation_binary() -> bool:
    """Check if the segmentation binary exists and is executable."""
    if os.path.exists(SEGMENT_BINARY_PATH) and os.access(SEGMENT_BINARY_PATH, os.X_OK):
        logging.info("Segmentation executable found: %s", SEGMENT_BINARY_PATH)
        return True
    logging.warning("Segmentation executable missing, using Otsu method")
    return False


def build_pyhist_config(image_path: Path, output_dir: Path) -> dict:
    """Build the configuration dictionary for PyHIST processing."""
    return {
        "svs": str(image_path),
        "patch_size": DEFAULT_PATCH_SIZE,
        "method": "otsu",
        "thres": 0.1,
        "output_downsample": DEFAULT_DOWNSCALE_FACTOR,
        "mask_downsample": DEFAULT_DOWNSCALE_FACTOR,
        "borders": "0000",
        "corners": "1010",
        "pct_bc": 1,
        "k_const": 1000,
        "minimum_segmentsize": 1000,
        "save_patches": True,
        "save_blank": False,
        "save_nonsquare": False,
        "save_tilecrossed_image": False,
        "save_mask": True,
        "save_edges": False,
        "info": "verbose",
        "output": str(output_dir),
        "format": TILE_FORMAT,
    }


def process_image_with_pyhist(
    image_path: Path, output_dir: Path, original_name: str
) -> Path:
    """Process a single image with PyHIST and return the tile directory.

    Args:
        image_path: Path to the slide file on disk.
        output_dir: Directory PyHIST writes its output into.
        original_name: Original file name (kept for interface compatibility;
            tiles are renamed later in append_tiles_to_zip).

    Raises:
        RuntimeError: if validation or tile extraction fails.
    """
    logging.info("Processing image: %s", image_path)
    log_memory_usage()

    # Validate input
    validate_slide(image_path)

    # Check segmentation method (PyHIST falls back to Otsu if absent)
    check_segmentation_binary()

    # Prepare PyHIST configuration
    config = build_pyhist_config(image_path, output_dir)

    # Set logging level based on config; default to INFO for unknown values
    # instead of raising KeyError.
    log_levels = {
        "default": logging.INFO,
        "verbose": logging.DEBUG,
        "silent": logging.CRITICAL,
    }
    logging.getLogger().setLevel(log_levels.get(config["info"], logging.INFO))

    # Process the slide
    utility_functions.check_image(config["svs"])
    slide = PySlide(config)
    logging.info("Slide loaded: %s", slide)

    tile_generator = TileGenerator(slide)
    logging.info("Tile generator initialized: %s", tile_generator)

    try:
        tile_generator.execute()
    except subprocess.CalledProcessError as error:
        # BUG FIX: format the message instead of passing a tuple of args.
        raise RuntimeError(f"Tile extraction failed: {error}") from error

    tile_dir = Path(slide.tile_folder)
    tiles = list(tile_dir.glob(f"*.{TILE_FORMAT}"))
    logging.info("Found %d tiles in %s", len(tiles), tile_dir)

    utility_functions.clean(slide)
    return tile_dir


def append_tiles_to_zip(
    zip_file: zipfile.ZipFile,
    original_name: str,
    tile_dir: Path
) -> None:
    """Append PNG tiles from the tile directory to the ZIP file.

    Tiles are stored under "<stem>/<stem>_<tileno>.png" so each input slide
    gets its own folder inside the archive.
    """
    original_base = Path(original_name).stem
    tiles = list(tile_dir.glob(f"*.{TILE_FORMAT}"))

    for tile in tiles:
        # PyHIST names tiles "<slide>_<number>.png"; keep only the number.
        tile_number = tile.stem.split("_")[-1]
        arcname = f"{original_base}/{original_base}_{tile_number}.{TILE_FORMAT}"
        zip_file.write(tile, arcname)

    logging.info("Appended %d tiles from %s", len(tiles), tile_dir)


def process_single_image(task: Tuple[Path, str, Path]) -> Path:
    """Process a single image task (worker entry point) and return its tile dir."""
    image_path, original_name, output_dir = task
    try:
        tile_dir = process_image_with_pyhist(
            image_path,
            output_dir,
            original_name
        )
        return tile_dir
    except Exception as error:
        logging.error("Error processing %s: %s", image_path, error)
        raise


def get_max_workers() -> int:
    """Determine the maximum number of worker processes based on available resources.

    Capped by both physical CPU cores and available memory (one worker is
    assumed to need MEMORY_PER_WORKER GB); always returns at least 1.
    """
    # BUG FIX: cpu_count(logical=False) may return None on some platforms,
    # which would make min(None, …) raise TypeError.
    cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
    available_memory = psutil.virtual_memory().available / (1024 ** 3)  # in GB
    max_workers_memory = available_memory // MEMORY_PER_WORKER
    max_workers = min(cpu_cores, max_workers_memory)
    return max(1, int(max_workers))


def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Tile extraction for Galaxy")
    parser.add_argument(
        "--input",
        action="append",
        help="Input image paths",
        default=[]
    )
    parser.add_argument(
        "--original_name",
        action="append",
        help="Original file names",
        default=[]
    )
    parser.add_argument(
        "--output_zip",
        required=True,
        help="Output ZIP file path"
    )
    return parser.parse_args()


def main() -> None:
    """Orchestrate tile extraction and ZIP creation with dynamic multiprocessing."""
    # Removed os.chdir("/pyhist") to stay in Galaxy's working directory
    logging.info("Working directory: %s", os.getcwd())

    args = parse_arguments()

    if len(args.input) != len(args.original_name):
        raise ValueError("Mismatch between input paths and original names")

    # TemporaryDirectory cleans itself up when the with-block exits.
    with tempfile.TemporaryDirectory(prefix="pyhist_tiles_", dir=os.getcwd()) as temp_dir_path:
        temp_dir = Path(temp_dir_path)
        logging.info("Created temporary directory: %s", temp_dir)

        # Prepare tasks with unique output directories (one per slide stem)
        tasks = [
            (Path(image_path), original_name, temp_dir / Path(original_name).stem)
            for image_path, original_name in zip(args.input, args.original_name)
        ]

        # Determine the number of worker processes based on available resources
        max_workers = get_max_workers()
        logging.info("Using %d worker processes", max_workers)

        # Process images in parallel
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            tile_dirs = list(executor.map(process_single_image, tasks))

        # Create the ZIP file and append all tiles
        with zipfile.ZipFile(args.output_zip, "w", zipfile.ZIP_DEFLATED) as zip_file:
            for (image_path, original_name, output_dir), tile_dir in zip(tasks, tile_dirs):
                append_tiles_to_zip(zip_file, original_name, tile_dir)

        logging.info("Final ZIP size: %d bytes", Path(args.output_zip).stat().st_size)
    # No need for shutil.rmtree as TemporaryDirectory cleans up automatically
    logging.info("Temporary directory cleaned up")


if __name__ == "__main__":
    main()
<tool id="tiling_pyhist" name="Tile Images with PyHIST" version="1.0.0">
    <description>Tile pathology images using PyHIST in parallel</description>

    <requirements>
        <container type="docker">quay.io/goeckslab/galaxy-tiler:1.0.0</container>
    </requirements>
    <!-- 137 (SIGKILL, typically the OOM killer) must precede the generic 1: range
         so it is classified as an out-of-memory failure. -->
    <stdio>
        <exit_code range="137" level="fatal_oom" description="Out of Memory" />
        <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error" />
    </stdio>
    <command><![CDATA[
        python "$__tool_directory__/tiling_pyhist.py"
        #for $i, $img in enumerate($input_collection)
            --input '${img}' --original_name "${img.element_identifier}"
        #end for
        --output_zip '$output_zip'
    ]]></command>

    <inputs>
        <param name="input_collection" type="data_collection" collection_type="list" format="svs,tiff,tif" label="Input Image Collection"
               help="Provide a dataset collection of pathology images (.svs, .tiff, .tif)." />
    </inputs>

    <outputs>
        <data name="output_zip" format="zip" label="Image Tiles (zip)" />
    </outputs>
    <tests>
        <test>
            <param name="input_collection">
                <collection type="list">
                    <element name="sample1" ftype="svs" value="CMU-1-Small-Region.svs" />
                </collection>
            </param>
            <output name="output_zip" file="expected_output_CMU-1-Small-Region.zip" compare="sim_size" delta="10000"/>
        </test>
    </tests>
    <help><![CDATA[
        **Tile Images with PyHIST**

        This tool tiles pathology images using PyHIST in parallel using Docker. It accepts a **collection** of pathology images:

        - A dataset collection of individual pathology images (e.g., `.svs`, `.tiff`, `.tif`).

        Each image in the collection will be processed in a batch using PyHIST. The output is a ZIP file containing tiled image folders named after the input images.

        **Inputs:**
        - **Input Image Collection**: Provide a collection of images to tile.

        **Outputs:**
        - **Tiled Images ZIP**: A single ZIP archive with all tiled outputs, one folder per input image.

        **Note**: Requires Docker on the Galaxy server. Ensure sufficient disk and CPU resources for parallel processing.
    ]]></help>
    <citations>
        <citation type="doi">10.1371/journal.pcbi.1008349</citation>
    </citations>
</tool>