batched_lastz: run_lastz_tarball.py comparison

comparison run_lastz_tarball.py @ 0:103538753e81 draft

planemo upload for repository https://github.com/richard-burhans/galaxytools/tree/main/tools/batched_lastz commit 7b119b432f721e228a73396d4e8f0d54350b0481

author	richard-burhans
date	Tue, 30 Apr 2024 21:06:58 +0000
parents
children	007990f98551

comparison

equal deleted inserted replaced

--1:000000000000
+:103538753e81
+#!/usr/bin/env python
+import argparse
+import concurrent.futures
+import json
+import multiprocessing
+import os
+import queue
+import re
+import shutil
+import sys
+import subprocess
+import tarfile
+import tempfile
+import typing
+import time
# Output formats accepted by lastz's --format option: lav, lav+text, axt,
# axt+, maf, maf+, maf-, sam, softsam, sam-, softsam-, cigar, BLASTN, PAF,
# PAF:wfmash, differences, rdotplot, text, general[:<fields>], or
# general-[:<fields>].  --format=none can be used when no alignment output
# is desired.  Matching is case-insensitive.
#
# The ":<fields>" suffix of general / general- is optional, so it is
# matched as an optional group rather than as mandatory trailing text.
lastz_output_format_regex = re.compile(
    r"^(?:axt\+?|blastn|cigar|differences|general-?(?::.+)?|lav|lav\+text"
    r"|maf[-+]?|none|paf(?::wfmash)?|rdotplot|sam-?|softsam-?|text)$",
    re.IGNORECASE,
)
def run_command(
    instance: int,
    input_queue: "queue.Queue[typing.Dict[str, typing.Any]]",
    output_queue: "queue.Queue[float]",
    debug: bool = False,
) -> None:
    """Worker loop: run queued lastz commands until a sentinel is received.

    The worker changes into ``galaxy/files`` and then repeatedly takes a
    command dict from ``input_queue``.  An empty dict is the sentinel that
    ends the loop.  For every other dict, ``lastz`` is run with the dict's
    ``"args"``; the ``"stdin"``, ``"stdout"`` and ``"stderr"`` entries are
    either ``None`` or pathnames used to redirect the matching stream.

    Args:
        instance: Index of this worker (currently unused).
        input_queue: Queue of command dicts, terminated by an empty dict.
        output_queue: Queue that receives the elapsed seconds of each
            successfully completed command.
        debug: When true, print each command's runtime to stderr.

    Raises:
        SystemExit: If lastz exits with a non-zero status, or if a stderr
            redirect file was requested and it cannot be examined or is
            not empty after the run.
    """
    os.chdir("galaxy/files")

    while True:
        command_dict = input_queue.get()

        # the empty dict is the sentinel: no more work for this worker
        if not command_dict:
            return

        args = ["lastz"]
        args.extend(command_dict["args"])
        command_line = " ".join(args)

        redirects: typing.Dict[str, typing.Any] = {}
        opened = []
        try:
            for stream, mode in (("stdin", "r"), ("stdout", "w"), ("stderr", "w")):
                pathname = command_dict[stream]
                if pathname is None:
                    redirects[stream] = None
                else:
                    handle = open(pathname, mode)
                    opened.append(handle)
                    redirects[stream] = handle

            begin = time.perf_counter()
            p = subprocess.run(args, **redirects)
        finally:
            # close the redirect files even if opening a later one or
            # launching lastz raised
            for handle in opened:
                handle.close()

        if p.returncode != 0:
            sys.exit(f"command failed: {command_line}")

        # lastz can exit with status 0 and still write diagnostics, so any
        # output in the stderr redirect file is treated as a failure
        stderr_pathname = command_dict["stderr"]
        if stderr_pathname is not None:
            try:
                statinfo = os.stat(stderr_pathname, follow_symlinks=False)
            except OSError:
                sys.exit(f"unable to stat stderr file: {command_line}")

            if statinfo.st_size != 0:
                sys.exit(f"stderr file is not empty: {command_line}")

        elapsed = time.perf_counter() - begin
        output_queue.put(elapsed)

        if debug:
            print(f"runtime {elapsed}", file=sys.stderr, flush=True)
class BatchTar:
    """Extract a batch tarball and load the commands it contains.

    Constructing an instance extracts the tarball into the current working
    directory and then reads ``galaxy/commands.json`` (one JSON object per
    line) into ``self.commands``.
    """

    def __init__(self, pathname: str, debug: bool = False) -> None:
        """Extract ``pathname`` and load its command list.

        Args:
            pathname: Path of the input tarball.
            debug: When true, print timing information to stderr.

        Raises:
            SystemExit: If the tarball or its command file is missing,
                unreadable or malformed.
        """
        self.pathname = pathname
        self.debug = debug
        # set by _extract(); the archive is closed again once extracted
        self.tarball: typing.Optional[tarfile.TarFile] = None
        self.commands: typing.List[typing.Dict[str, typing.Any]] = []
        self._extract()
        self._load_commands()

    def batch_commands(self) -> typing.Iterator[typing.Dict[str, typing.Any]]:
        """Yield the loaded command dicts in file order."""
        yield from self.commands

    def _load_commands(self) -> None:
        """Parse ``galaxy/commands.json`` line by line into ``self.commands``.

        Raises:
            SystemExit: If the file is missing, a line is not valid JSON,
                or a line does not have the expected structure.
        """
        try:
            f = open("galaxy/commands.json")
        except FileNotFoundError:
            sys.exit(
                f"ERROR: input tarball missing galaxy/commands.json: {self.pathname}"
            )

        begin = time.perf_counter()
        # "with" guarantees the file is closed on the sys.exit() paths too
        with f:
            for json_line in f:
                try:
                    command_dict = json.loads(json_line.rstrip("\n"))
                except json.JSONDecodeError:
                    sys.exit(
                        f"ERROR: bad json line in galaxy/commands.json: {self.pathname}"
                    )
                self._load_command(command_dict)

        elapsed = time.perf_counter() - begin

        if self.debug:
            print(
                f"loaded {len(self.commands)} commands in {elapsed} seconds ",
                file=sys.stderr,
                flush=True,
            )

    def _load_command(self, command_dict: typing.Dict[str, typing.Any]) -> None:
        """Validate one parsed command and append it to ``self.commands``.

        A valid command is a JSON object with a string ``executable``, a
        list ``args`` containing only strings, and ``stdin``, ``stdout``
        and ``stderr`` entries that are each a string or null.

        Raises:
            SystemExit: If ``command_dict`` does not have that structure.
        """
        none_type = type(None)
        field_types: typing.Dict[str, typing.Tuple[type, ...]] = {
            "executable": (str,),
            "args": (list,),
            "stdin": (str, none_type),
            "stdout": (str, none_type),
            "stderr": (str, none_type),
        }

        # json.loads() may return any JSON value, so confirm it is an
        # object before looking up fields in it
        well_formed = (
            isinstance(command_dict, dict)
            and all(
                name in command_dict and isinstance(command_dict[name], allowed)
                for name, allowed in field_types.items()
            )
            # all args must be strings
            and all(isinstance(arg, str) for arg in command_dict["args"])
        )

        if not well_formed:
            sys.exit(
                f"ERROR: unexpected json format in line in galaxy/commands.json: {self.pathname}"
            )

        self.commands.append(command_dict)

    def _extract(self) -> None:
        """Extract the tarball into the current working directory.

        Members are extracted with the ``"data"`` extraction filter.

        Raises:
            SystemExit: If the tarball cannot be found or cannot be read.
        """
        try:
            self.tarball = tarfile.open(
                name=self.pathname, mode="r:*", format=tarfile.GNU_FORMAT
            )
        except FileNotFoundError:
            sys.exit(f"ERROR: unable to find input tarball: {self.pathname}")
        except tarfile.ReadError:
            sys.exit(f"ERROR: error reading input tarball: {self.pathname}")

        begin = time.perf_counter()
        # the context manager closes the archive even if extraction fails
        with self.tarball:
            self.tarball.extractall(filter="data")
        elapsed = time.perf_counter() - begin

        if self.debug:
            print(
                f"Extracted tarball in {elapsed} seconds", file=sys.stderr, flush=True
            )
class TarRunner:
    """Run every lastz command of a batch tarball and gather the output.

    Construction extracts the tarball (via ``BatchTar``), makes sure each
    command has an explicit ``--output=`` and ``--format=``, and rewrites
    ``--target=`` / ``--query=`` into positional arguments.  ``run()``
    executes the commands in a pool of worker processes and then merges
    the per-command output files by format.
    """

    def __init__(
        self,
        input_pathname: str,
        output_pathname: str,
        parallel: int,
        debug: bool = False,
    ) -> None:
        """Load the batch and normalise its commands.

        Args:
            input_pathname: Path of the input tarball.
            output_pathname: Destination of the merged output when all
                commands share a single output format.
            parallel: Number of worker processes to use.
            debug: When true, print timing information to stderr.
        """
        self.input_pathname = input_pathname
        self.output_pathname = output_pathname
        self.parallel = parallel
        self.debug = debug
        self.batch_tar = BatchTar(self.input_pathname, debug=self.debug)
        # output file name -> output format
        self.output_file_format: typing.Dict[str, str] = {}
        # output format -> output file names, in command order
        self.output_files: typing.Dict[str, typing.List[str]] = {}
        self._set_output()
        self._set_target_query()

    def _set_output(self) -> None:
        """Ensure each command names an output file and a valid format.

        Commands lacking ``--output=`` get a fresh file created in
        ``galaxy/files``; commands lacking ``--format=`` get ``lav``.
        The resulting file/format pairs are recorded for ``_cleanup()``.

        Raises:
            SystemExit: If a command requests an unknown output format.
        """
        format_prefix = "--format="
        output_prefix = "--output="

        for command_dict in self.batch_tar.batch_commands():
            output_file = None
            output_format = None

            for arg in command_dict["args"]:
                if arg.startswith(format_prefix):
                    output_format = arg[len(format_prefix):]
                elif arg.startswith(output_prefix):
                    output_file = arg[len(output_prefix):]

            if output_file is None:
                # only the basename is kept: workers run inside galaxy/files
                with tempfile.NamedTemporaryFile(
                    dir="galaxy/files", delete=False
                ) as f:
                    output_file = os.path.basename(f.name)
                command_dict["args"].append(f"--output={output_file}")

            if output_format is None:
                output_format = "lav"
                command_dict["args"].append(f"--format={output_format}")

            if not lastz_output_format_regex.match(output_format):
                sys.exit(f"ERROR: invalid output format: {output_format}")

            self.output_file_format[output_file] = output_format

        for output_file, output_format in self.output_file_format.items():
            self.output_files.setdefault(output_format, []).append(output_file)

    def _set_target_query(self) -> None:
        """Turn ``--target=`` / ``--query=`` into leading positional args.

        The target (if any) always comes first, the query (if any) next,
        and all remaining arguments follow in their original order,
        regardless of where the two options appeared in the command.
        """
        target_prefix = "--target="
        query_prefix = "--query="

        for command_dict in self.batch_tar.batch_commands():
            target = None
            query = None
            other_args: typing.List[str] = []

            for arg in command_dict["args"]:
                if arg.startswith(target_prefix):
                    target = arg[len(target_prefix):]
                elif arg.startswith(query_prefix):
                    query = arg[len(query_prefix):]
                else:
                    other_args.append(arg)

            positional = [name for name in (target, query) if name is not None]
            command_dict["args"] = positional + other_args

    def run(self) -> None:
        """Execute all commands in worker processes, then merge the output.

        Raises:
            SystemExit: If any worker was cancelled or raised; the
                worker's own message is included when it has one.
        """
        run_times: typing.List[float] = []
        begin = time.perf_counter()

        with multiprocessing.Manager() as manager:
            input_queue: "queue.Queue[typing.Dict[str, typing.Any]]" = (
                manager.Queue()
            )
            output_queue: "queue.Queue[float]" = manager.Queue()

            for command_dict in self.batch_tar.batch_commands():
                input_queue.put(command_dict)

            # use the empty dict as a sentinel, one per worker
            for _ in range(self.parallel):
                input_queue.put({})

            with concurrent.futures.ProcessPoolExecutor(
                max_workers=self.parallel
            ) as executor:
                futures = [
                    executor.submit(
                        run_command,
                        instance,
                        input_queue,
                        output_queue,
                        debug=self.debug,
                    )
                    for instance in range(self.parallel)
                ]

            for f in concurrent.futures.as_completed(futures):
                if f.cancelled():
                    sys.exit("lastz command failed")
                error = f.exception()
                if error is not None:
                    # a worker's sys.exit() message only reaches us through
                    # the future, so pass it on instead of dropping it
                    detail = str(error)
                    if detail:
                        sys.exit(f"lastz command failed: {detail}")
                    sys.exit("lastz command failed")

            while not output_queue.empty():
                run_times.append(output_queue.get())

        elapsed = time.perf_counter() - begin

        if self.debug:
            print(f"elapsed {elapsed}", file=sys.stderr, flush=True)
            print(
                f"ran {len(run_times)} commands, total runtime {sum(run_times)}",
                file=sys.stderr,
                flush=True,
            )

        self._cleanup()

    def _cleanup(self) -> None:
        """Concatenate per-command outputs into one file per format.

        Each format's files from ``galaxy/files`` are appended, in order,
        to ``output.<format>`` in the current directory.  When exactly one
        format was produced, that file is also copied to
        ``self.output_pathname``; otherwise nothing is copied there.
        """
        for file_type, file_list in self.output_files.items():
            with open(f"output.{file_type}", "w") as ofh:
                for filename in file_list:
                    with open(f"galaxy/files/{filename}") as ifh:
                        shutil.copyfileobj(ifh, ofh)

        if len(self.output_files) == 1:
            (file_type,) = self.output_files
            shutil.copy2(f"output.{file_type}", self.output_pathname)
def main() -> None:
    """Parse the command line and run the batch.

    Options: ``--input`` (tarball to process), ``--output`` (merged
    output file), ``--parallel`` (number of worker processes, default 1)
    and ``--debug`` (print timing information to stderr).

    Raises:
        SystemExit: If the running Python lacks tarfile extraction
            filters, or the command line is invalid.
    """
    # the tarball is extracted with filter="data", which needs this support
    if not hasattr(tarfile, "data_filter"):
        sys.exit("ERROR: extracting may be unsafe; consider updating Python")

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--parallel", type=int, default=1, required=False)
    parser.add_argument("--debug", action="store_true", required=False)
    args = parser.parse_args()

    # reject this up front: the process pool refuses a worker count below
    # one, but only after the tarball has already been extracted
    if args.parallel < 1:
        parser.error("--parallel must be a positive integer")

    runner = TarRunner(args.input, args.output, args.parallel, args.debug)
    runner.run()


if __name__ == "__main__":
    main()

Mercurial > repos > richard-burhans > batched_lastz

comparison run_lastz_tarball.py @ 0:103538753e81 draft