Mercurial > repos > richard-burhans > segalign
diff runner.py @ 21:25fa179d9d0a draft
planemo upload for repository https://github.com/richard-burhans/galaxytools/tree/main/tools/segalign commit e4e05d23d9da18ea87bc352122ca9e6cfa73d1c7
author | richard-burhans |
---|---|
date | Fri, 09 Aug 2024 20:23:12 +0000 |
parents | 96ff17622b17 |
children |
line wrap: on
line diff
--- a/runner.py Wed Aug 07 20:34:45 2024 +0000
+++ b/runner.py Fri Aug 09 20:23:12 2024 +0000
@@ -314,7 +314,7 @@
             run_args = ["python", f"{args.tool_directory}/diagonal_partition.py", str(chunk_size)]
             for word in line.split():
                 run_args.append(word)
-            process = subprocess.run(run_args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            process = subprocess.run(run_args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, text=True)
 
             for line in process.stdout.splitlines():
                 output_q.put(line)
@@ -327,6 +327,8 @@
 
 
 def estimate_chunk_size(args: argparse.Namespace) -> int:
+    # only used when segment size is being estimated
+    MAX_CHUNK_SIZE = 50000
     chunk_size = -1
     line_size = -1
 
@@ -353,10 +355,19 @@
 
             fdict[entry.name.split(".split", 1)[0]] += file_size
 
-    # if noot enough segment files for estimation, continue
-    if len(fdict) > 2:
+    if len(fdict) < 7:
+        # outliers can heavily skew prediction if <7 data points
+        # to be safe, use 50% quantile
+        chunk_size = int(statistics.quantiles(fdict.values())[1] // line_size)
+    else:
+        # otherwise use 75% quantile
         chunk_size = int(statistics.quantiles(fdict.values())[-1] // line_size)
 
+    # if not enough data points, there is a chance of getting unlucky
+    # minimize worst case by using MAX_CHUNK_SIZE
+
+    chunk_size = min(chunk_size, MAX_CHUNK_SIZE)
+
     if args.debug:
         ns: int = time.monotonic_ns() - beg
         r_end = resource.getrusage(resource.RUSAGE_SELF)