parallel.py @ 16:5376e1c9adec draft (Mercurial repository: petr-novak / re_utils)
author:  petr-novak
date:    Fri, 24 Apr 2020 08:54:30 -0400
parents: a4cd8608ef6b
#!/usr/bin/env python3
import multiprocessing
import os
import time
'''
Functions for parallel processing of data chunks using a worker function
'''


def run_multiple_pbs_jobs(cmds, status_files, qsub_params=""):
    '''
    Examples of qsub_params:
    -l walltime=1000:00:00,nodes=1:ppn=8,mem=15G
    -l walltime=150:00:00,nodes=1:ppn=1
    '''
    jobs = []
    status_function = []
    status_command = []
    for cmd, sf in zip(cmds, status_files):
        jobs.append(pbs_send_job(cmd, sf, qsub_params))
    for p in jobs:
        p.join()
        status_function.append(p.exitcode)
    # collect pbs run status
    for sf in status_files:
        with open(sf) as f:
            status_command.append(f.read().strip())
    status = {'function': status_function, 'command': status_command}
    return status


def pbs_send_job(cmd, status_file, qsub_params):
    '''send a job to the PBS cluster; requires a status file'''
    p = multiprocessing.Process(target=pbs_run,
                                args=(cmd, status_file, qsub_params))
    p.start()
    return p


def pbs_run(cmd, status_file, qsub_params):
    '''
    run shell command cmd on the PBS cluster, wait for the job to finish
    and return its status
    '''
    print(status_file)
    error_file = status_file + ".e"
    # test if the status and error files are writable
    try:
        open(status_file, 'w').close()
        open(error_file, 'w').close()
    except IOError:
        print("cannot write to status files, make sure the path exists")
        raise

    if os.path.exists(status_file):
        print("removing old status file")
        os.remove(status_file)

    cmd_full = ("echo '{cmd} && echo \"OK\" > {status_file} || echo \"ERROR\""
                " > {status_file}' | qsub -e {err}"
                " {qsub_params} ").format(cmd=cmd, status_file=status_file,
                                          err=error_file,
                                          qsub_params=qsub_params)
    os.system(cmd_full)
    # poll until the submitted job writes its status file
    while True:
        if os.path.exists(status_file):
            break
        else:
            time.sleep(3)
    with open(status_file) as f:
        status = f.read().strip()
    return status
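
# Illustrative sketch (an addition, not part of the original module) of how
# the PBS helpers above might be invoked. The commands, status-file paths and
# qsub parameters below are hypothetical, and a working `qsub` must be on the
# submit host's PATH for this to do anything useful.
def _example_pbs_usage():
    cmds = ["./analyze_chunk.sh chunk1.fasta",
            "./analyze_chunk.sh chunk2.fasta"]
    status_files = ["/tmp/chunk1.status", "/tmp/chunk2.status"]
    status = run_multiple_pbs_jobs(
        cmds, status_files,
        qsub_params="-l walltime=10:00:00,nodes=1:ppn=1")
    # status['function'] holds the exit codes of the submitting processes;
    # status['command'] holds "OK" or "ERROR" read back from each status file
    print(status)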
def spawn(f):
    def fun(pipe, x):
        pipe.send(f(x))
        pipe.close()
    return fun


def get_max_proc():
    '''Number of CPUs to use: taken from config.py if available,
    else from the global variable PROC, else from the environment
    variable PROC, else set to the system maximum'''
    try:
        from config import PROC as max_proc
    except ImportError:
        if "PROC" in globals():
            max_proc = PROC
        elif "PROC" in os.environ:
            max_proc = int(os.environ["PROC"])
        else:
            max_proc = multiprocessing.cpu_count()
    return max_proc


def parmap2(f, X, groups, ppn):
    max_proc = get_max_proc()
    print("running in parallel using ", max_proc, "cpu(s)")
    process_pool = []
    output = [None] * len(X)
    # prepare processes
    for x, index in zip(X, list(range(len(X)))):
        # status:
        # 0: waiting, 1: running, 2: collected
        process_pool.append({
            'status': 0,
            'proc': None,
            'pipe': None,
            'index': index,
            'group': groups[index],
            'ppn': ppn[index]
        })

    # run processes
    running = 0
    finished = 0
    sleep_time = 0.001
    while True:
        # count alive processes
        if not sleep_time:
            sleep_time = 0.001
        for i in process_pool:
            if i['status'] == 1 and not (i['proc'].exitcode is None):
                sleep_time = 0.0
                # was running, now finished --> collect
                i['status'] = 2
                running -= 1
                finished += 1
                output[i['index']] = collect(i['proc'], i['pipe'])
                del i['pipe']
                del i['proc']
            if i['status'] == 0 and running < max_proc:
                # waiting and free --> run
                # check if this group can be run
                running_groups = [pp['group'] for pp in process_pool
                                  if pp['status'] == 1]
                # check the maximum load of concurrent runs:
                current_load = sum([pp['ppn'] for pp in process_pool
                                    if pp['status'] == 1])
                cond1 = (i['ppn'] + current_load) <= 1
                cond2 = i['group'] not in running_groups
                if cond1 and cond2:
                    sleep_time = 0.0
                    try:
                        i['pipe'] = multiprocessing.Pipe()
                    except OSError as e:
                        print('exception occurred:', e)
                        continue
                    i['proc'] = multiprocessing.Process(
                        target=spawn(f),
                        args=(i['pipe'][1], X[i['index']]),
                        name=str(i['index']))
                    i['proc'].start()
                    i['status'] = 1
                    running += 1
        if finished == len(process_pool):
            break
        if sleep_time:
            # sleep only if nothing changed in the last cycle;
            # sleep time gradually increases up to 1 sec
            time.sleep(sleep_time)
            sleep_time = min(2 * sleep_time, 1)
    return output


def print_status(pp):
    states = ['waiting', 'running', 'collected']
    print("___________________________________")
    print("jobid  status  group  ppn  exitcode")
    print("===================================")
    for i in pp:
        print(i['index'], " ", states[i['status']], " ", i['group'], " ",
              i['ppn'], " ", i['proc'].exitcode)


def collect(pf, pp):
    if pf.pid and not pf.exitcode and not pf.is_alive():
        returnvalue = pp[0].recv()
        pf.join()
        pp[0].close()
        pp[1].close()
        return returnvalue
    elif pf.exitcode:
        print("job finished with exit code {}".format(pf.exitcode))
        pf.join()
        pp[0].close()
        pp[1].close()
        return None
    else:
        raise Exception('not collected')


def parmap(f, X):
    max_proc = get_max_proc()
    pipe = []
    proc = []
    returnvalue = {}
    for x, index in zip(X, list(range(len(X)))):
        pipe.append(multiprocessing.Pipe())
        proc.append(multiprocessing.Process(target=spawn(f),
                                            args=(pipe[-1][1], x),
                                            name=str(index)))
        p = proc[-1]
        # wait until the number of alive processes drops below max_proc
        while True:
            running = 0
            for i in proc:
                if i.is_alive():
                    running += 1
            if running < max_proc:
                break
            else:
                time.sleep(0.1)
        p.start()
        # check for finished processes and collect them; iterate backwards
        # so that deleting items does not shift the remaining indices
        for index2 in reversed(range(len(proc))):
            pf, pp = proc[index2], pipe[index2]
            if (pf.pid and not pf.exitcode and not pf.is_alive()
                    and pf.name not in returnvalue):
                pf.join()
                returnvalue[str(pf.name)] = pp[0].recv()
                pp[0].close()
                pp[1].close()
                # processes must be garbage collected to free
                # all file connections
                del proc[index2]
                del pipe[index2]
    # collect the rest:
    [pf.join() for pf in proc]
    for pf, pp in zip(proc, pipe):
        if (pf.pid and not pf.exitcode and not pf.is_alive()
                and pf.name not in returnvalue):
            returnvalue[str(pf.name)] = pp[0].recv()
            pp[0].close()
            pp[1].close()
    # convert to a list in the correct input order
    returnvalue = [returnvalue[str(i)] for i in range(len(X))]
    return returnvalue
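
# Illustrative sketch (an addition): parmap() maps a one-argument function
# over a list with one process per item, throttled to get_max_proc() live
# processes, and returns the results in input order. Assumes a fork-based
# multiprocessing start method (e.g. Linux); under the "spawn" method the
# closure produced by spawn(f) would not be picklable.
def _example_parmap_usage():
    print(parmap(lambda v: v * v, [1, 2, 3, 4]))  # -> [1, 4, 9, 16]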
def parallel2(command, *args, groups=None, ppn=None):
    '''
    same as parallel, but groups are used to identify mutually exclusive
    jobs: jobs with the same group id are never run together.
    the ppn parameter is the 'load' of a job - the sum of the loads of
    concurrently running jobs cannot exceed 1
    '''
    # check args, expand if necessary
    args = list(args)
    N = [len(i) for i in args]  # lengths of lists
    Mx = max(N)
    if len(set(N)) == 1:
        # all good
        pass
    elif set(N) == set([1, Mx]):
        # expand args of length 1
        for i in range(len(args)):
            if len(args[i]) == 1:
                args[i] = args[i] * Mx
    else:
        raise ValueError

    if not groups:
        groups = range(Mx)
    elif len(groups) != Mx:
        print("the length of groups must equal the number of jobs, or be None")
        raise ValueError
    if not ppn:
        ppn = [0] * Mx
    elif len(ppn) != Mx:
        print("the length of ppn must equal the number of jobs, or be None")
        raise ValueError
    elif max(ppn) > 1 or min(ppn) < 0:
        print("ppn values must be in the 0 - 1 range")
        raise ValueError
    # convert arguments to a suitable format - 'transpose'
    argsTuples = list(zip(*args))
    args = [list(i) for i in argsTuples]

    def command_star(args):
        return command(*args)

    x = parmap2(command_star, argsTuples, groups, ppn)
    return x


def parallel(command, *args):
    '''
    Execute command in parallel using multiprocessing.
    command is the function to be executed,
    args is a list of lists of arguments.
    execution is:
    command(args[0][0], args[1][0], args[2][0], args[3][0], ...)
    command(args[0][1], args[1][1], args[2][1], args[3][1], ...)
    command(args[0][2], args[1][2], args[2][2], args[3][2], ...)
    ...
    the outputs of command are returned as a list
    '''
    # check args, expand if necessary
    args = list(args)
    N = [len(i) for i in args]  # lengths of lists
    Mx = max(N)
    if len(set(N)) == 1:
        # all good
        pass
    elif set(N) == set([1, Mx]):
        # expand args of length 1
        for i in range(len(args)):
            if len(args[i]) == 1:
                args[i] = args[i] * Mx
    else:
        raise ValueError
    # convert arguments to a suitable format - 'transpose'
    argsTuples = list(zip(*args))
    args = [list(i) for i in argsTuples]

    def command_star(args):
        return command(*args)

    x = parmap(command_star, argsTuples)
    return x


def worker(*a):
    x = 0
    y = 0
    for i in a:
        if i == 1.1:
            print("raising exception")
            s = 1 / 0  # deliberately raise ZeroDivisionError
        y += i
        for j in range(10):
            x += i
    for j in range(100000):
        x = 1.0 / (float(j) + 1.0)
    return y


# test
if __name__ == "__main__":
    # x = parallel2(worker, [1], [2], [3], [4], [1],
    #               [1, 2, 3, 7, 10, 1.1, 20, 30, 40, 10, 30, 20, 40, 50, 50],
    #               [3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 5, 6, 4, 3, 2])
    x = parallel2(
        worker, [1], [2], [3], [4], [1],
        [1, 2, 3, 7, 10, 1.2, 20, 30, 40, 10, 30, 20, 40, 50, 50],
        [3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 5, 6, 4, 3, 2],
        groups=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        ppn=[0.6, 0.6, 0.2, 0.6, 0.2, 0.2, 0.4, 0.1, 0.1, 0.3, 0.3,
             0.3, 0.1, 0.1, 0.1])
    print(x)
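
# --- Illustrative usage sketch (an addition, not in the original module) ----
# parallel() broadcasts length-1 argument lists to the longest list and
# returns outputs in input order; parallel2() additionally never runs jobs
# sharing a group id at the same time and keeps the summed ppn load <= 1.
# Assumes a fork-based multiprocessing start method (e.g. Linux), since the
# closures passed to the worker processes are not picklable under "spawn".
def _example_usage():
    def add(a, b):
        return a + b
    print(parallel(add, [1, 2, 3], [10]))  # -> [11, 12, 13]
    print(parallel2(add, [1, 2], [10, 20],
                    groups=["g1", "g1"],   # same group: never concurrent
                    ppn=[0.5, 0.5]))       # -> [11, 22]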