|
|
|
@ -252,15 +252,12 @@ def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
|
|
|
|
|
def terminate_local_procs(procs):
|
|
|
|
|
for p in procs:
|
|
|
|
|
if p.proc.poll() is None:
|
|
|
|
|
# subprocess need to release resource(e.g. shared memory)
|
|
|
|
|
# use join to wait subprocess releasing
|
|
|
|
|
p.proc.join(timeout=1)
|
|
|
|
|
p.proc.terminate()
|
|
|
|
|
p.log_fn.close()
|
|
|
|
|
logger.debug("terminate process id:{}".format(p.proc.pid))
|
|
|
|
|
|
|
|
|
|
# wait all process terminiated
|
|
|
|
|
# time.sleep(3)
|
|
|
|
|
|
|
|
|
|
#wait all process terminiated
|
|
|
|
|
time.sleep(3)
|
|
|
|
|
for step in range(0, 50):
|
|
|
|
|
alive = False
|
|
|
|
|
for p in procs:
|
|
|
|
|