add endpoints log;test=develop (#27439)

revert-27520-disable_pr
danleifeng 4 years ago committed by GitHub
parent 9f3a9be76a
commit 905e2346ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -463,9 +463,8 @@ def launch():
cuda_device_num = 0
if len(has_ps_args) > 0 or cuda_device_num == 0:
logger.info(
"Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
format(has_ps_args, cuda_device_num))
logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format(
has_ps_args))
launch_ps(args)
elif len(has_collective_args) > 0:
logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".

@@ -435,9 +435,17 @@ def start_local_trainers(cluster,
len(pod.trainers),
pretty_print_envs(proc_env, ("Distributed Envs",
"Value"))))
logger.info(
"details abouts PADDLE_TRAINER_ENDPOINTS can be found in {}/endpoints.log.".
format(log_dir))
fn = None
if log_dir is not None:
os.system("mkdir -p {}".format(log_dir))
if os.path.exists("%s/endpoints.log" % log_dir):
os.system("rm -f {}/endpoints.log".format(log_dir))
with open("%s/endpoints.log" % log_dir, "w") as f:
f.write("PADDLE_TRAINER_ENDPOINTS: \n")
f.write("\n".join(cluster.trainers_endpoints()))
fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
else:

Loading…
Cancel
Save