|
|
@ -323,10 +323,12 @@ def launch_ps(args):
|
|
|
|
for idx, cur_server in enumerate(pod.servers):
|
|
|
|
for idx, cur_server in enumerate(pod.servers):
|
|
|
|
proc_env = {
|
|
|
|
proc_env = {
|
|
|
|
"PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
|
|
|
|
"PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
|
|
|
|
|
|
|
|
"PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
|
|
|
|
"PADDLE_PORT": cur_server.endpoint.split(":")[1],
|
|
|
|
"PADDLE_PORT": cur_server.endpoint.split(":")[1],
|
|
|
|
"TRAINING_ROLE": "PSERVER",
|
|
|
|
"TRAINING_ROLE": "PSERVER",
|
|
|
|
"PADDLE_TRAINERS_NUM": str(worker_num),
|
|
|
|
"PADDLE_TRAINERS_NUM": str(worker_num),
|
|
|
|
"POD_IP": cur_server.endpoint.split(":")[0]
|
|
|
|
"POD_IP": cur_server.endpoint.split(":")[0],
|
|
|
|
|
|
|
|
"PADDLE_WITH_GLOO": "1"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
current_env.update(proc_env)
|
|
|
|
current_env.update(proc_env)
|
|
|
|
|
|
|
|
|
|
|
@ -365,7 +367,8 @@ def launch_ps(args):
|
|
|
|
"PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
|
|
|
|
"PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
|
|
|
|
"PADDLE_TRAINERS_NUM": str(worker_num),
|
|
|
|
"PADDLE_TRAINERS_NUM": str(worker_num),
|
|
|
|
"TRAINING_ROLE": "TRAINER",
|
|
|
|
"TRAINING_ROLE": "TRAINER",
|
|
|
|
"PADDLE_TRAINER_ID": str(cur_worker.rank)
|
|
|
|
"PADDLE_TRAINER_ID": str(cur_worker.rank),
|
|
|
|
|
|
|
|
"PADDLE_WITH_GLOO": "1"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
current_env.update(proc_env)
|
|
|
|
current_env.update(proc_env)
|
|
|
|
|
|
|
|
|
|
|
@ -430,7 +433,11 @@ def launch():
|
|
|
|
co_arg for co_arg in collective_args
|
|
|
|
co_arg for co_arg in collective_args
|
|
|
|
if co_arg in " ".join(sys.argv[1:-1])
|
|
|
|
if co_arg in " ".join(sys.argv[1:-1])
|
|
|
|
]
|
|
|
|
]
|
|
|
|
|
|
|
|
if fluid.core.is_compiled_with_cuda():
|
|
|
|
cuda_device_num = fluid.core.get_cuda_device_count()
|
|
|
|
cuda_device_num = fluid.core.get_cuda_device_count()
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
cuda_device_num = 0
|
|
|
|
|
|
|
|
|
|
|
|
if len(has_ps_args) > 0 or cuda_device_num == 0:
|
|
|
|
if len(has_ps_args) > 0 or cuda_device_num == 0:
|
|
|
|
logger.info(
|
|
|
|
logger.info(
|
|
|
|
"Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
|
|
|
|
"Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
|
|
|
|