|
|
|
@ -78,14 +78,14 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
|
|
|
|
|
# should fix the variable
|
|
|
|
|
def _setup_nccl_op(self, startup_program, main_program):
|
|
|
|
|
trainer_endpoints = self.role_maker.get_trainer_endpoints()
|
|
|
|
|
trainers = trainer_endpoints
|
|
|
|
|
trainer_id = self.role_maker.worker_index()
|
|
|
|
|
current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id]
|
|
|
|
|
trainer_endpoints_env = ",".join(trainer_endpoints)
|
|
|
|
|
trainers_num = self.role_maker.worker_num()
|
|
|
|
|
trainer_endpoints.remove(current_endpoint)
|
|
|
|
|
if trainer_id == 0:
|
|
|
|
|
wait_server_ready(trainer_endpoints)
|
|
|
|
|
other_trainer_endpoints = trainer_endpoints[:]
|
|
|
|
|
other_trainer_endpoints.remove(current_endpoint)
|
|
|
|
|
wait_server_ready(other_trainer_endpoints)
|
|
|
|
|
nccl_id_var = startup_program.global_block().create_var(
|
|
|
|
|
name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
|
|
|
|
|
for i in range(1, self.user_defined_strategy.nccl_comm_num):
|
|
|
|
@ -110,7 +110,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
|
|
|
|
|
inputs={},
|
|
|
|
|
outputs={"NCCLID": nccl_id_var},
|
|
|
|
|
attrs={
|
|
|
|
|
"trainers": trainers,
|
|
|
|
|
"trainers": trainer_endpoints,
|
|
|
|
|
"trainer_id": trainer_id,
|
|
|
|
|
"nccl_comm_num": self.user_defined_strategy.nccl_comm_num,
|
|
|
|
|
"use_hierarchical_allreduce":
|
|
|
|
|