|
|
|
@ -1039,11 +1039,17 @@ class HeterRoleMaker(GeneralRoleMaker):
|
|
|
|
|
self._node_type = 1
|
|
|
|
|
self._cur_endpoint = worker_endpoints[current_id]
|
|
|
|
|
gloo = fluid.core.Gloo()
|
|
|
|
|
gloo.init(current_id,
|
|
|
|
|
len(worker_endpoints),
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/trainer",
|
|
|
|
|
self._hdfs_name, self._hdfs_ugi, self._iface,
|
|
|
|
|
self._prefix)
|
|
|
|
|
|
|
|
|
|
gloo.set_rank(current_id)
|
|
|
|
|
gloo.set_size(len(worker_endpoints))
|
|
|
|
|
gloo.set_prefix(self._prefix)
|
|
|
|
|
gloo.set_iface(self._iface)
|
|
|
|
|
gloo.set_timeout_seconds(self._init_timeout_seconds,
|
|
|
|
|
self._run_timeout_seconds)
|
|
|
|
|
gloo.set_hdfs_store(
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/trainer", self._hdfs_name,
|
|
|
|
|
self._hdfs_ugi)
|
|
|
|
|
gloo.init()
|
|
|
|
|
self._node_type_comm = gloo
|
|
|
|
|
elif training_role == "XPU":
|
|
|
|
|
role = Role.XPU
|
|
|
|
@ -1051,10 +1057,17 @@ class HeterRoleMaker(GeneralRoleMaker):
|
|
|
|
|
self._node_type = 2
|
|
|
|
|
self._cur_endpoint = xpu_endpoints[current_id]
|
|
|
|
|
gloo = fluid.core.Gloo()
|
|
|
|
|
gloo.init(current_id,
|
|
|
|
|
len(xpu_endpoints),
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/xpu", self._hdfs_name,
|
|
|
|
|
self._hdfs_ugi, self._iface, self._prefix)
|
|
|
|
|
|
|
|
|
|
gloo.set_rank(current_id)
|
|
|
|
|
gloo.set_size(len(xpu_endpoints))
|
|
|
|
|
gloo.set_prefix(self._prefix)
|
|
|
|
|
gloo.set_iface(self._iface)
|
|
|
|
|
gloo.set_timeout_seconds(self._init_timeout_seconds,
|
|
|
|
|
self._run_timeout_seconds)
|
|
|
|
|
gloo.set_hdfs_store(
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/xpu", self._hdfs_name,
|
|
|
|
|
self._hdfs_ugi)
|
|
|
|
|
gloo.init()
|
|
|
|
|
self._node_type_comm = gloo
|
|
|
|
|
elif training_role == "PSERVER":
|
|
|
|
|
role = Role.SERVER
|
|
|
|
@ -1070,30 +1083,47 @@ class HeterRoleMaker(GeneralRoleMaker):
|
|
|
|
|
self._node_type = 0
|
|
|
|
|
self._cur_endpoint = cur_endpoint
|
|
|
|
|
gloo = fluid.core.Gloo()
|
|
|
|
|
gloo.init(current_id,
|
|
|
|
|
len(eplist),
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/pserver",
|
|
|
|
|
self._hdfs_name, self._hdfs_ugi, self._iface,
|
|
|
|
|
self._prefix)
|
|
|
|
|
gloo.set_rank(current_id)
|
|
|
|
|
gloo.set_size(len(eplist))
|
|
|
|
|
gloo.set_prefix(self._prefix)
|
|
|
|
|
gloo.set_iface(self._iface)
|
|
|
|
|
gloo.set_timeout_seconds(self._init_timeout_seconds,
|
|
|
|
|
self._run_timeout_seconds)
|
|
|
|
|
gloo.set_hdfs_store(
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/pserver", self._hdfs_name,
|
|
|
|
|
self._hdfs_ugi)
|
|
|
|
|
gloo.init()
|
|
|
|
|
self._node_type_comm = gloo
|
|
|
|
|
|
|
|
|
|
if training_role == "TRAINER" or training_role == "XPU":
|
|
|
|
|
gloo = fluid.core.Gloo()
|
|
|
|
|
heter_list = worker_endpoints + xpu_endpoints
|
|
|
|
|
gloo.init(
|
|
|
|
|
heter_list.index(self._cur_endpoint),
|
|
|
|
|
len(heter_list),
|
|
|
|
|
|
|
|
|
|
gloo.set_rank(heter_list.index(self._cur_endpoint))
|
|
|
|
|
gloo.set_size(len(heter_list))
|
|
|
|
|
gloo.set_prefix(self._prefix)
|
|
|
|
|
gloo.set_iface(self._iface)
|
|
|
|
|
gloo.set_timeout_seconds(self._init_timeout_seconds,
|
|
|
|
|
self._run_timeout_seconds)
|
|
|
|
|
gloo.set_hdfs_store(
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/heter", self._hdfs_name,
|
|
|
|
|
self._hdfs_ugi, self._iface, self._prefix)
|
|
|
|
|
self._hdfs_ugi)
|
|
|
|
|
gloo.init()
|
|
|
|
|
self._heter_comm = gloo
|
|
|
|
|
|
|
|
|
|
gloo = fluid.core.Gloo()
|
|
|
|
|
all_list = worker_endpoints + eplist + xpu_endpoints
|
|
|
|
|
gloo.init(
|
|
|
|
|
all_list.index(self._cur_endpoint),
|
|
|
|
|
len(all_list),
|
|
|
|
|
|
|
|
|
|
gloo.set_rank(all_list.index(self._cur_endpoint))
|
|
|
|
|
gloo.set_size(len(all_list))
|
|
|
|
|
gloo.set_prefix(self._prefix)
|
|
|
|
|
gloo.set_iface(self._iface)
|
|
|
|
|
gloo.set_timeout_seconds(self._init_timeout_seconds,
|
|
|
|
|
self._run_timeout_seconds)
|
|
|
|
|
gloo.set_hdfs_store(
|
|
|
|
|
self._hdfs_path.rstrip("/") + "/all", self._hdfs_name,
|
|
|
|
|
self._hdfs_ugi, self._iface, self._prefix)
|
|
|
|
|
self._hdfs_ugi)
|
|
|
|
|
gloo.init()
|
|
|
|
|
|
|
|
|
|
self._all_comm = gloo
|
|
|
|
|
self._trainers_num = trainers_num
|
|
|
|
|