|
|
|
@ -171,6 +171,7 @@ class Gloo(object):
|
|
|
|
|
|
|
|
|
|
def _init_http(self, ip, port, prefix, start_http_server, http_server_d):
|
|
|
|
|
def __start_kv_server(http_server_d, size_d):
|
|
|
|
|
print("start http_server: {}, {}".format(port, size_d))
|
|
|
|
|
from paddle.distributed.fleet.utils.http_server import KVServer
|
|
|
|
|
http_server = KVServer(port, size_d)
|
|
|
|
|
http_server.start()
|
|
|
|
@ -181,11 +182,9 @@ class Gloo(object):
|
|
|
|
|
http_server.stop()
|
|
|
|
|
|
|
|
|
|
def init_kv_server(http_server_d):
|
|
|
|
|
size_d = {
|
|
|
|
|
"trainer": self._worker_num,
|
|
|
|
|
"pserver": self._server_num,
|
|
|
|
|
"all": self._worker_num + self._server_num
|
|
|
|
|
}
|
|
|
|
|
worker_key = prefix + '_' + 'worker'
|
|
|
|
|
size_d = {worker_key: self._worker_num, }
|
|
|
|
|
print("worker_key:{}, size: {}".format(worker_key, size_d))
|
|
|
|
|
|
|
|
|
|
http_server_d["running"] = True
|
|
|
|
|
# child process for http server
|
|
|
|
@ -205,7 +204,7 @@ class Gloo(object):
|
|
|
|
|
gloo.set_iface(self._iface)
|
|
|
|
|
gloo.set_timeout_seconds(self._init_timeout_seconds,
|
|
|
|
|
self._run_timeout_seconds)
|
|
|
|
|
gloo.set_http_store(ip, port, role)
|
|
|
|
|
gloo.set_http_store(ip, port, 'worker')
|
|
|
|
|
ep = ":".join([ip, str(port)])
|
|
|
|
|
wait_server_ready([ep])
|
|
|
|
|
gloo.init()
|
|
|
|
@ -214,6 +213,7 @@ class Gloo(object):
|
|
|
|
|
port = int(port)
|
|
|
|
|
|
|
|
|
|
if start_http_server:
|
|
|
|
|
print("to start http_server")
|
|
|
|
|
http_server = init_kv_server(http_server_d)
|
|
|
|
|
|
|
|
|
|
if self._role == Role.WORKER:
|
|
|
|
|