|
|
|
@ -64,9 +64,9 @@ class Fleet(object):
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
|
self._opt_info = None # for fleet only
|
|
|
|
|
self.role_maker_ = None
|
|
|
|
|
self.local_ip_ = 0
|
|
|
|
|
self.is_initialized_ = False
|
|
|
|
|
self._role_maker = None
|
|
|
|
|
self._local_ip = 0
|
|
|
|
|
self._is_initialized = False
|
|
|
|
|
|
|
|
|
|
def init(self):
|
|
|
|
|
# TODO(guru4elephant)
|
|
|
|
@ -78,22 +78,22 @@ class Fleet(object):
|
|
|
|
|
current node's role, e.g. worker, server, etc.
|
|
|
|
|
"""
|
|
|
|
|
if not self.is_initialized_:
|
|
|
|
|
self.role_maker_ = MPISymetricRoleMaker()
|
|
|
|
|
self.role_maker_._generate_role()
|
|
|
|
|
self._role_maker = MPISymetricRoleMaker()
|
|
|
|
|
self._role_maker._generate_role()
|
|
|
|
|
self._fleet_ptr = fluid.core.Fleet()
|
|
|
|
|
self.is_initialized_ = True
|
|
|
|
|
self._is_initialized = True
|
|
|
|
|
|
|
|
|
|
def stop(self):
    """
    stop(): will be called after a user finishes his/her training task. Fleet instance will be
    destroyed when stop() is called.
    """
    # Only the first worker asks the servers to stop; everyone else just
    # waits at the barriers so shutdown stays synchronized.
    self._role_maker._barrier_worker()
    if self._role_maker._is_first_worker():
        self._fleet_ptr.stop_server()
    self._role_maker._barrier_worker()
    self._role_maker._barrier_all()
    self._role_maker._finalize()
|
|
|
|
|
|
|
|
|
|
def init_pserver(self):
|
|
|
|
|
"""
|
|
|
|
@ -110,15 +110,15 @@ class Fleet(object):
|
|
|
|
|
sys.exit(-1)
|
|
|
|
|
self._fleet_ptr.init_server(self._dist_desc_str,
|
|
|
|
|
self.role_maker_._get_rank())
|
|
|
|
|
self.local_ip_ = self._fleet_ptr.run_server()
|
|
|
|
|
self._local_ip = self._fleet_ptr.run_server()
|
|
|
|
|
# barrier_all for init_server
|
|
|
|
|
self.role_maker_._barrier_all()
|
|
|
|
|
self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
|
|
|
|
|
self._role_maker._barrier_all()
|
|
|
|
|
self._all_ips = self._role_maker._all_gather(self.local_ip_)
|
|
|
|
|
|
|
|
|
|
self._fleet_ptr.gather_servers(self.all_ips_,
|
|
|
|
|
self.role_maker_._get_size())
|
|
|
|
|
self._fleet_ptr.gather_servers(self._all_ips,
|
|
|
|
|
self._role_maker._get_size())
|
|
|
|
|
# barrier_all for init_worker, wait all workers start
|
|
|
|
|
self.role_maker_._barrier_all()
|
|
|
|
|
self._role_maker._barrier_all()
|
|
|
|
|
else:
|
|
|
|
|
print("You should run DistributedOptimizer.minimize() first")
|
|
|
|
|
sys.exit(-1)
|
|
|
|
@ -151,21 +151,21 @@ class Fleet(object):
|
|
|
|
|
print("You should run DistributedOptimizer.minimize() first")
|
|
|
|
|
sys.exit(-1)
|
|
|
|
|
# barrier_all for init_server, wait for server starts
|
|
|
|
|
self.role_maker_._barrier_all()
|
|
|
|
|
self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
|
|
|
|
|
self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
|
|
|
|
|
self.role_maker_._get_size(),
|
|
|
|
|
self.role_maker_._get_rank())
|
|
|
|
|
self._role_maker._barrier_all()
|
|
|
|
|
self._all_ips = self._role_maker._all_gather(self.local_ip_)
|
|
|
|
|
self._fleet_ptr.init_worker(self._dist_desc_str, self._all_ips,
|
|
|
|
|
self._role_maker._get_size(),
|
|
|
|
|
self._role_maker._get_rank())
|
|
|
|
|
# barrier_all for init_worker
|
|
|
|
|
self.role_maker_._barrier_all()
|
|
|
|
|
self._role_maker._barrier_all()
|
|
|
|
|
# prepare for client to client communication
|
|
|
|
|
info = self._fleet_ptr.get_clients_info()
|
|
|
|
|
all_info = self.role_maker_._worker_gather(info[0])
|
|
|
|
|
all_info = self._role_maker._worker_gather(info[0])
|
|
|
|
|
self._fleet_ptr.gather_clients(all_info)
|
|
|
|
|
self._fleet_ptr.create_client2client_connection()
|
|
|
|
|
# barrier for init model
|
|
|
|
|
self.role_maker_._barrier_worker()
|
|
|
|
|
if self.role_maker_._is_first_worker():
|
|
|
|
|
self._role_maker._barrier_worker()
|
|
|
|
|
if self._role_maker._is_first_worker():
|
|
|
|
|
tables = self._dist_desc.trainer_param.dense_table
|
|
|
|
|
for prog, scope in zip(programs, scopes):
|
|
|
|
|
prog_id = str(id(prog))
|
|
|
|
@ -192,7 +192,7 @@ class Fleet(object):
|
|
|
|
|
int(table.table_id),
|
|
|
|
|
var_name_list)
|
|
|
|
|
# barrier for init model done
|
|
|
|
|
self.role_maker_._barrier_worker()
|
|
|
|
|
self._role_maker._barrier_worker()
|
|
|
|
|
else:
|
|
|
|
|
print("You should run DistributedOptimizer.minimize() first")
|
|
|
|
|
sys.exit(-1)
|
|
|
|
@ -201,39 +201,39 @@ class Fleet(object):
|
|
|
|
|
"""
|
|
|
|
|
return the number of current job's worker num
|
|
|
|
|
"""
|
|
|
|
|
return self.role_maker_._worker_num()
|
|
|
|
|
return self._role_maker._worker_num()
|
|
|
|
|
|
|
|
|
|
def get_server_num(self):
    """
    return the number of current job's server num
    """
    return self._role_maker._server_num()
|
|
|
|
|
|
|
|
|
|
def get_worker_index(self):
    """
    return the mpi rank of current worker
    """
    return self._role_maker._worker_index()
|
|
|
|
|
|
|
|
|
|
def is_worker(self):
    """
    return whether current node is a worker
    """
    return self._role_maker._is_worker()
|
|
|
|
|
|
|
|
|
|
def is_server(self):
    """
    return whether current node is pserver
    """
    return self._role_maker._is_server()
|
|
|
|
|
|
|
|
|
|
def init_pserver_model(self):
    """
    init pserver model called from pserver
    """
    # Only the first worker initializes the model; the barrier keeps the
    # other workers from proceeding until initialization is done.
    if self._role_maker._is_first_worker():
        self._fleet_ptr.init_model()
    self._role_maker._barrier_worker()
|
|
|
|
|
|
|
|
|
|
def save_pserver_model(self, save_path):
|
|
|
|
|
"""
|
|
|
|
|