【paddle.fleet】Fix/role maker api fix (#27326)

* fix fleet util and gloo

* fix worker endpoints

* fix

* fix UT

* fix gloo

* fix gloo

* update gloo

* update gloo

* update gloo

* update gloo

* update gloo

* fix gloo wrapper for hdfs

* add file gloo and UT

* fix UT

* fix UT

* fix UT

* hide public method of RoleMaker

* fix UT

* GPU fleetrun support gloo

* parameterserver fleetrun support gloo

* add UT

* add UT

* fix UT

* fix get server endpoint

* fix get server endpoint

* fix UT

* hide public method of rolemaker

* hide public method of rolemaker

* hide public method of rolemaker

* Update test_fleet_rolemaker_new.py

* hide public method of rolemaker

* hide public method of rolemaker
Author: tangwei12, committed by GitHub
Commit: d6b54de467 (parent 99626502f7)
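
The pattern below repeats across every touched file: RoleMaker's query methods become underscore-prefixed internals (`_worker_index`, `_worker_num`, `_get_trainer_endpoints`, ...), while `Fleet` keeps the stable public surface and merely delegates. A minimal sketch of that split, using stand-in classes rather than the real implementations (the real role maker derives its answers from cluster environment variables):

    # Sketch of the public-Fleet / private-RoleMaker split this commit
    # applies; class names here are illustrative stand-ins.
    class _SketchRoleMaker(object):
        def __init__(self, trainer_endpoints):
            self._endpoints = trainer_endpoints

        # Internal API: underscore-prefixed, hidden from end users.
        def _worker_num(self):
            return len(self._endpoints)

        def _get_trainer_endpoints(self):
            return self._endpoints

    class _SketchFleet(object):
        def __init__(self, role_maker):
            self._role_maker = role_maker

        # Public API: thin delegation, mirroring fleet_base.py below.
        def worker_num(self):
            return self._role_maker._worker_num()

        def worker_endpoints(self, to_string=False):
            eps = self._role_maker._get_trainer_endpoints()
            return ",".join(eps) if to_string else eps

    fleet_sketch = _SketchFleet(_SketchRoleMaker(["127.0.0.1:36000"]))
    assert fleet_sketch.worker_endpoints(to_string=True) == "127.0.0.1:36000"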

@@ -180,7 +180,7 @@ class Fleet(object):
             raise ValueError(
                 "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
                 format(type(role_maker)))
-        self._role_maker.generate_role()
+        self._role_maker._generate_role()
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
@@ -207,7 +207,7 @@ class Fleet(object):
             fleet.is_first_worker()
         """
-        return self._role_maker.is_first_worker()
+        return self._role_maker._is_first_worker()

     def worker_index(self):
         """
@@ -224,7 +224,7 @@ class Fleet(object):
             fleet.worker_index()
         """
-        return self._role_maker.worker_index()
+        return self._role_maker._worker_index()

     def worker_num(self):
         """
@@ -241,7 +241,7 @@ class Fleet(object):
             fleet.worker_num()
         """
-        return self._role_maker.worker_num()
+        return self._role_maker._worker_num()

     def is_worker(self):
         """
@@ -259,7 +259,7 @@ class Fleet(object):
             fleet.is_worker()
         """
-        return self._role_maker.is_worker()
+        return self._role_maker._is_worker()

     def worker_endpoints(self, to_string=False):
         """
@@ -277,9 +277,9 @@ class Fleet(object):
         """
         if to_string:
-            return ",".join(self._role_maker.get_trainer_endpoints())
+            return ",".join(self._role_maker._get_trainer_endpoints())
         else:
-            return self._role_maker.get_trainer_endpoints()
+            return self._role_maker._get_trainer_endpoints()

     def server_num(self):
         """
@@ -294,7 +294,7 @@ class Fleet(object):
             fleet.init()
             fleet.server_num()
         """
-        return len(self._role_maker.get_pserver_endpoints())
+        return len(self._role_maker._get_pserver_endpoints())

     def server_index(self):
         """
@@ -311,7 +311,7 @@ class Fleet(object):
             fleet.server_index()
         """
-        return self._role_maker.server_index()
+        return self._role_maker._server_index()

     def server_endpoints(self, to_string=False):
         """
@@ -330,9 +330,9 @@ class Fleet(object):
         """
         if to_string:
-            return ",".join(self._role_maker.get_pserver_endpoints())
+            return ",".join(self._role_maker._get_pserver_endpoints())
         else:
-            return self._role_maker.get_pserver_endpoints()
+            return self._role_maker._get_pserver_endpoints()

     def is_server(self):
         """
@@ -350,7 +350,7 @@ class Fleet(object):
             fleet.is_server()
         """
-        return self._role_maker.is_server(
+        return self._role_maker._is_server(
         ) or self._role_maker._is_heter_worker()

     def set_util(self, util):
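
From a user script's point of view nothing changes: the public `Fleet` accessors above keep their names and signatures. A usage sketch consistent with the tests at the end of this diff (endpoint values are illustrative):

    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    # Public accessors are untouched; only the RoleMaker internals moved.
    print(fleet.worker_num())
    print(fleet.worker_endpoints(to_string=True))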

[File diff suppressed because it is too large]

@@ -237,8 +237,8 @@ class UtilBase(object):
         if not isinstance(files, list):
             raise TypeError("files should be a list of file need to be read.")
-        trainer_id = self.role_maker.worker_index()
-        trainers = self.role_maker.worker_num()
+        trainer_id = self.role_maker._worker_index()
+        trainers = self.role_maker._worker_num()
         remainder = len(files) % trainers
         blocksize = int(len(files) / trainers)
@@ -280,7 +280,7 @@ class UtilBase(object):
             fleet_util._set_role_maker(role)
             fleet_util.print_on_rank("I'm worker 0", 0)
         """
-        if self.role_maker.worker_index() != rank_id:
+        if self.role_maker._worker_index() != rank_id:
             return
         print(message)
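
The first hunk above sits in `UtilBase`'s file-splitting helper, which deals `len(files)` inputs across `trainers` workers: every worker gets `blocksize` files and the first `remainder` workers get one extra. A self-contained sketch of that arithmetic (interface simplified; not the exact library code):

    def split_files_sketch(files, trainer_id, trainers):
        # Same arithmetic as the hunk above: even block size plus
        # one extra file for the first `remainder` trainers.
        remainder = len(files) % trainers
        blocksize = len(files) // trainers
        begin = trainer_id * blocksize + min(trainer_id, remainder)
        end = begin + blocksize + (1 if trainer_id < remainder else 0)
        return files[begin:end]

    # e.g. 5 files over 2 trainers -> trainer 0 gets 3, trainer 1 gets 2.
    assert split_files_sketch(list("abcde"), 0, 2) == ["a", "b", "c"]
    assert split_files_sketch(list("abcde"), 1, 2) == ["d", "e"]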

@@ -57,12 +57,12 @@ class CollectiveHelper(object):
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()
-        endpoints = self.role_maker.get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker.worker_index()]
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
         for ring_id in range(self.nrings):
             self._init_communicator(
                 self.startup_program, current_endpoint, endpoints,
-                self.role_maker.worker_index(), ring_id, self.wait_port)
+                self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()

     def _init_communicator(self, program, current_endpoint, endpoints, rank,

@@ -47,7 +47,7 @@ class DGCOptimizer(MetaOptimizerBase):
                 sparsity=configs['sparsity'],
                 parameter_list=opt._parameter_list,
                 use_nesterov=opt._use_nesterov,
-                num_trainers=self.role_maker.worker_num(),
+                num_trainers=self.role_maker._worker_num(),
                 regularization=opt.regularization,
                 grad_clip=opt._grad_clip,
                 name=opt._name)
@@ -60,7 +60,7 @@ class DGCOptimizer(MetaOptimizerBase):
         if not isinstance(self.inner_opt, Momentum):
             logging.warn("dgc only works on Momentum optimizer")
             return False
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             logging.warn("dgc only works on multi cards")
             return False

@@ -50,12 +50,12 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
     # should fix the variable
     def _setup_nccl_op(self, startup_program, main_program, build_strategy):
-        trainer_endpoints = self.role_maker.get_trainer_endpoints()
+        trainer_endpoints = self.role_maker._get_trainer_endpoints()
         trainers = trainer_endpoints
-        trainer_id = self.role_maker.worker_index()
-        current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id]
+        trainer_id = self.role_maker._worker_index()
+        current_endpoint = self.role_maker._get_trainer_endpoints()[trainer_id]
         trainer_endpoints_env = ",".join(trainer_endpoints)
-        trainers_num = self.role_maker.worker_num()
+        trainers_num = self.role_maker._worker_num()
         nccl_id_var = startup_program.global_block().create_var(
             name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
         for i in range(1, build_strategy.nccl_comm_num):
@@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         local_build_strategy.enable_sequential_execution = True
         exe_strategy = self.user_defined_strategy.execution_strategy
-        worker_num = self.role_maker.worker_num()
-        node_num = self.role_maker.node_num()
+        worker_num = self.role_maker._worker_num()
+        node_num = self.role_maker._node_num()
         if self.role_maker._is_collective:
             assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num
@@ -170,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         # TODO(guru4elephant): should be an independent optimizer
         self._setup_nccl_op(startup_program, main_program, local_build_strategy)
-        local_build_strategy.num_trainers = self.role_maker.worker_num()
-        local_build_strategy.trainer_id = self.role_maker.worker_index()
-        local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints(
+        local_build_strategy.num_trainers = self.role_maker._worker_num()
+        local_build_strategy.trainer_id = self.role_maker._worker_index()
+        local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints(
         )
         local_build_strategy.enable_backward_optimizer_op_deps = True

@@ -38,7 +38,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         if not self.user_defined_strategy.localsgd:
             return False
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             return False

         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
@@ -168,7 +168,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                     inputs={'X': [param]},
                     outputs={'Out': [param]},
                     attrs={
-                        'scale': 1.0 / self.role_maker.worker_num(),
+                        'scale': 1.0 / self.role_maker._worker_num(),
                         OP_ROLE_KEY: OpRole.Optimize
                     })
                 sub_block.append_op(
@@ -208,7 +208,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
         if not self.user_defined_strategy.adaptive_localsgd:
             return False
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             return False

         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
@@ -275,7 +275,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
                 inputs={'X': [avg_loss]},
                 outputs={'Out': [avg_loss]},
                 attrs={
-                    'scale': 1.0 / self.role_maker.worker_num(),
+                    'scale': 1.0 / self.role_maker._worker_num(),
                     OP_ROLE_KEY: OpRole.Optimize
                 })
@@ -398,7 +398,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
                     inputs={'X': [param]},
                     outputs={'Out': [param]},
                     attrs={
-                        'scale': 1.0 / self.role_maker.worker_num(),
+                        'scale': 1.0 / self.role_maker._worker_num(),
                         OP_ROLE_KEY: OpRole.Optimize
                     })
                 sub_block.append_op(
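
Every `scale` hunk above encodes the same local SGD step: an all-reduce first sums a tensor across workers, and multiplying by `1.0 / worker_num` turns that sum into an average. A toy sketch of the arithmetic (plain Python, not the Paddle op):

    # Toy model-averaging step: allreduce-sum then scale by 1/worker_num,
    # which is what the scale ops in the hunks above encode.
    def average_after_allreduce(summed_param, worker_num):
        return [v * (1.0 / worker_num) for v in summed_param]

    # Two workers holding [1, 2] and [3, 4] allreduce-sum to [4, 6];
    # scaling by 1/2 yields the element-wise average [2.0, 3.0].
    assert average_after_allreduce([4, 6], 2) == [2.0, 3.0]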

@@ -31,7 +31,7 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
         if k_steps < 0:
             return False

-        if self.role_maker.is_server():
+        if self.role_maker._is_server():
             return False

         if self.role_maker._is_heter_parameter_server_mode:

@@ -239,10 +239,10 @@ class ParameterServerOptimizer(MetaOptimizerBase):
             strategy, self.role_maker)
         compiled_config.strategy = strategy

-        if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
+        if self.role_maker._is_worker() or self.role_maker._is_heter_worker():
             main_program, startup_program = self._build_trainer_programs(
                 compiled_config)
-        elif self.role_maker.is_server():
+        elif self.role_maker._is_server():
             main_program, startup_program = self._build_pserver_programs(
                 compiled_config)
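
The branch above is the role dispatch at the heart of parameter-server compilation: workers (including heter workers) get trainer programs, servers get pserver programs. A compact sketch of the same dispatch (the callables stand in for the `_build_*` methods):

    def build_programs_sketch(role_maker, build_trainer, build_pserver):
        # Mirrors the branch in ParameterServerOptimizer above.
        if role_maker._is_worker() or role_maker._is_heter_worker():
            return build_trainer()
        elif role_maker._is_server():
            return build_pserver()
        raise ValueError("role must be worker, heter worker, or server")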

@@ -126,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase):
         optimize_ops, params_grads, prog_list = \
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
-        if self.role_maker.worker_num() == 1:
+        if self.role_maker._worker_num() == 1:
             return optimize_ops, params_grads

-        endpoints = self.role_maker.get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker.worker_index()]
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
         self.startup_program = startup_program
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()
@@ -142,7 +142,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         self.nranks = nranks
         self.nrings = len(self.main_program_list)

-        self.rank = self.role_maker.worker_index()
+        self.rank = self.role_maker._worker_index()
         self.endpoints = endpoints
         self.current_endpoint = current_endpoint

@@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase):
     def _init_worker(self):
         def sync_strategy_envs():
             kwargs = {}
-            kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints(
-            )
-            kwargs["trainer_id"] = self.role_maker.worker_index()
+            kwargs[
+                "pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
+            kwargs["trainer_id"] = self.role_maker._worker_index()
             return kwargs

         def geo_strategy_envs():
@@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase):
                 return "#".join(init_attrs)

             kwargs = {}
-            kwargs["trainers"] = self.role_maker.worker_num()
+            kwargs["trainers"] = self.role_maker._worker_num()
             kwargs["sparse_attrs"] = get_sparse_attrs()
             return kwargs
@@ -338,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase):
             block.append_op(
                 type='recv_save',
                 attrs={
-                    "trainer_id": self.role_maker.worker_index(),
+                    "trainer_id": self.role_maker._worker_index(),
                     "shape": var.shape,
                     "slice_shapes":
                     [",".join([str(i) for i in var.shape])],
@@ -378,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase):
             block.append_op(
                 type='recv_save',
                 attrs={
-                    "trainer_id": self.role_maker.worker_index(),
+                    "trainer_id": self.role_maker._worker_index(),
                     "shape": var.shape,
                     "slice_shapes": slice_shapes,
                     "slice_varnames": var_ctx.split_varnames(),
                     "remote_varnames": var_ctx.split_varnames(),
                     "is_sparse": True,
                     "endpoints": var_ctx.split_endpoints(),
-                    "pserver_num": len(self.role_maker.get_pserver_endpoints()),
+                    "pserver_num":
+                    len(self.role_maker._get_pserver_endpoints()),
                     "file_path": os.path.join(dirname, var.name)
                 })
@@ -403,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase):
             block.append_op(
                 type='recv_save',
                 attrs={
-                    "trainer_id": self.role_maker.worker_index(),
+                    "trainer_id": self.role_maker._worker_index(),
                     "shape": var.shape,
                     "slice_shapes": slice_shapes,
                     "slice_varnames": slice_varnames,
@@ -411,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase):
                     "is_sparse": True,
                     "endpoints": var_ctx.split_endpoints(),
                     "pserver_num":
-                    len(self.role_maker.get_pserver_endpoints()),
+                    len(self.role_maker._get_pserver_endpoints()),
                     "file_path": os.path.join(dirname, var.name)
                 })

@@ -170,22 +170,40 @@ class CompileTimeStrategy(object):
         return trainer.mode == DistributedMode.ASYNC

     def get_role_id(self):
-        return self.role_maker.role_id()
+        try:
+            return self.role_maker._role_id()
+        except Exception:
+            return self.role_maker.role_id()

     def get_trainers(self):
-        return self.role_maker.worker_num()
+        try:
+            return self.role_maker._worker_num()
+        except Exception:
+            return self.role_maker.worker_num()

     def get_ps_endpoint(self):
-        return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
+        try:
+            return self.role_maker._get_pserver_endpoints()[self.get_role_id()]
+        except Exception:
+            return self.role_maker.get_pserver_endpoints()[self.get_role_id()]

     def get_ps_endpoints(self):
-        return self.role_maker.get_pserver_endpoints()
+        try:
+            return self.role_maker._get_pserver_endpoints()
+        except Exception:
+            return self.role_maker.get_pserver_endpoints()

     def get_heter_worker_endpoints(self):
-        return self.role_maker._get_heter_worker_endpoints()
+        try:
+            return self.role_maker._get_heter_worker_endpoints()
+        except Exception:
+            return self.role_maker.get_heter_worker_endpoints()

     def get_heter_worker_endpoint(self):
-        return self.role_maker._get_heter_worker_endpoint()
+        try:
+            return self.role_maker._get_heter_worker_endpoint()
+        except Exception:
+            return self.role_maker.get_heter_worker_endpoint()

     def get_origin_programs(self):
         return self.origin_main_program, self.origin_startup_program
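
`CompileTimeStrategy` is the one place that cannot assume which role-maker flavor it receives, so each getter above tries the new private name first and falls back to the old public one. The idiom, distilled (class names here are illustrative stand-ins):

    class _CompatSketch(object):
        # role_maker may be old-style (public methods) or new-style
        # (underscore-prefixed); try the new name, fall back to the old.
        def __init__(self, role_maker):
            self.role_maker = role_maker

        def get_trainers(self):
            try:
                return self.role_maker._worker_num()
            except Exception:
                return self.role_maker.worker_num()

    class _OldStyleRoleMaker(object):
        def worker_num(self):
            return 2

    assert _CompatSketch(_OldStyleRoleMaker()).get_trainers() == 2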

@@ -24,10 +24,10 @@ import numpy as np
 class TestFleetBase(unittest.TestCase):
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36000"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001,127.0.0.2:36001"
+            "127.0.0.1:36001,127.0.0.2:36002"

     def test_init(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -58,32 +58,51 @@ class TestFleetBase(unittest.TestCase):
     def test_worker_endpoints(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        print(fleet.worker_endpoints(to_string=True))
+        self.assertEqual(
+            "127.0.0.1:36000", fleet.worker_endpoints(to_string=True))
+        self.assertEqual(["127.0.0.1:36000"], fleet.worker_endpoints())

     def test_server_num(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server num: {}".format(fleet.server_num()))
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        self.assertEqual(2, fleet.server_num())

     def test_server_index(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server index: {}".format(fleet.server_index()))
+        self.assertEqual(0, fleet.server_index())

     def test_server_endpoints(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server index: {}".format(
-                fleet.server_endpoints(to_string=True)))
+        self.assertEqual(
+            "127.0.0.1:36001,127.0.0.2:36002",
+            fleet.server_endpoints(to_string=True))
+        self.assertEqual(["127.0.0.1:36001", "127.0.0.2:36002"],
+                         fleet.server_endpoints())

     def test_is_server(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("test fleet is server")
+        self.assertTrue(fleet.is_server())

     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
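
The rewritten tests replace prints with concrete assertions and show that `PaddleCloudRoleMaker` infers a PSERVER role purely from environment variables. The same setup outside the test harness would look roughly like this (values mirror the test, and the assertions restate what the tests check):

    import os
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    # Environment a parameter server would see, mirroring setUp above.
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36002"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"

    fleet.init(role_maker.PaddleCloudRoleMaker())
    assert fleet.is_server()
    assert fleet.server_index() == 0  # first endpoint matches POD_IP:PADDLE_PORT
    assert fleet.server_num() == 2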

@@ -87,7 +87,7 @@ class TestCloudRoleMaker2(unittest.TestCase):
         role2._all_gather(1)
         role2._all_gather(1)
         role2._barrier_server()
-        role2.all_gather(1)
+        role2._all_gather(1)
         role3 = GeneralRoleMaker(path="./test_gloo_3")
         role3._worker_gather(1)
         role3._worker_gather(1)
