【paddle.fleet】distributed_optimizer supports dygraph (#26541)

paddle.distributed.fleet now supports dynamic graph (dygraph) execution: fleet.distributed_optimizer accepts a dygraph optimizer, fleet.distributed_model wraps the model for collective training, and step, clear_grad, set_lr, get_lr, state_dict, and set_state_dict are exposed on the wrapped optimizer.
danleifeng 5 years ago committed by GitHub
parent c8cc094576
commit 6b4ca0d7f1

@@ -50,3 +50,10 @@ distributed_optimizer = fleet.distributed_optimizer
save_inference_model = fleet.save_inference_model
save_persistables = fleet.save_persistables
minimize = fleet.minimize
distributed_model = fleet.distributed_model
step = fleet.step
clear_grad = fleet.clear_grad
set_lr = fleet.set_lr
get_lr = fleet.get_lr
state_dict = fleet.state_dict
set_state_dict = fleet.set_state_dict
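
For orientation, here is a minimal sketch of how these newly exported dygraph entry points fit together. The Linear layer, Adam optimizer, and random input are illustrative rather than taken from this diff, and fleet.init(is_collective=True) assumes the usual collective launch environment variables are present:

import numpy as np
import paddle
import paddle.distributed.fleet as fleet

paddle.disable_static()
fleet.init(is_collective=True)

layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())

# wrap optimizer and model for collective dygraph training
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)

x = paddle.to_tensor(np.random.rand(2, 13).astype("float32"))
loss = paddle.mean(dp_layer(x))
loss.backward()
adam.step()        # fleet.step
adam.clear_grad()  # fleet.clear_grad

In the test trainer added below, the collective branch additionally scales the loss with model.scale_loss and allreduces gradients with model.apply_collective_grads before the optimizer step.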

File diff suppressed because it is too large

@@ -114,8 +114,8 @@ class TestMnist(TestParallelDyGraphRunnerBase):
model = MNIST()
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
opt = fluid.optimizer.Adam(
learning_rate=1e-3, parameter_list=model.parameters())
opt = paddle.optimizer.Adam(
learning_rate=1e-3, parameters=model.parameters())
return model, train_reader, opt
def run_one_loop(self, model, opt, data):
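
The visible change in this hunk is the migration from the fluid 1.x dygraph optimizer to the paddle 2.0 one; the keyword argument is renamed from parameter_list to parameters. A minimal sketch of the two forms side by side (the Linear layer is illustrative, not part of this diff):

import paddle
import paddle.fluid as fluid

paddle.disable_static()
layer = paddle.nn.Linear(13, 5)

# 1.x dygraph API (the removed line):
# opt = fluid.optimizer.Adam(learning_rate=1e-3, parameter_list=layer.parameters())

# 2.0 API (the added line):
opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=layer.parameters())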

@@ -488,6 +488,50 @@ class TestParallelDyGraphRunnerBase(object):
model.clear_gradients()
return out_losses
def run_gpu_fleet_api_trainer(self, args):
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
# 1. enable dygraph
paddle.disable_static()
# 2. init seed
seed = 90
paddle.static.default_startup_program().random_seed = seed
paddle.static.default_main_program().random_seed = seed
np.random.seed(seed)
random.seed(seed)
# get trainer id
args.trainer_id = paddle.distributed.get_rank()
# 3. init parallel env
if args.update_method == "nccl2":
fleet.init(is_collective=True)
# 4. train model
model, train_reader, opt = self.get_model()
if args.update_method == "nccl2":
opt = fleet.distributed_optimizer(opt)
model = fleet.distributed_model(model)
out_losses = []
for step_id, data in enumerate(train_reader()):
data = self._get_data(data, args)
if step_id == RUN_STEP:
break
loss = self.run_one_loop(model, opt, data)
out_losses.append(loss.numpy())
if args.update_method == "nccl2":
loss = model.scale_loss(loss)
loss.backward()
if args.update_method == "nccl2":
model.apply_collective_grads()
opt.step()
opt.clear_grad()
print_to_out(out_losses)
def runtime_main(test_class):
parser = argparse.ArgumentParser(description='Run dist test.')
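
The run_gpu_fleet_api_trainer path above boils down to the following per-process pattern. This standalone sketch substitutes a toy Linear model and random data for MNIST, and assumes it is started under a collective launcher that sets the PADDLE_* environment variables for each trainer:

import numpy as np
import paddle
import paddle.distributed.fleet as fleet

def train_one_process(num_steps=2):
    paddle.disable_static()
    fleet.init(is_collective=True)

    layer = paddle.nn.Linear(13, 5)
    opt = paddle.optimizer.Adam(learning_rate=1e-3, parameters=layer.parameters())
    opt = fleet.distributed_optimizer(opt)
    model = fleet.distributed_model(layer)

    losses = []
    for _ in range(num_steps):
        x = paddle.to_tensor(np.random.rand(2, 13).astype("float32"))
        loss = paddle.mean(model(x))
        losses.append(loss.numpy())

        # mirrors the nccl2 branch: scale the loss by the trainer count,
        # run backward, then allreduce the gradients before the update
        loss = model.scale_loss(loss)
        loss.backward()
        model.apply_collective_grads()

        opt.step()
        opt.clear_grad()
    return losses

if __name__ == "__main__":
    print(train_one_process())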
@@ -687,7 +731,8 @@ class TestDistBase(unittest.TestCase):
envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
cmd += " -m coverage run --branch -p"
cmd += " %s --role trainer --lr %f" % (model, self._lr)
cmd += " %s --role trainer --update_method local --lr %f" % (model,
self._lr)
if batch_size != DEFAULT_BATCH_SIZE:
cmd += " --batch_size %d" % batch_size
@@ -850,6 +895,7 @@ class TestDistBase(unittest.TestCase):
if self.__use_cuda:
tr_cmd += " --use_cuda"
env.update({
"FLAGS_selected_gpus": "{}".format(0),
"CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2),
"PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
"PADDLE_TRAINER_ID": "{}".format(trainer_id),

@@ -126,6 +126,32 @@ class TestFleetBase(unittest.TestCase):
self.assertRaises(Exception, fleet.init_worker)
class TestFleetDygraph(unittest.TestCase):
def setUp(self):
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213,127.0.0.1:36214"
os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_TRAINER_ID"] = "0"
def test_dygraph_method(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = fluid.dygraph.to_variable(value)
layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(
learning_rate=0.01, parameters=layer.parameters())
# fleet.init() is omitted here because this UT cannot launch a distributed task
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)
lr = 0.001
adam.set_lr(lr)
cur_lr = adam.get_lr()
assert (lr == cur_lr)
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
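
A natural follow-on, not exercised in this test, is persisting the wrapped optimizer's state between runs. A sketch assuming the paddle.save / paddle.load serialization helpers and the same PADDLE_* environment variables as in setUp above:

import paddle
import paddle.distributed.fleet as fleet

paddle.disable_static()
layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
adam = fleet.distributed_optimizer(adam)

paddle.save(adam.state_dict(), "adam.pdopt")    # snapshot the optimizer state
adam.set_state_dict(paddle.load("adam.pdopt"))  # restore it later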
class TestFleetBaseSingleRunCollective(unittest.TestCase):
def setUp(self):
os.environ.pop("PADDLE_TRAINER_ENDPOINTS")

@@ -47,5 +47,21 @@ class TestParallelDygraphMnistSpawn(TestDistSpawnRunner):
self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5)
class TestFleetDygraphMnist(TestDistBase):
def _setup_config(self):
self._sync_mode = False
self._nccl2_mode = True
self._dygraph = True
self._gpu_fleet_api = True
def test_mnist(self):
if fluid.core.is_compiled_with_cuda():
self.check_with_place(
"parallel_dygraph_mnist.py",
delta=1e-5,
check_error_log=True,
log_name=flag_name)
if __name__ == "__main__":
unittest.main()
