【paddle.fleet】add auto parallel L1 implementations (#27090)

* add auto parallel L1 implementation
test=develop
numel
Dong Daxiang 5 years ago committed by GitHub
parent 5af81f833c
commit 0443b480b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
import copy
import warnings
import paddle
from paddle.fluid.framework import dygraph_only
@ -1008,6 +1009,18 @@ class Fleet(object):
MetaOptimizerFactory()._get_valid_meta_optimizers(
self.user_defined_optimizer)
context["user_defined_strategy"] = copy.copy(self.user_defined_strategy)
# trigger auto-parallel only under a very strict condition, e.g. when the user writes:
# strategy = DistributedStrategy()
# strategy.auto = True
# optimizer = paddle.optimizer.SGD(learning_rate=0.1)
# optimizer = fleet.distributed_optimizer(optimizer, strategy)
if self.user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy)
valid_optimizer_list = []
valid_graph_optimizer_list = []
can_not_apply_optimizer_list = []

@ -42,6 +42,17 @@ class AMPOptimizer(MetaOptimizerBase):
dist_strategy.amp = False
dist_strategy.amp_configs = {}
def _enable_strategy(self, dist_strategy):
    """Turn AMP on for ``dist_strategy`` and install a default config.

    Sets ``dist_strategy.amp`` to ``True`` and then replaces
    ``dist_strategy.amp_configs`` with a fixed set of loss-scaling
    settings. The strategy object is modified in place.

    Args:
        dist_strategy: strategy object whose ``amp`` and ``amp_configs``
            attributes are overwritten.
    """
    default_amp_configs = {
        "init_loss_scaling": 32768.0,
        "incr_every_n_steps": 1000,
        "decr_every_n_nan_or_inf": 2,
        "incr_ratio": 2.0,
        "decr_ratio": 8.0,
        "use_dynamic_loss_scaling": True,
    }
    # Flip the switch first, then publish the configuration.
    dist_strategy.amp = True
    dist_strategy.amp_configs = default_amp_configs
def minimize_impl(self,
loss,
startup_program=None,

@ -69,6 +69,10 @@ class DGCOptimizer(MetaOptimizerBase):
dist_strategy.dgc = False
dist_strategy.dgc_configs = {}
def _enable_strategy(self, dist_strategy):
    """Turn DGC on for ``dist_strategy`` and install a default config.

    Sets ``dist_strategy.dgc`` to ``True`` and then replaces
    ``dist_strategy.dgc_configs`` with fixed ramp-up settings. The
    strategy object is modified in place.

    Args:
        dist_strategy: strategy object whose ``dgc`` and ``dgc_configs``
            attributes are overwritten.
    """
    default_dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}
    dist_strategy.dgc = True
    dist_strategy.dgc_configs = default_dgc_configs
def backward(self,
loss,
startup_program=None,

@ -45,6 +45,10 @@ class GradientMergeOptimizer(MetaOptimizerBase):
dist_strategy.gradient_merge = False
dist_strategy.gradient_merge_configs = {}
def _enable_strategy(self, dist_strategy):
    """Leave ``dist_strategy`` untouched.

    Gradient merge is not auto-enabled at the moment, so this hook is an
    intentional no-op.

    Args:
        dist_strategy: strategy object; not read or modified.
    """
    # we currently do not support auto-enable gradient merge
    return None
def minimize_impl(self,
loss,
startup_program=None,

@ -148,9 +148,6 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
sync_allreduce = dist_strategy.sync_nccl_allreduce
if sync_allreduce:
paddle.fluid.framework.set_flags({
"FLAGS_sync_nccl_allreduce": True
})
exe_strategy.num_threads = local_build_strategy.nccl_comm_num + 1
if local_build_strategy.use_hierarchical_allreduce:
exe_strategy.num_threads = 2 * local_build_strategy.nccl_comm_num + 1
@ -191,7 +188,11 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
def _disable_strategy(self, dist_strategy):
    """Do nothing; ``dist_strategy`` is neither read nor modified.

    Args:
        dist_strategy: strategy object passed in by the caller.
    """
    # TODO(guru4elephant): should close all PE related flags here
    return None
def _enable_strategy(self, dist_strategy):
    """Leave ``dist_strategy`` untouched.

    The graph execution strategy is enabled by default, so there is
    nothing to switch on here.

    Args:
        dist_strategy: strategy object; not read or modified.
    """
    # by default, graph execution strategy is enabled
    return None
def minimize(self,
loss,

@ -75,6 +75,13 @@ class LambOptimizer(MetaOptimizerBase):
dist_strategy.lamb = False
dist_strategy.lamb_configs = {}
def _enable_strategy(self, dist_strategy):
    """Turn LAMB on for ``dist_strategy`` and install a default config.

    Sets ``dist_strategy.lamb`` to ``True`` and then replaces
    ``dist_strategy.lamb_configs`` with a fixed weight-decay value and an
    empty exclusion list. The strategy object is modified in place.

    Args:
        dist_strategy: strategy object whose ``lamb`` and ``lamb_configs``
            attributes are overwritten.
    """
    default_lamb_configs = {
        "lamb_weight_decay": 0.01,
        "exclude_from_weight_decay": [],
    }
    dist_strategy.lamb = True
    dist_strategy.lamb_configs = default_lamb_configs
def backward(self,
loss,
startup_program=None,

@ -59,6 +59,13 @@ class LarsOptimizer(MetaOptimizerBase):
dist_strategy.lars = False
dist_strategy.lars_configs = {}
def _enable_strategy(self, dist_strategy):
    """Turn LARS on for ``dist_strategy`` and install a default config.

    Sets ``dist_strategy.lars`` to ``True`` and then replaces
    ``dist_strategy.lars_configs`` with a fixed coefficient and weight
    decay. The strategy object is modified in place.

    Args:
        dist_strategy: strategy object whose ``lars`` and ``lars_configs``
            attributes are overwritten.
    """
    default_lars_configs = {"lars_coeff": 0.01, "lars_weight_decay": 0.0005}
    dist_strategy.lars = True
    dist_strategy.lars_configs = default_lars_configs
def backward(self,
loss,
startup_program=None,

@ -42,6 +42,10 @@ class LocalSGDOptimizer(MetaOptimizerBase):
dist_strategy.localsgd = False
dist_strategy.localsgd_configs = {}
def _enable_strategy(self, dist_strategy):
    """Turn LocalSGD on for ``dist_strategy`` with ``k_steps`` set to 1.

    Sets ``dist_strategy.localsgd`` to ``True`` and then replaces
    ``dist_strategy.localsgd_configs``. The strategy object is modified
    in place.

    Args:
        dist_strategy: strategy object whose ``localsgd`` and
            ``localsgd_configs`` attributes are overwritten.
    """
    default_localsgd_configs = {"k_steps": 1}
    dist_strategy.localsgd = True
    dist_strategy.localsgd_configs = default_localsgd_configs
def snapshot_name(self, param_name):
    """Return ``param_name`` with ``self.snapshot_key`` appended.

    Args:
        param_name: name to which the snapshot suffix is added.

    Returns:
        The result of ``param_name + self.snapshot_key``.
    """
    snapshot = param_name + self.snapshot_key
    return snapshot

@ -48,6 +48,10 @@ class MetaOptimizerBase(Optimizer):
raise NotImplementedError("you should implement disable strategy in {}".
format(type(self).__name__))
def _enable_strategy(self, dist_strategy):
    """Abstract hook: always raises so subclasses must override it.

    Args:
        dist_strategy: strategy object; unused here.

    Raises:
        NotImplementedError: always, naming the concrete type of ``self``.
    """
    owner_name = type(self).__name__
    message = "you should implement enable strategy in {}".format(owner_name)
    raise NotImplementedError(message)
def apply_gradients(self, params_grads):
    """Delegate to ``self.inner_opt.apply_gradients``.

    Args:
        params_grads: forwarded unchanged as the ``params_grads`` keyword.

    Returns:
        Whatever the wrapped optimizer's ``apply_gradients`` returns.
    """
    wrapped = self.inner_opt
    return wrapped.apply_gradients(params_grads=params_grads)

@ -39,6 +39,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
def _disable_strategy(self, dist_strategy):
    """Reset ``dist_strategy.a_sync_configs`` to an empty dict.

    Args:
        dist_strategy: strategy object whose ``a_sync_configs`` attribute
            is overwritten in place.
    """
    cleared_configs = {}
    dist_strategy.a_sync_configs = cleared_configs
def _enable_strategy(self, dist_strategy):
    """Turn async mode on for ``dist_strategy`` with an empty config.

    Sets ``dist_strategy.a_sync`` to ``True`` and then resets
    ``dist_strategy.a_sync_configs`` to an empty dict.

    Args:
        dist_strategy: strategy object whose ``a_sync`` and
            ``a_sync_configs`` attributes are overwritten.
    """
    # only open up the async mode for auto-parallel
    empty_configs = {}
    dist_strategy.a_sync = True
    dist_strategy.a_sync_configs = empty_configs
def _is_graph_out(self):
    """Return ``True`` unconditionally."""
    graph_out = True
    return graph_out

@ -157,4 +157,9 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return None, None
def _disable_strategy(self, dist_strategy):
    """Reset the async configs on two strategy objects.

    First ``dist_strategy.a_sync_configs`` and then
    ``self.user_defined_strategy.a_sync_configs`` are replaced with
    separate empty dicts.

    Args:
        dist_strategy: strategy object whose ``a_sync_configs`` attribute
            is overwritten in place.
    """
    dist_strategy.a_sync_configs = {}
    own_strategy = self.user_defined_strategy
    own_strategy.a_sync_configs = {}
def _enable_strategy(self, dist_strategy):
    """Turn async mode on for ``dist_strategy`` with an empty config.

    Sets ``dist_strategy.a_sync`` to ``True`` and then resets
    ``dist_strategy.a_sync_configs`` to an empty dict.

    Args:
        dist_strategy: strategy object whose ``a_sync`` and
            ``a_sync_configs`` attributes are overwritten.
    """
    empty_configs = {}
    dist_strategy.a_sync = True
    dist_strategy.a_sync_configs = empty_configs

@ -111,6 +111,10 @@ class PipelineOptimizer(MetaOptimizerBase):
dist_strategy.pipeline = False
dist_strategy.pipeline_configs = {}
def _enable_strategy(self, dist_strategy):
    """Leave ``dist_strategy`` untouched.

    Pipeline is not enabled automatically at the moment, so this hook is
    an intentional no-op.

    Args:
        dist_strategy: strategy object; not read or modified.
    """
    # we do not support enable pipeline automatically right now
    return None
def minimize_impl(self,
loss,
startup_program=None,

@ -49,6 +49,10 @@ class RecomputeOptimizer(MetaOptimizerBase):
dist_strategy.recompute = False
dist_strategy.recompute_configs = {}
def _enable_strategy(self, dist_strategy):
    """Leave ``dist_strategy`` untouched.

    Recompute checkpoints are not chosen automatically at the moment, so
    this hook is an intentional no-op.

    Args:
        dist_strategy: strategy object; not read or modified.
    """
    # we do not support automatically recompute checkpoints currently
    return None
def backward(self,
loss,
startup_program=None,

@ -47,6 +47,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
# Add further fleet tests to the MIXED_DIST_TEST_OPS list.
list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto)
# Remove every test named in MIXED_DIST_TEST_OPS from TEST_OPS.
# NOTE(review): presumably so these tests are registered explicitly
# elsewhere with their own settings -- confirm against the rest of the file.
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@ -458,6 +459,7 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
if(NOT WIN32)
py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})

@ -0,0 +1,51 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
class TestDistributedStrategyAuto(unittest.TestCase):
    """Smoke test for ``DistributedStrategy`` with ``auto`` switched on.

    The test contains no assertions; it passes when building the network
    and calling ``minimize`` completes without raising.
    """

    def setUp(self):
        """Set the environment variables used by this test."""
        os.environ.update({
            "POD_IP": "127.0.0.1",
            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001",
            "PADDLE_TRAINERS_NUM": "2",
            "PADDLE_PSERVERS_IP_PORT_LIST": "127.0.0.1:36001,127.0.0.2:36001",
        })

    def test_distributed_strategy_auto(self):
        """Minimize a three-layer fc network with ``strategy.auto = True``."""
        fleet.init(is_collective=True)

        # Network: two tanh hidden layers followed by a softmax output.
        layers = paddle.fluid.layers
        features = layers.data(name="x", shape=[32], dtype='float32')
        labels = layers.data(name="y", shape=[1], dtype='int64')
        hidden_1 = layers.fc(input=features, size=64, act='tanh')
        hidden_2 = layers.fc(input=hidden_1, size=64, act='tanh')
        prediction = layers.fc(input=[hidden_2], size=2, act='softmax')
        cost = layers.cross_entropy(input=prediction, label=labels)
        avg_cost = layers.mean(x=cost)

        # Only ``auto`` is set on an otherwise default strategy.
        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.auto = True

        sgd = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        dist_optimizer = fleet.distributed_optimizer(sgd, strategy=strategy)
        dist_optimizer.minimize(avg_cost)


if __name__ == "__main__":
    unittest.main()
Loading…
Cancel
Save