add adaptivelsgd in meta_optimizer (#27289)

* add adaptivelsgd

* Todo fix the code to avoid the conflict.
revert-27520-disable_pr
ShenLiang 4 years ago committed by GitHub
parent 6e29c2da05
commit 54b81fa32c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -41,6 +41,11 @@ message LocalSGDConfig {
optional int32 begin_step = 2 [ default = 1 ];
}
// Settings for the adaptive Local SGD strategy; stored in
// DistributedStrategy.adaptive_localsgd_configs.
message AdaptiveLocalSGDConfig {
// Initial number of local steps. The adaptive localsgd method is documented
// as adjusting this value automatically during training. Defaults to 1.
optional int32 init_k_steps = 1 [ default = 1 ];
// Step at which training with adaptive localsgd begins. Defaults to 1.
optional int32 begin_step = 2 [ default = 1 ];
}
message GradientMergeConfig {
optional int32 k_steps = 1 [ default = 1 ];
optional bool avg = 2 [ default = true ];
@ -121,6 +126,7 @@ message DistributedStrategy {
optional bool cudnn_exhaustive_search = 21 [ default = true ];
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional bool adaptive_localsgd = 24 [ default = false ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
@ -131,6 +137,7 @@ message DistributedStrategy {
optional AsyncConfig a_sync_configs = 107;
optional LarsConfig lars_configs = 108;
optional LambConfig lamb_configs = 109;
optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
optional BuildStrategy build_strategy = 201;
optional ExecutionStrategy execution_strategy = 202;
}

@ -728,6 +728,63 @@ class DistributedStrategy(object):
"localsgd_configs")
assign_configs_value(self.strategy.localsgd_configs, configs)
@property
def adaptive_localsgd(self):
    """
    Indicating whether we are using Adaptive Local SGD training. Default Value: False
    For more details, please refer to `Adaptive Communication Strategies to Achieve
    the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.

    Returns:
        bool: the current value of the ``adaptive_localsgd`` strategy field.

    Examples:
      .. code-block:: python

        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.adaptive_localsgd = True # by default this is false
    """
    # Read the dedicated adaptive_localsgd field, not the plain localsgd
    # switch: the two strategies are toggled independently.
    return self.strategy.adaptive_localsgd

@adaptive_localsgd.setter
@is_strict_auto
def adaptive_localsgd(self, flag):
    """
    Enable or disable Adaptive Local SGD training.

    Args:
        flag(bool): the new value. Non-bool values are ignored and a
            warning is printed instead of raising.
    """
    if isinstance(flag, bool):
        # Write the dedicated field so that enabling adaptive localsgd does
        # not silently flip the ordinary localsgd strategy.
        self.strategy.adaptive_localsgd = flag
    else:
        print("WARNING: adaptive_localsgd should have value of bool type")
@property
def adaptive_localsgd_configs(self):
    """
    Configurations of AdaptiveLocalSGD training, exposed as a dict.

    **Notes**:
        init_k_steps(int) The initial steps for training before adaptive localsgd.
                          Then, the adaptive localsgd method will modify init_k_steps automatically.
                          Default 1.

        begin_step(int) The step of beginning training by adaptive localsgd. Default 1.

    Examples:
      .. code-block:: python

        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.adaptive_localsgd = True
        strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
                                              "begin_step": 30}
    """
    # Convert the underlying config message into a plain dict for callers.
    config_msg = self.strategy.adaptive_localsgd_configs
    return get_msg_dict(config_msg)
@adaptive_localsgd_configs.setter
@is_strict_auto
def adaptive_localsgd_configs(self, configs):
    """
    Update the AdaptiveLocalSGD configurations from a dict.

    Args:
        configs(dict): maps config names (e.g. ``init_k_steps``,
            ``begin_step``) to their new values. The keys are checked
            against the config message before any value is assigned.
    """
    target_msg = self.strategy.adaptive_localsgd_configs
    # Validate the keys first, then copy the values into the message.
    check_configs_key(target_msg, configs, "adaptive_localsgd_configs")
    assign_configs_value(target_msg, configs)
@property
def dgc(self):
"""

@ -18,6 +18,7 @@ from .graph_execution_optimizer import GraphExecutionOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer
from .localsgd_optimizer import AdaptiveLocalSGDOptimizer
from .lars_optimizer import LarsOptimizer
from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
from .dgc_optimizer import DGCOptimizer

@ -24,7 +24,7 @@ class AMPOptimizer(MetaOptimizerBase):
self.meta_optimizers_white_list = [
"LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
"LocalSGDOptimizer", "GradientMergeOptimizer",
"GraphExecutionOptimizer"
"GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
]
self.meta_optimizers_black_list = ["DGCOptimizer"]

@ -86,6 +86,13 @@ class TestStrategyConfig(unittest.TestCase):
self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
def test_adaptive_localsgd_configs(self):
    """Values assigned to adaptive_localsgd_configs can be read back."""
    strategy = paddle.distributed.fleet.DistributedStrategy()
    expected = {"init_k_steps": 1, "begin_step": 120}
    strategy.adaptive_localsgd_configs = expected
    # Re-read the property for every key, checking them in insertion order.
    for key, value in expected.items():
        self.assertEqual(strategy.adaptive_localsgd_configs[key], value)
def test_dgc(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.dgc = True

@ -52,5 +52,36 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
optimizer.minimize(avg_cost)
class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase):
    """Checks that minimize() runs with the adaptive localsgd strategy enabled."""

    def setUp(self):
        # Describe a two-trainer collective job; this process acts as trainer 1.
        os.environ.update({
            "PADDLE_TRAINER_ID": "1",
            "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002",
        })

    def test_adaptive_localsgd_optimizer(self):
        """Build a small classifier and minimize it through fleet."""
        fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))

        layers = paddle.fluid.layers
        input_x = layers.data(name="x", shape=[32], dtype='float32')
        input_y = layers.data(name="y", shape=[1], dtype='int64')

        # Two fully connected layers followed by a mean cross-entropy loss.
        hidden = layers.fc(input=input_x, size=64, act='tanh')
        prediction = layers.fc(input=[hidden], size=2, act='softmax')
        cost = layers.cross_entropy(input=prediction, label=input_y)
        avg_cost = layers.mean(x=cost)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.adaptive_localsgd = True
        # Start from the current configs and override both fields.
        config = strategy.adaptive_localsgd_configs
        config.update({'init_k_steps': 1, 'begin_step': 1})
        strategy.adaptive_localsgd_configs = config

        sgd = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        dist_optimizer = fleet.distributed_optimizer(sgd, strategy=strategy)
        dist_optimizer.minimize(avg_cost)
# Run the unittest test runner when this file is executed as a script.
if __name__ == "__main__":
unittest.main()

Loading…
Cancel
Save