refactor momentum op to combine weight (#27414)

* refactor momentum op to combine weight_decay (scale op and sum op)
furnace authored, committed by GitHub
parent bd1d6d3b30
commit 8ff3550658
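In effect, this change folds the weight-decay ops that append_regularization_ops previously emitted for L2 decay (a scale op plus a sum op, as the commit message says) into the momentum op itself. A rough before/after sketch of the per-parameter update ops, illustrative only and not actual Paddle program IR:

# Illustrative only: which ops implement one parameter update with L2 weight decay.
ops_before = [
    ("scale", {"scale": "coeff"}),      # coeff * param        (weight decay)
    ("sum", {}),                        # grad + coeff * param (weight decay)
    ("momentum", {"mu": "mu"}),         # velocity / param update
]
ops_after = [
    ("momentum", {"mu": "mu",
                  "regularization_method": "l2_decay",
                  "regularization_coeff": "coeff"}),  # decay fused into the op
]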

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace operators {
@@ -61,6 +62,12 @@ void MomentumOpMaker::Make() {
                "(bool, default false) "
                "Use Nesterov Momentum")
      .SetDefault(false);
  AddAttr<std::string>(
      "regularization_method",
      "(string) regularization_method, right now only support l2decay or none")
      .SetDefault("");
  AddAttr<float>("regularization_coeff", "(float) regularization_coeff")
      .SetDefault(0);
  AddComment(R"DOC(
Momentum Optimizer.
@@ -90,3 +97,16 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(
    momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(momentum)
    .AddCheckpoint(
        R"ROC(
      Upgrade momentum add 2 attributes [regularization_method, regularization_coeff].
    )ROC",
        paddle::framework::compatible::OpVersionDesc()
            .NewAttr("regularization_method",
                     "(string) regularization_method, right now only support "
                     "l2decay or none",
                     std::string(""))
            .NewAttr("regularization_coeff", "(float) regularization_coeff",
                     0.0f));
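The two new attributes default to an empty method string and 0.0f, so programs saved before this checkpoint keep their previous behavior once the defaults are filled in. Note the C++ description spells the method "l2decay" while the Python wrapper added below passes "l2_decay". For reference, a minimal NumPy-style sketch (not the actual CPU/CUDA kernel) of the update these attributes control, following the equations in the Python docstring further down:

def momentum_update(param, grad, velocity, lr, mu,
                    use_nesterov=False, method="", coeff=0.0):
    # Sketch only; accept either method spelling seen in this diff.
    if method in ("l2decay", "l2_decay"):
        grad = grad + coeff * param          # fused L2 weight decay
    velocity_out = mu * velocity + grad      # velocity update
    if use_nesterov:
        param_out = param - (grad + mu * velocity_out) * lr
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out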

File diff suppressed because it is too large.

@@ -35,6 +35,7 @@ from . import mixed_precision
from .mixed_precision import *
from . import layers
from .layers import *
from . import optimizer
__all__ = []
__all__ += decoder.__all__
@@ -46,3 +47,4 @@ __all__ += utils.__all__
__all__ += extend_optimizer.__all__
__all__ += ['mixed_precision']
__all__ += layers.__all__
__all__ += optimizer.__all__
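With this export in place, the new wrapper is reachable from the contrib namespace. A minimal usage sketch (the full static-graph example is in the class docstring below):

import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.optimizer import Momentum  # direct submodule import

paddle.enable_static()
# Same class through the contrib namespace, as used in the docstring example:
opt = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)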

@@ -0,0 +1,175 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Optimizer
from paddle.fluid.regularizer import L1DecayRegularizer
from paddle.fluid.regularizer import L2DecayRegularizer
from paddle.fluid.regularizer import append_regularization_ops
from paddle.fluid import framework
from paddle.fluid import core
from paddle.fluid.framework import program_guard
from paddle.fluid.clip import append_gradient_clip_ops
__all__ = ['Momentum']
class Momentum(Optimizer):
    """
    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nesterov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad param = param - learning\_rate * velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor
        parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            paddle.enable_static()
            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = paddle.static.data(name='x', shape=[1, 13], dtype='float32')
                y = paddle.static.data(name='y', shape=[1], dtype='float32')
                linear = paddle.nn.Linear(13, 1)
                y_predict = linear(x)
                cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
                avg_cost = paddle.mean(cost)

                moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
                moment_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(paddle.static.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
    """
    _velocity_acc_str = "velocity"

    def __init__(self,
                 learning_rate,
                 momentum,
                 parameter_list=None,
                 use_nesterov=False,
                 regularization=None,
                 grad_clip=None,
                 name=None):
        assert learning_rate is not None
        assert momentum is not None
        predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
        py_regular = None if predicate(regularization) else regularization
        super(Momentum, self).__init__(
            learning_rate=learning_rate,
            parameter_list=parameter_list,
            regularization=py_regular,
            grad_clip=grad_clip,
            name=name)
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        self._regularization_method = ""
        self._regularization_coeff = 0
        if isinstance(regularization, L2DecayRegularizer):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = regularization._regularization_coeff
    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
            _, _ = core.ops.momentum(
                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
                param_and_grad[0], velocity_acc, 'mu', self._momentum,
                'use_nesterov', self._use_nesterov, 'regularization_method',
                self._regularization_method, 'regularization_coeff',
                self._regularization_coeff)
            return None

        attrs = {
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
            "regularization_coeff": self._regularization_coeff
        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }
        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return momentum_op
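As a sketch of how the constructor above routes regularization: an L2DecayRegularizer is withheld from the base Optimizer (py_regular is None) and encoded as op attributes, so the decay runs inside the momentum kernel, while any other regularizer such as L1Decay still goes through the base class path. An illustrative check, assuming static mode and the import path added in this PR:

import paddle
from paddle.fluid.contrib.optimizer import Momentum
from paddle.fluid.regularizer import L1DecayRegularizer, L2DecayRegularizer

paddle.enable_static()

# L2 decay is fused: it shows up as op attributes, not as extra ops.
opt_l2 = Momentum(learning_rate=0.001, momentum=0.9,
                  regularization=L2DecayRegularizer(regularization_coeff=0.1))
assert opt_l2._regularization_method == "l2_decay"
assert opt_l2._regularization_coeff == 0.1

# L1 decay is not fused; it is handled by the base Optimizer as before.
opt_l1 = Momentum(learning_rate=0.001, momentum=0.9,
                  regularization=L1DecayRegularizer(regularization_coeff=0.1))
assert opt_l1._regularization_method == ""
assert opt_l1._regularization_coeff == 0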

File diff suppressed because it is too large.