add new method of gradient_clip, better to use, test=develop (#23224)

revert-23830-2.0-beta
Zhou Wei 5 years ago committed by GitHub
parent b7b0b3595b
commit 7fda333ac1
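Taken together, these changes route gradient clipping through a single grad_clip argument on Optimizer.minimize, backed by the GradientClipBase classes in fluid.clip, and retire both the fluid.dygraph_grad_clip module and the gradient_clip field of ParamAttr. A minimal static-graph sketch of the new entry point, assuming a toy regression network (layer sizes and names are illustrative, not taken from this diff):

import paddle.fluid as fluid

# A small network; any program with trainable parameters would do.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
avg_cost = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_predict, label=y))

# New style: build a clip object from fluid.clip and hand it to minimize.
clip = fluid.clip.GradientClipByGlobalNorm(1.0)
sgd = fluid.optimizer.SGD(learning_rate=0.1)
sgd.minimize(avg_cost, grad_clip=clip)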

@ -75,7 +75,6 @@ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import dygraph_grad_clip
from . import profiler
from . import unique_name
from . import parallel_executor
@ -122,7 +121,6 @@ __all__ = framework.__all__ + executor.__all__ + \
'WeightNormParamAttr',
'DataFeeder',
'clip',
'dygraph_grad_clip',
'profiler',
'unique_name',
'Scope',

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -2409,7 +2409,6 @@ class Block(object):
trainable = v.trainable
optimize_attr = v.optimize_attr
regularizer = v.regularizer
gradient_clip_attr = v.gradient_clip_attr
error_clip = v.error_clip
elif type(v) == Variable:
var_type = "Variable"
@ -2432,7 +2431,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
else:
var = Parameter(
@ -2445,7 +2443,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
elif var_type == "Variable":
var = Variable(
@ -2723,7 +2720,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
else:
@ -2737,7 +2733,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
self.vars[new_p.name] = new_p
@ -4646,8 +4641,6 @@ class Parameter(Variable):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the parameter. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
"""
@ -4687,8 +4680,6 @@ class Parameter(Variable):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False
@ -4723,7 +4714,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"gradient_clip_attr", "do_model_average")
"do_model_average")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
@ -4752,8 +4743,6 @@ class ParamBase(core.VarBase):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the ParamBase. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
"""
@ -4792,8 +4781,6 @@ class ParamBase(core.VarBase):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False

@ -24,7 +24,7 @@ from . import framework
from . import layers
from . import unique_name
from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import append_gradient_clip_ops, error_clip_callback
from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
@ -109,6 +109,8 @@ class Optimizer(object):
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
# if grad_clip is passed into minimize, it will not be None
self._grad_clip = None
@framework.dygraph_only
def state_dict(self):
@ -690,12 +692,17 @@ class Optimizer(object):
# ...
optimizer.apply_gradients(params_grads)
"""
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
params_grads = append_gradient_clip_ops(params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = append_regularization_ops(params_grads,
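The branch just above gives a grad_clip object passed to minimize priority over a strategy registered with fluid.clip.set_gradient_clip: append_gradient_clip_ops (which honors set_gradient_clip) only runs when self._grad_clip is still None. A hedged sketch of the two entry points side by side (set_gradient_clip is assumed to take the clip object as its single required argument, as in existing fluid usage):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

# Old entry point: register a program-level clip strategy.
fluid.clip.set_gradient_clip(fluid.clip.GradientClipByNorm(1.0))

# New entry point: pass a clip object to minimize. Because self._grad_clip is
# then not None, apply_gradients applies it and skips append_gradient_clip_ops,
# so the strategy registered above is ignored for this optimizer.
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer.minimize(loss, grad_clip=fluid.clip.GradientClipByGlobalNorm(1.0))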
@ -712,19 +719,19 @@ class Optimizer(object):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
"""
if framework.in_dygraph_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(params_grads,
self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
@ -809,16 +816,19 @@ class Optimizer(object):
Please refer to the example of current Optimizer.
"""
assert isinstance(loss, Variable), "The loss should be an Variable."
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip is not None and framework.in_dygraph_mode():
# TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
params_grads = grad_clip(params_grads)
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer):
self.regular_type, self.regular_coeff = self._get_regularization_param(
self.regularization)
self._grad_clip = None
def _get_regularization_param(self, regularization):
regular_type = 0
@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param_var.name, grad_var.name])
@imperative_base.no_grad
def apply_gradients(self, params_grads):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
not_dgc_params_grads = []
dgc_params_grads = []
# DGC clip and regularization in optimizer.backward
for param, grad in params_grads:
if not self._is_use_dgc(param, grad):
not_dgc_params_grads.append((param, grad))
else:
dgc_params_grads.append((param, grad))
# DGC clip and regularization in local
not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
not_dgc_params_grads = self._grad_clip(not_dgc_params_grads)
else:
not_dgc_params_grads = append_gradient_clip_ops(
not_dgc_params_grads)
# Add regularization if any
not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
self.regularization)
@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer):
def apply_optimize(self, loss, startup_program, params_grads):
"""
call the apply_optimize function of self._optimizer
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Examples:
.. code-block:: python
import paddle.fluid as fluid
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer):
cost, startup_program=None, params_grads=params_grads)
print("Finished apply_optimize")
"""
return self._optimizer.apply_optimize(
@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer):
parameter_list=None,
no_grad_set=None,
grad_clip=None):
assert (isinstance(loss, Variable)), "The loss should be an Variable."
assert isinstance(loss, Variable), "The loss should be an Variable."
assert (self._checkpoints is not None
), "You should call _set_checkpoints first"
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph current does not support recompute")
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._optimizer._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip:
# TODO(guru4elephant): should add grad_clip for static graph
pass
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
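RecomputeOptimizer.minimize now performs the same GradientClipBase type check and forwards grad_clip to the wrapped optimizer through self._optimizer._grad_clip, replacing the old TODO/pass branch. A rough static-graph usage sketch, assuming the mlp-style setup from the apply_optimize docstring above (layer sizes, names, and the checkpoint choice are illustrative):

import paddle.fluid as fluid

input_x = fluid.layers.data(name='x', shape=[32], dtype='float32')
input_y = fluid.layers.data(name='y', shape=[1], dtype='int64')
fc_1 = fluid.layers.fc(input=input_x, size=128, act='relu')
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
loss = fluid.layers.mean(
    fluid.layers.cross_entropy(input=prediction, label=input_y))

sgd = fluid.optimizer.SGD(learning_rate=0.01)
recompute = fluid.optimizer.RecomputeOptimizer(sgd)
recompute._set_checkpoints([fc_1])

# grad_clip is type-checked here and then applied by the inner SGD optimizer.
recompute.minimize(loss, grad_clip=fluid.clip.GradientClipByGlobalNorm(1.0))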

@ -15,6 +15,7 @@
from __future__ import print_function
import six
import warnings
from .initializer import Initializer, Xavier, Constant
from .regularizer import WeightDecayRegularizer
@ -68,7 +69,6 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=True):
self.name = name
if isinstance(self.name, six.string_types) and self.name == "":
@ -78,7 +78,6 @@ class ParamAttr(object):
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
self.gradient_clip = gradient_clip
self.do_model_average = do_model_average
def _set_default_initializer(self, initializer):
@ -176,7 +175,6 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'gradient_clip_attr': self.gradient_clip,
'do_model_average': self.do_model_average
}
if with_initializer:
@ -248,7 +246,6 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=False):
super(WeightNormParamAttr, self).__init__(
name=name,
@ -256,6 +253,5 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
gradient_clip=gradient_clip,
do_model_average=do_model_average)
self.dim = dim

@ -476,15 +476,18 @@ class TestL2Decay(TranspilerTest):
size=1000,
act=None,
param_attr=fluid.ParamAttr(
name='fc_w',
regularizer=fluid.regularizer.L2Decay(),
gradient_clip=fluid.clip.GradientClipByValue(0.1)),
name='fc_w', regularizer=fluid.regularizer.L2Decay()),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(avg_cost)
def filter(param):
return param.name == "fc_w"
clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
sgd_optimizer.minimize(avg_cost, grad_clip=clip)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
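The test change above also shows what replaces the removed ParamAttr(gradient_clip=...) field: which parameters are clipped is now decided by the clip object itself through its need_clip callable. A small self-contained sketch of that pattern (the parameter names mirror the test; the network itself is illustrative):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(
    input=x, size=1,
    param_attr=fluid.ParamAttr(name='fc_w'),
    bias_attr=fluid.ParamAttr(name='fc_b'))
loss = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_predict, label=y))

# Only parameters for which need_clip returns True have their gradients
# clipped; here the filter keeps just the fc weight, as in the test above.
def only_fc_w(param):
    return param.name == 'fc_w'

clip = fluid.clip.GradientClipByValue(0.1, need_clip=only_fc_w)
fluid.optimizer.SGD(learning_rate=0.1).minimize(loss, grad_clip=clip)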

@ -25,7 +25,7 @@ from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.clip import GradientClipByValue, GradientClipByNorm, GradientClipByGlobalNorm
class TestGradClipByGlobalNorm(unittest.TestCase):
@ -65,7 +65,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
def get_dygrap_global_norm_result(self):
with fluid.dygraph.guard():
gloabl_norm_clip = GradClipByGlobalNorm(self.max_global_norm)
gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
@ -135,7 +135,7 @@ class TestGradClipByNorm(unittest.TestCase):
def get_dygrap_norm_result(self):
with fluid.dygraph.guard():
norm_clip = GradClipByNorm(self.max_norm)
norm_clip = GradientClipByNorm(self.max_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
@ -200,8 +200,8 @@ class TestGradClipByValue(unittest.TestCase):
def get_dygrap_clip_result(self):
with fluid.dygraph.guard():
value_clip = GradClipByValue(self.min_value, self.max_value)
value_clip = GradientClipByValue(
max=self.max_value, min=self.min_value)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
@ -225,7 +225,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_norm_2(self):
def test_clip_by_value_2(self):
self.init_value()
self.init_scale = 0.2
@ -236,7 +236,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_norm_3(self):
def test_clip_by_value_3(self):
self.init_value()
self.init_scale = 0.5

File diff suppressed because it is too large

@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
embed = fluid.dygraph.to_variable(embed)
@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer2(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed)

@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
5.0)
grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
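In dygraph mode the same classes from fluid.clip replace the removed fluid.dygraph_grad_clip module, and the clip object is again handed to minimize(grad_clip=...). A minimal dygraph sketch, assuming a trivial fluid.dygraph.Linear model (sizes and data are illustrative):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(4, 2)
    optimizer = fluid.optimizer.AdamOptimizer(
        0.001, parameter_list=model.parameters())
    grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)

    x = fluid.dygraph.to_variable(
        np.random.random((3, 4)).astype('float32'))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    # In dygraph, _create_optimization_pass applies self._grad_clip before
    # regularization whenever grad_clip was passed to minimize.
    optimizer.minimize(loss, grad_clip=grad_clip)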
