change clip grad api, test=develop (#27767)

my_2.0rc
Qi Li 4 years ago committed by GitHub
parent 365c2c9c89
commit 994438b109

@ -5123,6 +5123,8 @@ class Parameter(Variable):
be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
need_clip (bool): Whether the parameter gradient needs to be clipped
in the optimizer. Default is True.
"""
def __init__(self,
@ -5162,6 +5164,8 @@ class Parameter(Variable):
self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False
def __str__(self):
@ -5194,7 +5198,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"do_model_average")
"do_model_average", "need_clip")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
need_clip (bool): Whether the parameter gradient needs to be clipped
in the optimizer. Default is True.
"""
@dygraph_only
@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False
# self.block = default_main_program().global_block()

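For context, a minimal sketch (not part of the diff) of how the need_clip kwarg added to Parameter/ParamBase above shows up at the Python level. It assumes a 2.0rc build with static mode enabled and only uses calls that already appear in this diff (create_parameter, to_string):

import paddle
import paddle.fluid as fluid

paddle.enable_static()

block = fluid.default_main_program().global_block()
# need_clip is forwarded through kwargs into Parameter.__init__ (default True)
w = block.create_parameter(
    name="w", shape=[2, 3], dtype="float32", need_clip=False)

print(w.need_clip)  # False
# with_details=True now also dumps the "need_clip" attribute
print(w.to_string(throw_on_error=False, with_details=True))
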
@ -36,8 +36,8 @@ class ParamAttr(object):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
Parameters:
@ -57,6 +57,7 @@ class ParamAttr(object):
trainable (bool): Whether this parameter is trainable. Default True.
do_model_average (bool): Whether this parameter should do model average
when model average is enabled. Default False.
need_clip (bool): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.
Examples:
.. code-block:: python
@ -78,7 +79,8 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
do_model_average=True):
do_model_average=True,
need_clip=True):
if sys.version_info.major == 2:
check_type(name, "name", (str, type(None), unicode), "ParamAttr")
@ -87,6 +89,7 @@ class ParamAttr(object):
check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
check_type(trainable, "trainable", (bool), "ParamAttr")
check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
check_type(need_clip, "need_clip", (bool), "ParamAttr")
check_type(initializer, "initializer", (Initializer, type(None)),
"ParamAttr")
check_type(regularizer, "regularizer",
@ -101,6 +104,7 @@ class ParamAttr(object):
self.regularizer = regularizer
self.trainable = trainable
self.do_model_average = do_model_average
self.need_clip = need_clip
def _set_default_initializer(self, initializer):
"""
@ -197,7 +201,8 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'do_model_average': self.do_model_average
'do_model_average': self.do_model_average,
'need_clip': self.need_clip
}
if with_initializer:
kwargs['initializer'] = self.initializer
@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
<https://arxiv.org/pdf/1602.07868.pdf>`_.
Note:
``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
trainable(bool, optional): Whether this parameter is trainable. Default True.
do_model_average(bool, optional): Whether this parameter should do model average.
Default False.
need_clip (bool, optional): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.
Examples:
.. code-block:: python
@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=paddle.regularizer.L2Decay(0.1),
trainable=True,
do_model_average=False))
do_model_average=False,
need_clip=True))
"""
# List to record the parameters reparameterized by weight normalization.
@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
do_model_average=False):
do_model_average=False,
need_clip=True):
super(WeightNormParamAttr, self).__init__(
name=name,
initializer=initializer,
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
do_model_average=do_model_average)
do_model_average=do_model_average,
need_clip=need_clip)
self.dim = dim

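To illustrate the param_attr.py change above, a hedged dygraph sketch (assuming a Paddle 2.0rc build; the layer, data and learning rate are illustrative): the bias is excluded from clipping via need_clip=False, while the optimizer-level ClipGradByGlobalNorm still clips the weight gradient.

import paddle

paddle.disable_static()  # dynamic graph mode (the default in 2.0rc)

clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
linear = paddle.nn.Linear(
    10, 10,
    weight_attr=paddle.ParamAttr(need_clip=True),   # clipped
    bias_attr=paddle.ParamAttr(need_clip=False))    # excluded from clipping

sgd = paddle.optimizer.SGD(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)

x = paddle.rand([4, 10], dtype='float32')
loss = paddle.mean(linear(x))
loss.backward()
sgd.step()        # only gradients of parameters with need_clip=True are clipped
sgd.clear_grad()
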
@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self):
def backward_func(cost):
# no clip gradient
def fileter_func(param):
return param.name == "fc.w_0"
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=fileter_func)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
grad_clip=clip)
@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "x"
clip = fluid.clip.GradientClipByGlobalNorm(
self.clip_norm, need_clip=fileter_func)
clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# raise typeError
def test_tpyeError(self):
# the type of need_clip must be an funciton
with self.assertRaises(TypeError):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByNorm(
self.clip_norm, need_clip=fileter_func)
clip = fluid.clip.GradientClipByNorm(self.clip_norm)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
"ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByNorm: grad should not be clipped when filtered out!")
"ClipGradByNorm: grad should not be clipped when filtered out!")
class TestGradientClipByValue(TestGradientClip):
@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByValue(
self.max, self.min, need_clip=fileter_func)
clip = fluid.clip.GradientClipByValue(self.max, self.min)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
"ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByValue: grad should not be clipped when filtered out!")
"ClipGradByValue: grad should not be clipped when filtered out!")
class TestDygraphGradientClip(unittest.TestCase):
@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of x (ParamBase)
def fileter_func(param):
return param.name == "x"
self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip=fileter_func)
clip_norm=self.clip_norm)
self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm)
@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm(
clip_norm=self.clip_norm, need_clip=fileter_func)
self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.max = 0.2
self.min = 0.1
self.clip = fluid.clip.GradientClipByValue(
max=self.max, min=self.min, need_clip=fileter_func)
self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
def check_clip_result(self, loss, optimizer):
# if grad is None

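A short sketch of the behavior the rewritten test_none_grad cases above exercise (static mode, fluid API as in the tests; the clip_norm value is arbitrary): pairs whose gradient is None are dropped, and parameters created with need_clip=False keep their gradient unclipped.

import paddle
import paddle.fluid as fluid

paddle.enable_static()

clip = fluid.clip.GradientClipByNorm(clip_norm=5.0)
block = fluid.default_main_program().global_block()
x = block.create_parameter(
    name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = block.create_parameter(
    name="y", shape=[2, 3], dtype="float32", need_clip=False)

params_grads = clip([(x, None), (x, y)])
assert len(params_grads) == 1           # (x, None) is filtered out
assert params_grads[0][1].name == "y"   # y is passed through unclipped
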
@ -31,9 +31,9 @@ __all__ += rnn.__all__
__all__ += weight_norm_hook.__all__
# TODO: define alias in nn directory
from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS
from .clip import GradientClipByNorm #DEFINE_ALIAS
from .clip import GradientClipByValue #DEFINE_ALIAS
from .clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from .clip import ClipGradByNorm #DEFINE_ALIAS
from .clip import ClipGradByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS
from .clip import clip #DEFINE_ALIAS
from .clip import clip_by_norm #DEFINE_ALIAS
@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS
# from .decode import dynamic_decode #DEFINE_ALIAS
from .decode import gather_tree #DEFINE_ALIAS
# from .input import Input #DEFINE_ALIAS
from .layer.activation import ELU
from .layer.activation import GELU
from .layer.activation import Tanh
from .layer.activation import Hardshrink
from .layer.activation import Hardtanh
from .layer.activation import PReLU
from .layer.activation import ReLU
from .layer.activation import ELU #DEFINE_ALIAS
from .layer.activation import GELU #DEFINE_ALIAS
from .layer.activation import Tanh #DEFINE_ALIAS
from .layer.activation import Hardshrink #DEFINE_ALIAS
from .layer.activation import Hardtanh #DEFINE_ALIAS
from .layer.activation import PReLU #DEFINE_ALIAS
from .layer.activation import ReLU #DEFINE_ALIAS
from .layer.activation import ReLU6 #DEFINE_ALIAS
from .layer.activation import SELU #DEFINE_ALIAS
from .layer.activation import LeakyReLU #DEFINE_ALIAS

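The renamed classes re-exported above are importable directly from paddle.nn (a minimal check, assuming a wheel built from this branch):

import paddle.nn as nn

# formerly exposed as GradientClipByGlobalNorm / ByNorm / ByValue
print(nn.ClipGradByGlobalNorm, nn.ClipGradByNorm, nn.ClipGradByValue)
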
@ -13,18 +13,18 @@
# limitations under the License.
# TODO: define the functions to clip gradient of parameter
from ..fluid.clip import GradientClipByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByValue #DEFINE_ALIAS
from ..fluid.clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import ClipGradByNorm #DEFINE_ALIAS
from ..fluid.clip import ClipGradByValue #DEFINE_ALIAS
from ..fluid.layers import clip #DEFINE_ALIAS
from ..fluid.layers import clip_by_norm #DEFINE_ALIAS
__all__ = [
# 'ErrorClipByValue',
'GradientClipByGlobalNorm',
'GradientClipByNorm',
'GradientClipByValue',
'ClipGradByGlobalNorm',
'ClipGradByNorm',
'ClipGradByValue',
# 'set_gradient_clip',
'clip',
'clip_by_norm'
