change clip grad api, test=develop (#27767)

my_2.0rc
Qi Li 4 years ago committed by GitHub
parent 365c2c9c89
commit 994438b109

@ -5123,6 +5123,8 @@ class Parameter(Variable):
be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
need_clip (bool): Whether the parameter gradient needs to be clipped
in the optimizer. Default is True.
"""
def __init__(self,
@ -5162,6 +5164,8 @@ class Parameter(Variable):
self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False
def __str__(self):
@ -5194,7 +5198,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"do_model_average")
"do_model_average", "need_clip")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
need_clip (bool): Whether the parameter gradient needs to be clipped
in the optimizer. Default is True.
"""
@dygraph_only
@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
self.do_model_average = kwargs.get('do_model_average', None)
self.need_clip = kwargs.get('need_clip', True)
self.is_distributed = False
# self.block = default_main_program().global_block()

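For context, a minimal sketch (not part of the diff) of how the need_clip kwarg added to Parameter/ParamBase above shows up at the Python level. It assumes a 2.0rc build with static mode enabled and only uses calls that already appear in this diff (create_parameter, to_string):

import paddle
import paddle.fluid as fluid

paddle.enable_static()

block = fluid.default_main_program().global_block()
# need_clip is forwarded through kwargs into Parameter.__init__ (default True)
w = block.create_parameter(
    name="w", shape=[2, 3], dtype="float32", need_clip=False)

print(w.need_clip)  # False
# with_details=True now also dumps the "need_clip" attribute
print(w.to_string(throw_on_error=False, with_details=True))
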
@ -36,8 +36,8 @@ class ParamAttr(object):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
Parameters:
@ -57,6 +57,7 @@ class ParamAttr(object):
trainable (bool): Whether this parameter is trainable. Default True.
do_model_average (bool): Whether this parameter should do model average
when model average is enabled. Default False.
need_clip (bool): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.
Examples:
.. code-block:: python
@ -78,7 +79,8 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
do_model_average=True):
do_model_average=True,
need_clip=True):
if sys.version_info.major == 2:
check_type(name, "name", (str, type(None), unicode), "ParamAttr")
@ -87,6 +89,7 @@ class ParamAttr(object):
check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
check_type(trainable, "trainable", (bool), "ParamAttr")
check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
check_type(need_clip, "need_clip", (bool), "ParamAttr")
check_type(initializer, "initializer", (Initializer, type(None)),
"ParamAttr")
check_type(regularizer, "regularizer",
@ -101,6 +104,7 @@ class ParamAttr(object):
self.regularizer = regularizer
self.trainable = trainable
self.do_model_average = do_model_average
self.need_clip = need_clip
def _set_default_initializer(self, initializer):
"""
@ -197,7 +201,8 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'do_model_average': self.do_model_average
'do_model_average': self.do_model_average,
'need_clip': self.need_clip
}
if with_initializer:
kwargs['initializer'] = self.initializer
@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
<https://arxiv.org/pdf/1602.07868.pdf>`_.
Note:
``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
trainable(bool, optional): Whether this parameter is trainable. Default True.
do_model_average(bool, optional): Whether this parameter should do model average.
Default False.
need_clip (bool, optional): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.
Examples:
.. code-block:: python
@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=paddle.regularizer.L2Decay(0.1),
trainable=True,
do_model_average=False))
do_model_average=False,
need_clip=True))
"""
# List to record the parameters reparameterized by weight normalization.
@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
do_model_average=False):
do_model_average=False,
need_clip=True):
super(WeightNormParamAttr, self).__init__(
name=name,
initializer=initializer,
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
do_model_average=do_model_average)
do_model_average=do_model_average,
need_clip=need_clip)
self.dim = dim

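To illustrate the param_attr.py change above, a hedged dygraph sketch (assuming a Paddle 2.0rc build; the layer, data and learning rate are illustrative): the bias is excluded from clipping via need_clip=False, while the optimizer-level ClipGradByGlobalNorm still clips the weight gradient.

import paddle

paddle.disable_static()  # dynamic graph mode (the default in 2.0rc)

clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
linear = paddle.nn.Linear(
    10, 10,
    weight_attr=paddle.ParamAttr(need_clip=True),   # clipped
    bias_attr=paddle.ParamAttr(need_clip=False))    # excluded from clipping

sgd = paddle.optimizer.SGD(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)

x = paddle.rand([4, 10], dtype='float32')
loss = paddle.mean(linear(x))
loss.backward()
sgd.step()        # only gradients of parameters with need_clip=True are clipped
sgd.clear_grad()
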
@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self):
def backward_func(cost):
# no clip gradient
def fileter_func(param):
return param.name == "fc.w_0"
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0, need_clip=fileter_func)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
fluid.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
grad_clip=clip)
@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "x"
clip = fluid.clip.GradientClipByGlobalNorm(
self.clip_norm, need_clip=fileter_func)
clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
y = fluid.default_main_program().global_block().create_parameter(
@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# raise typeError
def test_tpyeError(self):
# the type of need_clip must be an funciton
with self.assertRaises(TypeError):
clip = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip="test")
# the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
with self.assertRaises(TypeError):
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByNorm(
self.clip_norm, need_clip=fileter_func)
clip = fluid.clip.GradientClipByNorm(self.clip_norm)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
"ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByNorm: grad should not be clipped when filtered out!")
"ClipGradByNorm: grad should not be clipped when filtered out!")
class TestGradientClipByValue(TestGradientClip):
@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
def fileter_func(param):
return param.name == "z"
clip = fluid.clip.GradientClipByValue(
self.max, self.min, need_clip=fileter_func)
clip = fluid.clip.GradientClipByValue(self.max, self.min)
x = fluid.default_main_program().global_block().create_parameter(
name="x", shape=[2, 3], dtype="float32")
name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = fluid.default_main_program().global_block().create_parameter(
name="y", shape=[2, 3], dtype="float32")
name="y", shape=[2, 3], dtype="float32", need_clip=False)
# (x, None) should not be returned
params_grads = [(x, None), (x, y)]
params_grads = clip(params_grads)
self.assertTrue(
len(clip(params_grads)) == 1,
"ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
"ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
)
self.assertTrue(
params_grads[0][1].name == 'y',
"ClipByValue: grad should not be clipped when filtered out!")
"ClipGradByValue: grad should not be clipped when filtered out!")
class TestDygraphGradientClip(unittest.TestCase):
@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of x (ParamBase)
def fileter_func(param):
return param.name == "x"
self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm, need_clip=fileter_func)
clip_norm=self.clip_norm)
self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm)
@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm(
clip_norm=self.clip_norm, need_clip=fileter_func)
self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self):
# only clip gradient of linear_0.w_0 (ParamBase)
def fileter_func(param):
return param.name == "linear_0.w_0"
self.max = 0.2
self.min = 0.1
self.clip = fluid.clip.GradientClipByValue(
max=self.max, min=self.min, need_clip=fileter_func)
self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
def check_clip_result(self, loss, optimizer):
# if grad is None

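A short sketch of the behavior the rewritten test_none_grad cases above exercise (static mode, fluid API as in the tests; the clip_norm value is arbitrary): pairs whose gradient is None are dropped, and parameters created with need_clip=False keep their gradient unclipped.

import paddle
import paddle.fluid as fluid

paddle.enable_static()

clip = fluid.clip.GradientClipByNorm(clip_norm=5.0)
block = fluid.default_main_program().global_block()
x = block.create_parameter(
    name="x", shape=[2, 3], dtype="float32", need_clip=False)
y = block.create_parameter(
    name="y", shape=[2, 3], dtype="float32", need_clip=False)

params_grads = clip([(x, None), (x, y)])
assert len(params_grads) == 1           # (x, None) is filtered out
assert params_grads[0][1].name == "y"   # y is passed through unclipped
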
@ -31,9 +31,9 @@ __all__ += rnn.__all__
__all__ += weight_norm_hook.__all__
# TODO: define alias in nn directory
from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS
from .clip import GradientClipByNorm #DEFINE_ALIAS
from .clip import GradientClipByValue #DEFINE_ALIAS
from .clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from .clip import ClipGradByNorm #DEFINE_ALIAS
from .clip import ClipGradByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS
from .clip import clip #DEFINE_ALIAS
from .clip import clip_by_norm #DEFINE_ALIAS
@ -51,13 +51,13 @@ from .decode import beam_search_decode #DEFINE_ALIAS
# from .decode import dynamic_decode #DEFINE_ALIAS
from .decode import gather_tree #DEFINE_ALIAS
# from .input import Input #DEFINE_ALIAS
from .layer.activation import ELU
from .layer.activation import GELU
from .layer.activation import Tanh
from .layer.activation import Hardshrink
from .layer.activation import Hardtanh
from .layer.activation import PReLU
from .layer.activation import ReLU
from .layer.activation import ELU #DEFINE_ALIAS
from .layer.activation import GELU #DEFINE_ALIAS
from .layer.activation import Tanh #DEFINE_ALIAS
from .layer.activation import Hardshrink #DEFINE_ALIAS
from .layer.activation import Hardtanh #DEFINE_ALIAS
from .layer.activation import PReLU #DEFINE_ALIAS
from .layer.activation import ReLU #DEFINE_ALIAS
from .layer.activation import ReLU6 #DEFINE_ALIAS
from .layer.activation import SELU #DEFINE_ALIAS
from .layer.activation import LeakyReLU #DEFINE_ALIAS

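The renamed classes re-exported above are importable directly from paddle.nn (a minimal check, assuming a wheel built from this branch):

import paddle.nn as nn

# formerly exposed as GradientClipByGlobalNorm / ByNorm / ByValue
print(nn.ClipGradByGlobalNorm, nn.ClipGradByNorm, nn.ClipGradByValue)
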
@ -13,18 +13,18 @@
# limitations under the License.
# TODO: define the functions to clip gradient of parameter
from ..fluid.clip import GradientClipByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByNorm #DEFINE_ALIAS
from ..fluid.clip import GradientClipByValue #DEFINE_ALIAS
from ..fluid.clip import ClipGradByGlobalNorm #DEFINE_ALIAS
from ..fluid.clip import ClipGradByNorm #DEFINE_ALIAS
from ..fluid.clip import ClipGradByValue #DEFINE_ALIAS
from ..fluid.layers import clip #DEFINE_ALIAS
from ..fluid.layers import clip_by_norm #DEFINE_ALIAS
__all__ = [
# 'ErrorClipByValue',
'GradientClipByGlobalNorm',
'GradientClipByNorm',
'GradientClipByValue',
'ClipGradByGlobalNorm',
'ClipGradByNorm',
'ClipGradByValue',
# 'set_gradient_clip',
'clip',
'clip_by_norm'
