change clip grad api, test=develop (#27767)

my_2.0rc
Qi Li, committed by GitHub
parent 365c2c9c89
commit 994438b109

(File diff suppressed because it is too large.)

@@ -5123,6 +5123,8 @@ class Parameter(Variable):
             be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
+        need_clip (bool): Whether the parameter gradient needs to be clipped
+            in the optimizer. Default is True.
     """

     def __init__(self,
@@ -5162,6 +5164,8 @@ class Parameter(Variable):
         self.do_model_average = kwargs.get('do_model_average', None)

+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False

     def __str__(self):
@@ -5194,7 +5198,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "do_model_average")
+                               "do_model_average", "need_clip")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          cpt.to_text(getattr(self, attr_name)))
@@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase):
             be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
+        need_clip (bool): Whether the parameter gradient needs to be clipped
+            in the optimizer. Default is True.
     """

     @dygraph_only
@@ -5265,6 +5271,8 @@ class ParamBase(core.VarBase):
         self.do_model_average = kwargs.get('do_model_average', None)

+        self.need_clip = kwargs.get('need_clip', True)
+
         self.is_distributed = False
         # self.block = default_main_program().global_block()
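The two framework hunks above attach a need_clip attribute (default True) to every Parameter and ParamBase. The clip.py diff that actually consumes it is suppressed above, so the following is only a minimal sketch of the intended behaviour, under the assumption that the clipping pass walks the (param, grad) list, drops pairs without a gradient, and leaves parameters flagged need_clip=False untouched; the helper name _split_by_need_clip is invented for illustration.

# Hypothetical sketch, not the suppressed clip.py change: how a clipping
# pass could consult the new Parameter.need_clip attribute added above.
def _split_by_need_clip(params_grads):
    """Split (param, grad) pairs into clip candidates and pass-throughs."""
    to_clip, passthrough = [], []
    for param, grad in params_grads:
        if grad is None:
            # pairs without a gradient are dropped entirely (see the tests below)
            continue
        if getattr(param, 'need_clip', True):
            to_clip.append((param, grad))
        else:
            passthrough.append((param, grad))  # returned, but never rescaled
    return to_clip, passthrough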

@@ -36,8 +36,8 @@ class ParamAttr(object):
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .

     Parameters:
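With the scope now expressed per parameter, the recommended pattern is to pick one clipping strategy on the optimizer via grad_clip and opt individual parameters out through ParamAttr(need_clip=False). A minimal sketch using the 2.0 names introduced later in this diff (paddle.nn.ClipGradByGlobalNorm, paddle.optimizer.SGD); the layer sizes and learning rate are arbitrary:

import paddle

# Opt the bias out of clipping through the new per-parameter switch.
weight_attr = paddle.ParamAttr(need_clip=True)
bias_attr = paddle.ParamAttr(need_clip=False)
linear = paddle.nn.Linear(10, 10, weight_attr=weight_attr, bias_attr=bias_attr)

# The clipping strategy itself is chosen once, on the optimizer.
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)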
@@ -57,6 +57,7 @@ class ParamAttr(object):
         trainable (bool): Whether this parameter is trainable. Default True.
         do_model_average (bool): Whether this parameter should do model average
             when model average is enabled. Default False.
+        need_clip (bool): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.

     Examples:
         .. code-block:: python
@@ -78,7 +79,8 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=True):
+                 do_model_average=True,
+                 need_clip=True):
         if sys.version_info.major == 2:
             check_type(name, "name", (str, type(None), unicode), "ParamAttr")
@@ -87,6 +89,7 @@ class ParamAttr(object):
         check_type(learning_rate, "learning_rate", (float, int), "ParamAttr")
         check_type(trainable, "trainable", (bool), "ParamAttr")
         check_type(do_model_average, "do_model_average", (bool), "ParamAttr")
+        check_type(need_clip, "need_clip", (bool), "ParamAttr")
         check_type(initializer, "initializer", (Initializer, type(None)),
                    "ParamAttr")
         check_type(regularizer, "regularizer",
@@ -101,6 +104,7 @@ class ParamAttr(object):
         self.regularizer = regularizer
         self.trainable = trainable
         self.do_model_average = do_model_average
+        self.need_clip = need_clip

     def _set_default_initializer(self, initializer):
         """
@@ -197,7 +201,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'do_model_average': self.do_model_average
+            'do_model_average': self.do_model_average,
+            'need_clip': self.need_clip
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr):
        <https://arxiv.org/pdf/1602.07868.pdf>`_.

     Note:
-        ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0.
-        It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient.
-        There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` ,
+        ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
+        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
         :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
@@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr):
         trainable(bool, optional): Whether this parameter is trainable. Default True.
         do_model_average(bool, optional): Whether this parameter should do model average.
             Default False.
+        need_clip (bool, optional): Whether the parameter gradient needs to be clipped in the optimizer. Default is True.

     Examples:
         .. code-block:: python
@@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr):
                     learning_rate=1.0,
                     regularizer=paddle.regularizer.L2Decay(0.1),
                     trainable=True,
-                    do_model_average=False))
+                    do_model_average=False,
+                    need_clip=True))
     """

     # List to record the parameters reparameterized by weight normalization.
@@ -283,12 +290,14 @@ class WeightNormParamAttr(ParamAttr):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 do_model_average=False):
+                 do_model_average=False,
+                 need_clip=True):
         super(WeightNormParamAttr, self).__init__(
             name=name,
             initializer=initializer,
             learning_rate=learning_rate,
             regularizer=regularizer,
             trainable=trainable,
-            do_model_average=do_model_average)
+            do_model_average=do_model_average,
+            need_clip=need_clip)
         self.dim = dim

@@ -185,12 +185,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            # no clip gradient
-            def fileter_func(param):
-                return param.name == "fc.w_0"
-
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=5.0, need_clip=fileter_func)
+            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
             fluid.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01,
                                                 grad_clip=clip)
@@ -205,11 +200,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "x"
-
-        clip = fluid.clip.GradientClipByGlobalNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
             name="x", shape=[2, 3], dtype="float32")
         y = fluid.default_main_program().global_block().create_parameter(
@@ -228,11 +219,6 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # raise typeError
     def test_tpyeError(self):
-        # the type of need_clip must be an funciton
-        with self.assertRaises(TypeError):
-            clip = fluid.clip.GradientClipByGlobalNorm(
-                clip_norm=self.clip_norm, need_clip="test")
-
         # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class
         with self.assertRaises(TypeError):
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1,
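With the need_clip-must-be-a-function check removed, the only remaining type check in this test is on the optimizer's grad_clip argument. A small sketch of that contract, reusing the fluid.optimizer.SGD call shown in the surrounding context (the string argument is deliberately invalid):

import paddle.fluid as fluid

# grad_clip must be an instance of a GradientClipBase subclass;
# anything else, e.g. a plain string, is rejected when the optimizer is built.
try:
    sgd = fluid.optimizer.SGD(learning_rate=0.1, grad_clip="not_a_clip")
except TypeError as err:
    print("rejected as expected:", err)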
@@ -264,26 +250,22 @@ class TestGradientClipByNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByNorm(
-            self.clip_norm, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)

         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByNorm: grad should not be clipped when filtered out!")
+            "ClipGradByNorm: grad should not be clipped when filtered out!")


 class TestGradientClipByValue(TestGradientClip):
@@ -312,26 +294,22 @@ class TestGradientClipByValue(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        def fileter_func(param):
-            return param.name == "z"
-
-        clip = fluid.clip.GradientClipByValue(
-            self.max, self.min, need_clip=fileter_func)
+        clip = fluid.clip.GradientClipByValue(self.max, self.min)
         x = fluid.default_main_program().global_block().create_parameter(
-            name="x", shape=[2, 3], dtype="float32")
+            name="x", shape=[2, 3], dtype="float32", need_clip=False)
         y = fluid.default_main_program().global_block().create_parameter(
-            name="y", shape=[2, 3], dtype="float32")
+            name="y", shape=[2, 3], dtype="float32", need_clip=False)

         # (x, None) should not be returned
         params_grads = [(x, None), (x, y)]
         params_grads = clip(params_grads)
         self.assertTrue(
             len(clip(params_grads)) == 1,
-            "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!"
+            "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!"
         )
         self.assertTrue(
             params_grads[0][1].name == 'y',
-            "ClipByValue: grad should not be clipped when filtered out!")
+            "ClipGradByValue: grad should not be clipped when filtered out!")


 class TestDygraphGradientClip(unittest.TestCase):
@@ -355,13 +333,9 @@ class TestDygraphGradientClip(unittest.TestCase):
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of x (ParamBase)
-        def fileter_func(param):
-            return param.name == "x"
-
         self.clip_norm = 0.8
         self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+            clip_norm=self.clip_norm)
         self.clip2 = fluid.clip.GradientClipByGlobalNorm(
             clip_norm=self.clip_norm)
@@ -401,13 +375,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(
-            clip_norm=self.clip_norm, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
@@ -435,14 +404,9 @@ class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
-        # only clip gradient of linear_0.w_0 (ParamBase)
-        def fileter_func(param):
-            return param.name == "linear_0.w_0"
-
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(
-            max=self.max, min=self.min, need_clip=fileter_func)
+        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
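The dygraph tests above now configure clipping solely through the optimizer's grad_clip argument. A hedged end-to-end sketch of that flow with the 2.0 API (shapes and hyperparameters are arbitrary):

import paddle

linear = paddle.nn.Linear(10, 1)
clip = paddle.nn.ClipGradByNorm(clip_norm=0.8)
opt = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)

x = paddle.rand([4, 10])
loss = linear(x).mean()
loss.backward()
opt.step()        # clipping is applied during the update; need_clip=False
                  # parameters should pass through unclipped
opt.clear_grad()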

@@ -31,9 +31,9 @@ __all__ += rnn.__all__
 __all__ += weight_norm_hook.__all__

 # TODO: define alias in nn directory
-from .clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from .clip import GradientClipByNorm  #DEFINE_ALIAS
-from .clip import GradientClipByValue  #DEFINE_ALIAS
+from .clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from .clip import ClipGradByNorm  #DEFINE_ALIAS
+from .clip import ClipGradByValue  #DEFINE_ALIAS
 # from .clip import set_gradient_clip  #DEFINE_ALIAS
 from .clip import clip  #DEFINE_ALIAS
 from .clip import clip_by_norm  #DEFINE_ALIAS
@@ -51,13 +51,13 @@ from .decode import beam_search_decode  #DEFINE_ALIAS
 # from .decode import dynamic_decode  #DEFINE_ALIAS
 from .decode import gather_tree  #DEFINE_ALIAS
 # from .input import Input  #DEFINE_ALIAS
-from .layer.activation import ELU
-from .layer.activation import GELU
-from .layer.activation import Tanh
-from .layer.activation import Hardshrink
-from .layer.activation import Hardtanh
-from .layer.activation import PReLU
-from .layer.activation import ReLU
+from .layer.activation import ELU  #DEFINE_ALIAS
+from .layer.activation import GELU  #DEFINE_ALIAS
+from .layer.activation import Tanh  #DEFINE_ALIAS
+from .layer.activation import Hardshrink  #DEFINE_ALIAS
+from .layer.activation import Hardtanh  #DEFINE_ALIAS
+from .layer.activation import PReLU  #DEFINE_ALIAS
+from .layer.activation import ReLU  #DEFINE_ALIAS
 from .layer.activation import ReLU6  #DEFINE_ALIAS
 from .layer.activation import SELU  #DEFINE_ALIAS
 from .layer.activation import LeakyReLU  #DEFINE_ALIAS

@@ -13,18 +13,18 @@
 # limitations under the License.

 # TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import GradientClipByGlobalNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByNorm  #DEFINE_ALIAS
-from ..fluid.clip import GradientClipByValue  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByGlobalNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByNorm  #DEFINE_ALIAS
+from ..fluid.clip import ClipGradByValue  #DEFINE_ALIAS
 from ..fluid.layers import clip  #DEFINE_ALIAS
 from ..fluid.layers import clip_by_norm  #DEFINE_ALIAS

 __all__ = [
     # 'ErrorClipByValue',
-    'GradientClipByGlobalNorm',
-    'GradientClipByNorm',
-    'GradientClipByValue',
+    'ClipGradByGlobalNorm',
+    'ClipGradByNorm',
+    'ClipGradByValue',
     # 'set_gradient_clip',
     'clip',
     'clip_by_norm'
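After the rename, the 2.0 entry points exported here are paddle.nn.ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue, while the fluid.clip.GradientClipBy* classes exercised by the tests above remain in place. A quick check of the new import path (thresholds are arbitrary):

import paddle

clip_global = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
clip_norm = paddle.nn.ClipGradByNorm(clip_norm=1.0)
clip_value = paddle.nn.ClipGradByValue(max=1.0, min=-1.0)
print(type(clip_global).__name__, type(clip_norm).__name__, type(clip_value).__name__)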
