@@ -24,8 +24,6 @@ __all__ = [
     'GradientClipByValue',
     'GradientClipByNorm',
     'GradientClipByGlobalNorm',
-    'append_gradient_clip_ops',
-    'error_clip_callback',
 ]
@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object):
 class ErrorClipByValue(BaseErrorClipAttr):
+    """
+    Clips tensor values to the range [min, max].
+
+    Given a tensor t, this operation clips its values to min and max in place.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. If not set by the user,
+            it will be set to -max by the framework.
+
+    Examples:
+        .. code-block:: python
+
+            var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+    """
+
     def __init__(self, max, min=None):
         max = float(max)
         if min is None:
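The clipping rule documented above is plain element-wise saturation, and ErrorClipByValue and GradientClipByValue share it. The following NumPy sketch only illustrates that semantics (including the -max default for min described in the docstring); it is not the framework's implementation::

    import numpy as np

    def clip_by_value(t, max, min=None):
        # min defaults to -max, mirroring the __init__ shown in the hunk above
        if min is None:
            min = -max
        return np.clip(t, min, max)

    t = np.array([-3.0, -0.5, 0.0, 2.0, 7.5])
    print(clip_by_value(t, max=5.0))            # [-3.  -0.5  0.   2.   5. ]
    print(clip_by_value(t, max=1.0, min=0.0))   # [0. 0. 0. 1. 1.]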
@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr):
 class GradientClipByValue(BaseGradientClipAttr):
+    """
+    Clips gradient values to the range [min, max].
+
+    Given a tensor t, this operation clips its values to min and max in place.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. If not set by the user,
+            it will be set to -max by the framework.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = ParamAttr(name=None,
+                                      initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+                                      learning_rate=1.0,
+                                      regularizer=L1Decay(1.0),
+                                      trainable=True,
+                                      clip=GradientClipByValue(min=-1.0, max=1.0))
+            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+    """
+
     def __init__(self, max, min=None):
         max = float(max)
         if min is None:
@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr):
 class GradientClipByNorm(BaseGradientClipAttr):
+    """
+    Clips gradient values to a maximum L2-norm.
+
+    This operator limits the L2 norm of the input :math:`X` within :math:`clip\_norm`.
+    If the L2 norm of :math:`X` is less than or equal to :math:`clip\_norm`, :math:`Out`
+    will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
+    :math:`clip\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
+    :math:`Out` equal to :math:`clip\_norm`, as shown in the following formula:
+
+    .. math::
+
+        Out = \\frac{clip\_norm * X}{norm(X)},
+
+    where :math:`norm(X)` represents the L2 norm of :math:`X`.
+
+    Args:
+        clip_norm (float): The maximum norm value.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = ParamAttr(name=None,
+                                      initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+                                      learning_rate=1.0,
+                                      regularizer=L1Decay(1.0),
+                                      trainable=True,
+                                      clip=GradientClipByNorm(clip_norm=2.0))
+            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+    """
+
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
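The formula above can be checked with a small NumPy sketch: the tensor is returned unchanged when its L2 norm is already within clip_norm, and rescaled so its norm equals clip_norm otherwise. This is only an illustration of the rule, not the operator's implementation::

    import numpy as np

    def clip_by_norm(x, clip_norm):
        norm = np.sqrt(np.sum(np.square(x)))   # L2 norm of the whole tensor
        if norm <= clip_norm:
            return x
        return clip_norm * x / norm            # Out = clip_norm * X / norm(X)

    x = np.array([3.0, 4.0])                    # norm(x) = 5.0
    print(clip_by_norm(x, clip_norm=2.0))       # [1.2 1.6] -> norm is now 2.0
    print(clip_by_norm(x, clip_norm=10.0))      # unchanged: [3. 4.]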
@@ -183,15 +256,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 def set_gradient_clip(clip, param_list=None, program=None):
     """
-    To specify parameters that require gradient clip.
-    Args:
-        clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
-                which describes the type and detailed attributes of required gradient clip.
-        param_list(list, None by default): Parameters that require gradient clip.
-                It can be a list of parameter or a list of parameter's name.
-                When it's None, all parameters in the program will be included.
-        program(Program, None by default): The program where parameters are.
-                Will be the default main program when assigned with None.
+    To specify parameters that require gradient clip.
+
+    Args:
+        clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
+                which describes the type and detailed attributes of the required gradient clip.
+        param_list(list(Variable)): Parameters that require gradient clip.
+                It can be a list of parameters or a list of parameter names.
+                When it is None, all parameters in the program will be included.
+        program(Program): The program where the parameters are located.
+                It will be the default main program when assigned None.
     """
     if not isinstance(clip, BaseGradientClipAttr):
         raise TypeError(
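A minimal end-to-end sketch of how set_gradient_clip is typically used, assuming the fluid 1.x layer and optimizer APIs of the same era (fluid.layers.data/fc/square_error_cost/mean and fluid.optimizer.SGD are not part of this patch). The clip is registered before minimize() so the clip attributes are already attached to the parameters when the gradient ops are appended::

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))

    # Register global-norm clipping for every parameter in the default main
    # program (param_list=None), then build the backward and optimize ops.
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)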