@@ -29,7 +29,7 @@ class Adam(Optimizer):
     of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
     it can dynamically adjusts the learning rate of each parameter using
     the 1st moment estimates and the 2nd moment estimates of the gradient.
 
     The parameter ``param_out`` update rule with gradient ``grad``:
 
     .. math::
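The body of the ``.. math::`` directive is not visible in this hunk. For reference, the rule it describes is the standard Adam update from the cited paper, written here in the docstring's own symbols (``moment_1``/``moment_2`` are the first- and second-moment accumulators, ``t`` the step count); this is an editorial reconstruction, not part of the patch:

.. math::

    t = t + 1

    moment\_1 = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad

    moment\_2 = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad

    learning\_rate_t = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}

    param\_out = param - learning\_rate_t * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}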
@@ -68,13 +68,10 @@ class Adam(Optimizer):
             the regularization setting here in optimizer will be ignored for this parameter. \
             Otherwise, the regularization setting here in optimizer will take effect. \
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
             ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
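Because ``weight_decay`` and ``grad_clip`` are only described in prose in this hunk, a minimal usage sketch may help. It is an illustration, not part of the patch, and it assumes the Paddle 2.x spelling ``paddle.nn.ClipGradByGlobalNorm`` for the global-norm clipping strategy referenced above:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # Clip the global gradient norm to 1.0 and apply a float L2-style
    # weight-decay coefficient of 0.01.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    adam = paddle.optimizer.Adam(learning_rate=0.1,
                                 parameters=linear.parameters(),
                                 weight_decay=0.01,
                                 grad_clip=clip)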
@@ -82,17 +79,17 @@ class Adam(Optimizer):
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
             adam = paddle.optimizer.Adam(learning_rate=0.1,
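Since ``lazy_mode`` is only explained in prose above, a short sketch of enabling it follows; this is an illustration, not part of the patch:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # With lazy_mode=True only the accumulator elements whose gradient
    # appears in the current mini-batch are updated, which can be much
    # faster for very large (e.g. sparse embedding) parameters.
    adam = paddle.optimizer.Adam(learning_rate=0.1,
                                 parameters=linear.parameters(),
                                 lazy_mode=True)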
@@ -105,12 +102,9 @@ class Adam(Optimizer):
 
             # Adam with beta1/beta2 as Tensor and weight_decay as float
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
 
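The construction announced by the ``# Adam with beta1/beta2 as Tensor and weight_decay as float`` comment falls outside the lines shown in this hunk. A minimal sketch of what it would look like, assuming the documented float|Tensor overload of ``beta1``/``beta2``, is given below as an illustration only:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    inp = paddle.rand([10, 10], dtype="float32")
    loss = paddle.mean(linear(inp))
    # beta1/beta2 may be passed as 1-element Tensors instead of floats.
    beta1 = paddle.to_tensor([0.9], dtype="float32")
    beta2 = paddle.to_tensor([0.99], dtype="float32")
    adam = paddle.optimizer.Adam(learning_rate=0.1,
                                 parameters=linear.parameters(),
                                 beta1=beta1,
                                 beta2=beta2,
                                 weight_decay=0.01)
    loss.backward()
    adam.step()
    adam.clear_grad()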
@@ -140,8 +134,8 @@ class Adam(Optimizer):
                  parameters=None,
                  weight_decay=None,
                  grad_clip=None,
-                 name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
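Note that this hunk swaps the positions of ``name`` and ``lazy_mode`` in the signature, so any caller that passed them positionally would silently bind the wrong values. A small illustration (not part of the patch) of the robust keyword style:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # Keyword arguments keep call sites correct regardless of where
    # lazy_mode and name sit in the signature.
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 parameters=linear.parameters(),
                                 lazy_mode=False,
                                 name="adam_opt")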
@@ -258,7 +252,7 @@ class Adam(Optimizer):
     def step(self):
         """
         Execute the optimizer and update parameters once.
 
         Returns:
             None
 
@@ -266,13 +260,11 @@ class Adam(Optimizer):
             .. code-block:: python
 
                 import paddle
-                import numpy as np
-                paddle.disable_static()
-                value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.to_tensor(value)
+
+                a = paddle.rand([2,13], dtype="float32")
                 linear = paddle.nn.Linear(13, 5)
                 # This can be any optimizer supported by dygraph.
                 adam = paddle.optimizer.Adam(learning_rate = 0.01,
                                              parameters = linear.parameters())
                 out = linear(a)
                 out.backward()
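The hunk ends at ``out.backward()``, so the call that ``step()`` itself documents is not visible here. A self-contained continuation of the example, shown as an illustration of the intended usage rather than as part of the patch:

.. code-block:: python

    import paddle

    a = paddle.rand([2, 13], dtype="float32")
    linear = paddle.nn.Linear(13, 5)
    adam = paddle.optimizer.Adam(learning_rate=0.01,
                                 parameters=linear.parameters())
    out = linear(a)
    out.backward()
    # Apply one parameter update with the accumulated gradients,
    # then clear them before the next iteration.
    adam.step()
    adam.clear_grad()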