# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable

__all__ = ["Adam"]


class Adam(Optimizer):
    """
    The Adam optimizer uses the optimization technique described at the end
    of section 2 of the `Adam paper <https://arxiv.org/abs/1412.6980>`_ .
    It dynamically adjusts the learning rate of each parameter using
    the 1st moment estimates and the 2nd moment estimates of the gradient.

    The parameter ``param_out`` update rule with gradient ``grad``:

    .. math::

        t & = t + 1

        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad

        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad

        learning\_rate & = learning\_rate * \\
                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
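
    For example, with the default ``beta1=0.9`` and ``beta2=0.999``, the learning-rate
    scaling factor above is sqrt(1 - 0.999) / (1 - 0.9), roughly 0.316, at the first
    step (t = 1), and it approaches 1 as t grows.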

    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

    Args:
        learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a _LRScheduler. The default value is 0.001.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
            It should be a float number or a Tensor with shape [1] and data type as float32.
            The default value is 0.9.
        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
            It should be a float number or a Tensor with shape [1] and data type as float32.
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value used as the coefficient of L2 regularization, or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for the user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
            The accumulators are updated at every step, and every element of the two moving
            averages is updated in both dense mode and sparse mode. If a parameter is very large,
            the update may be very slow. Lazy mode only updates the elements that have a
            gradient in the current mini-batch, so it can be much faster. However, this mode has
            different semantics from the original Adam algorithm and may lead to different
            results (a minimal sketch is shown in the last example below).
            The default value is False.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)
            adam = paddle.optimizer.Adam(learning_rate=0.1,
                                         parameters=linear.parameters())
            loss.backward()
            adam.step()
            adam.clear_grad()

        .. code-block:: python

            # Adam with beta1/beta2 as Tensor and weight_decay as float
            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)

            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.99], dtype="float32")

            adam = paddle.optimizer.Adam(learning_rate=0.1,
                                         parameters=linear.parameters(),
                                         beta1=beta1,
                                         beta2=beta2,
                                         weight_decay=0.01)
            loss.backward()
            adam.step()
            adam.clear_grad()
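
        A minimal sketch of enabling ``lazy_mode`` at construction time; lazy mode only
        pays off when gradients are sparse (for example, embedding lookups that touch a
        few rows per mini-batch), the dense layer here is used only for brevity.

        .. code-block:: python

            import paddle

            paddle.disable_static()
            linear = paddle.nn.Linear(10, 10)
            # With lazy_mode=True, only the elements that received a gradient
            # in the current mini-batch are updated; a dense layer like this
            # one sees no speed-up.
            adam = paddle.optimizer.Adam(learning_rate=0.1,
                                         parameters=linear.parameters(),
                                         lazy_mode=True)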

    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
    _beta1_pow_acc_str = "beta1_pow_acc"
    _beta2_pow_acc_str = "beta2_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False):
        assert learning_rate is not None
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
        if not 0 <= beta1 < 1:
            raise ValueError("Invalid value of beta1, expect beta1 in [0,1).")
        if not 0 <= beta2 < 1:
            raise ValueError("Invalid value of beta2, expect beta2 in [0,1).")
        if not 0 <= epsilon:
            raise ValueError("Invalid value of epsilon, expect epsilon >= 0.")
        super(Adam, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "adam"
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._lazy_mode = lazy_mode

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        # Create accumulator tensors for first and second moments
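        # Besides the two moment tensors, each parameter also gets scalar
        # beta1_pow/beta2_pow accumulators that track beta1^t and beta2^t;
        # the adam op reads them for bias correction and writes back the next
        # power. They are kept on the CPU as shape-[1] LoD tensors.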
        for p in parameters:
            self._add_accumulator(self._moment1_acc_str, p)
            self._add_accumulator(self._moment2_acc_str, p)
            self._add_accumulator(
                name=self._beta1_pow_acc_str,
                param=p,
                fill_value=0.9 if isinstance(self._beta1, Variable) \
                    else self._beta1,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
            self._add_accumulator(
                name=self._beta2_pow_acc_str,
                param=p,
                fill_value=0.999 if isinstance(self._beta2, Variable) \
                    else self._beta2,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        moment1 = self._get_accumulator(self._moment1_acc_str,
                                        param_and_grad[0])
        moment2 = self._get_accumulator(self._moment2_acc_str,
                                        param_and_grad[0])
        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                              param_and_grad[0])
        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                              param_and_grad[0])
        lr = self._create_param_lr(param_and_grad)
        # create the adam optimize op
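        # Two paths follow: in dygraph mode the C++ adam kernel is invoked
        # eagerly through core.ops.adam and the accumulators are updated in
        # place, while in static-graph mode an equivalent adam operator is
        # appended to the block and executed later.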

        if framework.in_dygraph_mode():
            _beta1 = self._beta1 if not isinstance(
                self._beta1, Variable) else self._beta1.numpy().item(0)
            _beta2 = self._beta2 if not isinstance(
                self._beta2, Variable) else self._beta2.numpy().item(0)
            _, _, _, _, _ = core.ops.adam(
                param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
                beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
                moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
                'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
                1000, 'beta1', _beta1, 'beta2', _beta2)

            return None
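
        # Static-graph path: describe the same update as an adam operator with
        # explicit input/output variables; beta1/beta2 are passed either as
        # tensors or as attributes, depending on how they were given.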
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "LearningRate": [lr],
            "Moment1": [moment1],
            "Moment2": [moment2],
            "Beta1Pow": [beta1_pow_acc],
            "Beta2Pow": [beta2_pow_acc]
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "Moment1Out": [moment1],
            "Moment2Out": [moment2],
            "Beta1PowOut": [beta1_pow_acc],
            "Beta2PowOut": [beta2_pow_acc],
        }
        attrs = {
            "epsilon": self._epsilon,
            "lazy_mode": self._lazy_mode,
            "min_row_size_to_use_multithread": 1000
        }

        if isinstance(self._beta1, Variable):
            inputs['Beta1Tensor'] = self._beta1
        else:
            attrs['beta1'] = self._beta1
        if isinstance(self._beta2, Variable):
            inputs['Beta2Tensor'] = self._beta2
        else:
            attrs['beta2'] = self._beta2

        adam_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return adam_op

    @framework.dygraph_only
    def step(self):
        """
        Execute the optimizer and update parameters once.

        Returns:
            None

        Examples:
            .. code-block:: python

                import paddle
                import numpy as np

                paddle.disable_static()
                value = np.arange(26).reshape(2, 13).astype("float32")
                a = paddle.to_tensor(value)
                linear = paddle.nn.Linear(13, 5)
                # This can be any optimizer supported by dygraph.
                adam = paddle.optimizer.Adam(learning_rate=0.01,
                                             parameters=linear.parameters())
                out = linear(a)
                out.backward()
                adam.step()
                adam.clear_grad()
        """
        parameter_list = self._parameter_list
        self._dtype = None
        params_grads = []
        for param in self._parameter_list:
            if not param.trainable:
                continue
            if param._grad_ivar() is not None:
                grad_var = param._grad_ivar()
                if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse(
                ) and self.regularization is not None:
                    raise RuntimeError(
                        "Adam doesn't support weight_decay with sparse parameters, please set it to None."
                    )
                params_grads.append((param, grad_var))
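
        # Hand the collected (parameter, gradient) pairs to the shared
        # optimizer machinery, which ends up calling _append_optimize_op for
        # each of them and performs the actual update.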
        optimize_ops = self._apply_optimize(
            loss=None, startup_program=None, params_grads=params_grads)