[WIP] update optimizer for 2.0 (#26288)
Refine Optimizer/Adam/Adamax/RMSProp and add AdamW.

* bug fix
* update comment
* unify arguments place; notest
* fix ut, test=develop
* bug fix
* fix conflicts, test=develop
* add example code
* bug fix
* fix comments
* fix sample code
* add sample code for Optimizer
* add Adamax ut, test=develop
* fix RMSProp ut, test=develop
* add ut for optimizer.py and adamw.py
* remove TestAdamOptimizerBetaVariable
* update api && add ut
* update doc && fix ut
* add ut

Co-authored-by: mapingshuo <mps2012@yeah.net>
parent e2b82e0439
commit eeda90d674
@@ -0,0 +1,67 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid


class TestAdamaxAPI(unittest.TestCase):
    def test_adamax_api_dygraph(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_variable(value)
        linear = paddle.nn.Linear(13, 5, dtype="float32")
        adam = paddle.optimizer.Adamax(
            learning_rate=0.01,
            parameters=linear.parameters(),
            weight_decay=0.01)
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()

    def test_adamax_api(self):
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = fluid.layers.conv2d(data, 8, 3)
                loss = paddle.mean(conv)
                beta1 = 0.85
                beta2 = 0.95
                opt = paddle.optimizer.Adamax(
                    learning_rate=1e-5,
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01,
                    epsilon=1e-8)
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        assert rets[0] is not None


if __name__ == "__main__":
    unittest.main()
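The dygraph test above runs a single optimizer step. In a real training loop the backward/step/clear_gradients cycle repeats per mini-batch; below is a minimal sketch of that cycle, assuming the same 2.0-beta dygraph API exercised by the test (the loop length and random data are purely illustrative):

    import numpy as np
    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(13, 5, dtype="float32")
    opt = paddle.optimizer.Adamax(
        learning_rate=0.01, parameters=linear.parameters(), weight_decay=0.01)

    for _ in range(3):  # three illustrative mini-batches
        x = paddle.to_variable(np.random.rand(2, 13).astype("float32"))
        loss = paddle.mean(linear(x))
        loss.backward()        # accumulate gradients
        opt.step()             # apply one Adamax update
        opt.clear_gradients()  # reset gradients before the next batch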
@@ -0,0 +1,81 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle
import numpy as np
import paddle.fluid as fluid


class TestAdamWOp(unittest.TestCase):
    def test_adamw_op_dygraph(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_variable(value)
        linear = paddle.nn.Linear(13, 5, dtype="float32")
        adam = paddle.optimizer.AdamW(
            learning_rate=0.01,
            parameters=linear.parameters(),
            apply_decay_param_fun=lambda name: True,
            weight_decay=0.01)
        out = linear(a)
        out.backward()
        adam.step()
        adam.clear_gradients()

    def test_adamw_op_coverage(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_variable(value)
        linear = paddle.nn.Linear(13, 5, dtype="float32")
        adam = paddle.optimizer.AdamW(
            learning_rate=0.0,
            parameters=linear.parameters(),
            apply_decay_param_fun=lambda name: True,
            weight_decay=0.01)
        assert adam.__str__() is not None

    def test_adamw_op(self):
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = fluid.layers.conv2d(data, 8, 3)
                loss = paddle.mean(conv)

                beta1 = fluid.layers.create_global_var(
                    shape=[1], value=0.85, dtype='float32', persistable=True)
                beta2 = fluid.layers.create_global_var(
                    shape=[1], value=0.95, dtype='float32', persistable=True)
                betas = [beta1, beta2]
                opt = paddle.optimizer.AdamW(
                    learning_rate=1e-5,
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01,
                    epsilon=1e-8)
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        assert rets[0] is not None


if __name__ == "__main__":
    unittest.main()
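The tests above exercise AdamW, whose distinguishing feature is decoupled weight decay: the decay is applied directly to the parameter instead of being folded into the gradient, and `apply_decay_param_fun` selects which parameters receive it. A rough NumPy sketch of that idea follows; it is one common formulation, not Paddle's exact kernel, and `adamw_step` with its defaults is a hypothetical helper for illustration only:

    import numpy as np

    def adamw_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999,
                   eps=1e-8, wd=0.01, decay_this_param=True):
        """One decoupled-weight-decay Adam step (illustrative only).

        t is the 1-based step count; decay_this_param plays the role
        of apply_decay_param_fun(name) in the tests above.
        """
        m = beta1 * m + (1 - beta1) * g          # 1st-moment estimate
        v = beta2 * v + (1 - beta2) * g * g      # 2nd-moment estimate
        lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)  # bias correction
        p = p - lr_t * m / (np.sqrt(v) + eps)    # plain Adam update
        if decay_this_param:
            p = p - lr * wd * p                  # decay decoupled from gradients
        return p, m, v

    # Tiny usage example with made-up values.
    p = np.ones(3, dtype=np.float32)
    m = v = np.zeros_like(p)
    g = np.full(3, 0.5, dtype=np.float32)
    p, m, v = adamw_step(p, g, m, v, t=1)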
(Two file diffs suppressed because they are too large.)
@@ -0,0 +1,246 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable

__all__ = ["Adam"]


class Adam(Optimizer):
    """
    The Adam optimizer uses the optimization described at the end
    of Section 2 of the `Adam paper <https://arxiv.org/abs/1412.6980>`_ ;
    it dynamically adjusts the learning rate of each parameter using
    the 1st-moment and 2nd-moment estimates of the gradient.

    The update rule for parameter ``param_out`` with gradient ``grad`` is:

    .. math::

        t & = t + 1

        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad

        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad

        learning\_rate & = learning\_rate * \\
                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}

    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

    Args:
        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a LearningRateDecay. The default value is 0.001.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
            It should be a float number or a Tensor with shape [1] and data type float32.
            The default value is 0.9.
        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
            It should be a float number or a Tensor with shape [1] and data type float32.
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
        parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value, used as the coefficient of L2 regularization, or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for the user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
            The accumulators are updated at every step. Every element of the two moving averages
            is updated in both dense mode and sparse mode. If the size of a parameter is very large,
            the update may be very slow. Lazy mode updates only the elements that have gradients
            in the current mini-batch, so it is much faster. However, this mode has different
            semantics from the original Adam algorithm and may lead to different results.
            The default value is False.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)
            adam = paddle.optimizer.Adam(learning_rate=0.1,
                    parameters=linear.parameters())
            out.backward()
            adam.step()
            adam.clear_grad()

        .. code-block:: python

            # Adam with beta1/beta2 as Tensor and weight_decay as float
            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)

            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.99], dtype="float32")

            adam = paddle.optimizer.Adam(learning_rate=0.1,
                    parameters=linear.parameters(),
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01)
            out.backward()
            adam.step()
            adam.clear_grad()

    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
    _beta1_pow_acc_str = "beta1_pow_acc"
    _beta2_pow_acc_str = "beta2_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False):
        assert learning_rate is not None
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
        super(Adam, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "adam"
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._lazy_mode = lazy_mode

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        # Create accumulator tensors for first and second moments
        for p in parameters:
            self._add_accumulator(self._moment1_acc_str, p)
            self._add_accumulator(self._moment2_acc_str, p)
            self._add_accumulator(
                name=self._beta1_pow_acc_str,
                param=p,
                fill_value=0.9 if isinstance(self._beta1, Variable) \
                    else self._beta1,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
            self._add_accumulator(
                name=self._beta2_pow_acc_str,
                param=p,
                fill_value=0.999 if isinstance(self._beta2, Variable) \
                    else self._beta2,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        moment1 = self._get_accumulator(self._moment1_acc_str,
                                        param_and_grad[0])
        moment2 = self._get_accumulator(self._moment2_acc_str,
                                        param_and_grad[0])
        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                              param_and_grad[0])
        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                              param_and_grad[0])
        lr = self._create_param_lr(param_and_grad)
        # create the adam optimize op

        if framework.in_dygraph_mode():
            _beta1 = self._beta1 if not isinstance(
                self._beta1, Variable) else self._beta1.numpy().item(0)
            _beta2 = self._beta2 if not isinstance(
                self._beta2, Variable) else self._beta2.numpy().item(0)
            _, _, _, _, _ = core.ops.adam(
                param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
                beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
                moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
                'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
                1000, 'beta1', _beta1, 'beta2', _beta2)

            return None

        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "LearningRate": [lr],
            "Moment1": [moment1],
            "Moment2": [moment2],
            "Beta1Pow": [beta1_pow_acc],
            "Beta2Pow": [beta2_pow_acc]
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "Moment1Out": [moment1],
            "Moment2Out": [moment2],
            "Beta1PowOut": [beta1_pow_acc],
            "Beta2PowOut": [beta2_pow_acc],
        }
        attrs = {
            "epsilon": self._epsilon,
            "lazy_mode": self._lazy_mode,
            "min_row_size_to_use_multithread": 1000
        }

        if isinstance(self._beta1, Variable):
            inputs['Beta1Tensor'] = self._beta1
        else:
            attrs['beta1'] = self._beta1
        if isinstance(self._beta2, Variable):
            inputs['Beta2Tensor'] = self._beta2
        else:
            attrs['beta2'] = self._beta2

        adam_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return adam_op
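For reference, the dense update rule in the class docstring above transcribes directly to NumPy. The sketch below ignores lazy/sparse mode and Tensor-valued betas; `adam_update` is a hypothetical helper for illustration, not the kernel the op dispatches to:

    import numpy as np

    def adam_update(param, grad, m1, m2, t, lr=0.001,
                    beta1=0.9, beta2=0.999, eps=1e-8):
        """One dense Adam step following the docstring's formulas (t starts at 1)."""
        m1 = beta1 * m1 + (1 - beta1) * grad            # moment_1_out
        m2 = beta2 * m2 + (1 - beta2) * grad * grad     # moment_2_out
        lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)  # bias-corrected rate
        param = param - lr_t * m1 / (np.sqrt(m2) + eps)
        return param, m1, m2

    # Tiny usage example with made-up values.
    p = np.ones(4, dtype=np.float32)
    m1 = m2 = np.zeros_like(p)
    g = np.full(4, 0.1, dtype=np.float32)
    p, m1, m2 = adam_update(p, g, m1, m2, t=1)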
@@ -0,0 +1,192 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope

__all__ = ["Adamax"]


class Adamax(Optimizer):
    """
    The Adamax optimizer is implemented based on the Adamax optimization
    in Section 7 of the `Adam paper <https://arxiv.org/abs/1412.6980>`_.
    Adamax is a variant of the Adam algorithm based on the infinity norm,
    which makes the learning rate update rule more stable and simpler.

    The update rule for parameter ``param_out`` with gradient ``grad`` is:

    .. math::

        t & = t + 1

        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad

        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)

        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}

        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}

    Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_

    The original paper does not include an ``epsilon`` term; it is added here
    for numerical stability to prevent division-by-zero errors.

    Args:
        learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a LearningRateDecay. The default value is 0.001.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            The default value is 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
        parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value, used as the coefficient of L2 regularization, or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for the user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    **Notes**:
        **Currently, Adamax doesn't support sparse parameter optimization.**

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)

            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.99], dtype="float32")

            adam = paddle.optimizer.Adamax(learning_rate=0.1,
                    parameters=linear.parameters(),
                    beta1=beta1,
                    beta2=beta2,
                    weight_decay=0.01)
            out.backward()
            adam.step()
            adam.clear_grad()

    """
    _moment_acc_str = "moment"
    _inf_norm_acc_str = "inf_norm"
    _beta1_pow_acc_str = "beta1_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None):
        assert learning_rate is not None
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
        super(Adamax, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "adamax"
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon

    def _create_accumulators(self, block, parameters):
        # Create accumulator tensors for first moment and infinity norm
        for p in parameters:
            self._add_accumulator(self._moment_acc_str, p)
            self._add_accumulator(self._inf_norm_acc_str, p)
            self._add_accumulator(
                name=self._beta1_pow_acc_str,
                param=p,
                fill_value=self._beta1,
                shape=[1])

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                         param_and_grad[0])
        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                              param_and_grad[0])
        # create the adamax optimize op
        adamax_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment": moment,
                "InfNorm": inf_norm,
                "Beta1Pow": beta1_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "MomentOut": moment,
                "InfNormOut": inf_norm
            },
            attrs={
                "beta1": self._beta1,
                "beta2": self._beta2,
                "epsilon": self._epsilon
            },
            stop_gradient=True)

        return adamax_op

    def _finish_update(self, block, parameters_and_grads):
        """Update the Beta1 power accumulator.
        """
        assert isinstance(block, framework.Block)
        for param, grad in parameters_and_grads:
            if grad is None or param.trainable is False:
                continue
            with param.block.program._optimized_guard(
                    [param, grad]), name_scope('adamax'):
                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                      param)
                block.append_op(
                    type="scale",
                    inputs={"X": beta1_pow_acc},
                    outputs={"Out": beta1_pow_acc},
                    attrs={"scale": self._beta1},
                    stop_gradient=True)
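Again for reference, the Adamax rule in the docstring above transcribes directly to NumPy, including the `_finish_update` step that advances the beta1 power accumulator each iteration. The sketch below is illustrative only; `adamax_update` is a hypothetical helper, not the op kernel:

    import numpy as np

    def adamax_update(param, grad, moment, inf_norm, t, lr=0.001,
                      beta1=0.9, beta2=0.999, eps=1e-8):
        """One dense Adamax step following the docstring's formulas (t starts at 1)."""
        moment = beta1 * moment + (1 - beta1) * grad          # moment_out
        inf_norm = np.maximum(beta2 * inf_norm + eps,         # inf_norm_out
                              np.abs(grad))
        lr_t = lr / (1 - beta1**t)   # beta1**t is what beta1_pow_acc tracks
        param = param - lr_t * moment / inf_norm
        return param, moment, inf_norm

    # Tiny usage example with made-up values.
    p = np.ones(4, dtype=np.float32)
    m = n = np.zeros_like(p)
    g = np.full(4, 0.1, dtype=np.float32)
    p, m, n = adamax_update(p, g, m, n, t=1)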
(Some files were not shown because too many files have changed in this diff.)