@@ -43,9 +43,10 @@ __all__ = [
     'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer',
     'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer',
     'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta',
-    'ModelAverage', 'LarsMomentum', 'LarsMomentumOptimizer',
-    'DGCMomentumOptimizer', 'LambOptimizer', 'ExponentialMovingAverage',
-    'PipelineOptimizer', 'LookaheadOptimizer', 'RecomputeOptimizer'
+    'AdadeltaOptimizer', 'ModelAverage', 'LarsMomentum',
+    'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer',
+    'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer',
+    'RecomputeOptimizer'
 ]
@@ -1778,39 +1779,42 @@ class DecayedAdagradOptimizer(Optimizer):
 class AdadeltaOptimizer(Optimizer):
     """
-    **Adadelta Optimizer**
+    **NOTES: This API does not support sparse parameter optimization.**

-    Simple Adadelta optimizer with average squared grad state and
-    average squared update state.
-    The details of adadelta please refer to this
+    Adadelta Optimizer. Please refer to this for details:
     `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
+    <https://arxiv.org/abs/1212.5701>`_.

     .. math::

-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
-                          E(g_t^2) + \\epsilon ) ) \\\\
-        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2\\
+        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }\\
+        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2

     Args:
-        learning_rate(float): global learning rate
-        rho(float): rho in equation
-        epsilon(float): epsilon in equation
-        regularization: A Regularizer, such as
-                        fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
+        learning_rate(float|Variable): global learning rate.
+        epsilon(float): a small float number for numeric stability. Default 1.0e-6.
+        rho(float): a floating point value indicating the decay rate.
+        regularization(WeightDecayRegularizer, optional): A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer. Default None, meaning that there
+            is no regularization.
+        name(str, optional): An optional name prefix for debugging. Default None.

     Examples:
         .. code-block:: python

             import paddle.fluid as fluid

+            image = fluid.layers.data(name='image', shape=[28], dtype='float32')
+            fc = fluid.layers.fc(image, size=10)
+            cost = fluid.layers.reduce_mean(fc)
             optimizer = fluid.optimizer.Adadelta(
                 learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
-            _, params_grads = optimizer.minimize(cost)

-    Notes:
-        Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
+            # optimizer_ops is a list of optimizer operators to update parameters
+            # params_grads is a list of (param, param_grad), where param is each
+            # parameter and param_grad is the gradient variable of param.
+            optimizer_ops, params_grads = optimizer.minimize(cost)
     """

     _avg_squared_grad_acc_str = "_avg_squared_grad"
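Not part of the patch: for anyone who wants to sanity-check the three equations in the revised docstring, below is a minimal NumPy sketch of the update they describe (accumulate the averaged squared gradient, derive the per-element step size from the ratio of the two running averages, then accumulate the averaged squared update). The helper name `adadelta_update`, the toy quadratic objective, and the `rho`/`epsilon` values are illustrative assumptions; the sketch mirrors the docstring math only, not the underlying C++ adadelta op.

```python
import numpy as np

def adadelta_update(param, grad, avg_sq_grad, avg_sq_update,
                    rho=0.95, epsilon=1.0e-6):
    """One Adadelta step following the docstring equations (illustrative sketch)."""
    # E(g_t^2) = rho * E(g_{t-1}^2) + (1 - rho) * g^2
    avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad * grad
    # learning_rate = sqrt((E(dx_{t-1}^2) + epsilon) / (E(g_t^2) + epsilon))
    learning_rate = np.sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon))
    update = -learning_rate * grad
    # E(dx_t^2) = rho * E(dx_{t-1}^2) + (1 - rho) * (-g * learning_rate)^2
    avg_sq_update = rho * avg_sq_update + (1 - rho) * update * update
    return param + update, avg_sq_grad, avg_sq_update

# Toy usage: minimize f(x) = 0.5 * ||x||^2, whose gradient is x itself.
x = np.array([1.0, -2.0, 3.0])
sq_grad = np.zeros_like(x)
sq_update = np.zeros_like(x)
for _ in range(100):
    x, sq_grad, sq_update = adadelta_update(x, x, sq_grad, sq_update)
```

The two running averages play the role of the averaged-squared-grad and averaged-squared-update accumulators that the class keeps (the `_avg_squared_grad` accumulator string is visible at the end of the hunk).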