@@ -24,7 +24,9 @@ from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
 
-__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+__all__ = [
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Adadelta'
+]
 
 
 class Optimizer(object):
@@ -575,6 +577,58 @@ class DecayedAdagradOptimizer(Optimizer):
         return decayed_adagrad_op
 
 
+class AdadeltaOptimizer(Optimizer):
+    """Simple Adadelta optimizer with average squared grad state and
+    average squared update state.
+    """
+    _avg_squared_grad_acc_str = "_avg_squared_grad"
+    _avg_squared_update_acc_str = "_avg_squared_update"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+        assert learning_rate is not None
+        assert epsilon is not None
+        assert rho is not None
+        super(AdadeltaOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adadelta"
+        self._epsilon = epsilon
+        self._rho = rho
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._avg_squared_grad_acc_str, p)
+            self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        avg_squared_grad_acc = self._get_accumulator(
+            self._avg_squared_grad_acc_str, param_and_grad[0])
+        avg_squared_update_acc = self._get_accumulator(
+            self._avg_squared_update_acc_str, param_and_grad[0])
+
+        # Create the adadelta optimizer op
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc
+            },
+            attrs={"epsilon": self._epsilon,
+                   "rho": self._rho})
+
+        return adadelta_op
+
+
 # We short the class name, since users will use the optimizer with the package
 # name. The sample code:
 #
@@ -589,3 +643,4 @@ Adagrad = AdagradOptimizer
 Adam = AdamOptimizer
 Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
+Adadelta = AdadeltaOptimizer
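
Note on the update rule: `_append_optimize_op` above only wires the two accumulators into a single `adadelta` operator; the arithmetic itself runs in that op's kernel, not in this Python wrapper. The sketch below is not part of the patch. It is a minimal NumPy rendering of the textbook Adadelta rule (Zeiler, 2012) that the op's input/output names and the `rho`/`epsilon` attributes suggest; the function and variable names are illustrative only.

import numpy as np


def adadelta_step(param, grad, avg_sq_grad, avg_sq_update,
                  rho=0.95, epsilon=1.0e-6):
    """One textbook Adadelta update (illustrative sketch, not the op kernel)."""
    # Running average of squared gradients (the "AvgSquaredGrad" state).
    avg_sq_grad = rho * avg_sq_grad + (1.0 - rho) * grad * grad
    # Scale the gradient by the ratio of the two running RMS values.
    update = -np.sqrt(avg_sq_update + epsilon) / np.sqrt(avg_sq_grad + epsilon) * grad
    # Running average of squared updates (the "AvgSquaredUpdate" state).
    avg_sq_update = rho * avg_sq_update + (1.0 - rho) * update * update
    return param + update, avg_sq_grad, avg_sq_update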
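Usage: as the module's closing comment explains, the class is exported under the short alias `Adadelta`. A minimal sketch, assuming the usual `paddle.v2.fluid` program setup and a scalar loss variable `avg_cost` built elsewhere (both assumptions, not defined by this patch):

import paddle.v2.fluid as fluid

# `avg_cost` is assumed to be a scalar loss variable created earlier in the
# program; it is not provided by this patch.
optimizer = fluid.optimizer.Adadelta(
    learning_rate=1.0,  # example value; __init__ has no default for learning_rate
    epsilon=1.0e-6,     # default from AdadeltaOptimizer.__init__
    rho=0.95)           # default from AdadeltaOptimizer.__init__
optimizer.minimize(avg_cost)  # appends backward ops and the adadelta op to the program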