@@ -129,6 +129,7 @@ class AdamW(Adam):
         self._params_name = set()
         self._apply_decay_param_fun = apply_decay_param_fun
         self._coeff = coeff
+        self._lr_to_coeff = dict()
         super(AdamW, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
@@ -139,96 +140,48 @@ class AdamW(Adam):
             name=name,
             lazy_mode=lazy_mode)
 
-    def _scale_parameters(self, params_and_grads):
+    def _append_decoupled_weight_decay(self, block, param_and_grad):
         """
-        Adds weight decay ops.
-            scaled_parameter = parameter * coeff
+        Add decoupled weight decay op.
+            parameter = parameter - parameter * coeff * lr
 
         Args:
-            params_and_grads: A list of (parameters, gradients) pairs,
+            block: block in which variable is to be created
+            param_and_grad: (parameters, gradients) pairs,
                 the parameters need to decay.
         Raises:
            Exception: The type of coeff and parameter is not consistent.
        """
-
-        scaled_params = []
-        for param, grad in params_and_grads:
-            # If no gradient then we don't need to do anything
-            if grad is None:
-                continue
-            if self._apply_decay_param_fun is not None \
-                    and not self._apply_decay_param_fun(param.name):
-                continue
-
-            if isinstance(self._coeff, float):
-                assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
-                    "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype)
-            else:
-                assert self._coeff.dtype == param.dtype, \
-                    "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
-            if isinstance(self._learning_rate, float):
-                learning_rate = self._learning_rate
-            else:
-                learning_rate = self._learning_rate()
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                scaled_params.append(
-                    (param, grad, param * self._coeff * learning_rate))
-                if param.name not in self._params_name:
-                    self._params_name.add(param.name)
-                param = param * self._coeff
-        return scaled_params
-
-    @imperative_base.no_grad
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameters=None,
-                 no_grad_set=None):
-        parameters = parameters if parameters \
-            else self._parameter_list
-
-        params_grads = self.backward(
-            loss=loss,
-            startup_program=startup_program,
-            parameters=parameters,
-            no_grad_set=no_grad_set)
-        scaled_params = self._scale_parameters(params_grads)
-        for p_grad_sgrad in scaled_params:
-            param, grad, scaled_param = p_grad_sgrad
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param)
-                paddle.fluid.layers.assign(input=updated_param, output=param)
-
-        optimize_ops = self._apply_optimize(
-            loss=loss,
-            params_grads=params_grads,
-            startup_program=startup_program)
-        return optimize_ops, params_grads
-
-    @framework.dygraph_only
-    @imperative_base.no_grad
-    def step(self):
-        params_grads = []
-        for param in self._parameter_list:
-            if not param.trainable:
-                continue
-            if param._grad_ivar() is not None:
-                grad_var = param._grad_ivar()
-                params_grads.append((param, grad_var))
-
-        scaled_params = self._scale_parameters(params_grads)
-        for p_grad_sgrad in scaled_params:
-            param, grad, scaled_param = p_grad_sgrad
-            with param.block.program._optimized_guard(
-                [param, grad]), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param)
-                paddle.fluid.layers.assign(input=updated_param, output=param)
-        self._apply_optimize(
-            loss=None, startup_program=None, params_grads=params_grads)
+        param, grad = param_and_grad
+
+        if self._apply_decay_param_fun is not None \
+                and not self._apply_decay_param_fun(param.name):
+            return
+
+        if isinstance(self._learning_rate, float):
+            learning_rate = self._learning_rate
+        else:
+            # NOTE. We add this function to the _append_optimize_op(),
+            # for we must make sure _create_param_lr() be called after
+            # optimizer._create_global_learning_rate().
+            learning_rate = self._create_param_lr(param_and_grad)
+
+        with block.program._optimized_guard(
+                [param, grad]), framework.name_scope('weight decay'):
+            self._params_name.add(param.name)
+
+            # If it has been calculated, the result will be reused
+            decay_coeff = self._lr_to_coeff.get(learning_rate, None)
+            if decay_coeff is None:
+                decay_coeff = 1.0 - learning_rate * self._coeff
+                self._lr_to_coeff[learning_rate] = decay_coeff
+
+            scaled_param = param * decay_coeff
+            paddle.fluid.layers.assign(input=scaled_param, output=param)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        self._append_decoupled_weight_decay(block, param_and_grad)
+        return super(AdamW, self)._append_optimize_op(block, param_and_grad)
 
     def __str__(self):
         return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
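
The _lr_to_coeff dict added in the first hunk caches that factor: it is keyed by the learning rate seen in _append_decoupled_weight_decay() (a plain float, or the per-parameter learning-rate variable returned by _create_param_lr()), so parameters sharing a learning rate reuse one precomputed 1.0 - learning_rate * coeff instead of rebuilding it per parameter. A rough sketch of the caching pattern, with illustrative names and assuming the learning-rate keys are hashable:

_lr_to_coeff = {}

def get_decay_coeff(learning_rate, coeff):
    # reuse the factor if this learning rate has been seen before
    decay_coeff = _lr_to_coeff.get(learning_rate, None)
    if decay_coeff is None:
        decay_coeff = 1.0 - learning_rate * coeff
        _lr_to_coeff[learning_rate] = decay_coeff
    return decay_coeff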