@@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS',
-    'cosine_decay', 'linear_lr_warmup'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay',
+    'linear_lr_warmup'
 ]
 
 
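The hunk below deletes append_LARS outright. For reference, the layer-wise rate it computed (per the docstring and body being removed) reduces to global_lr * ||param|| / (||grad|| + weight_decay * ||param||). A minimal standalone sketch of that rule follows, assuming NumPy in place of the Paddle ops used in the removed code; the function name lars_local_lr, its arguments, and the example shapes are illustrative only and not part of the Paddle API.

# Minimal sketch of the LARS layer-wise scaling rule (assumption: NumPy
# stands in for the Paddle ops in the removed code below).
import numpy as np

def lars_local_lr(global_lr, param, grad, weight_decay):
    """Per-layer rate: global_lr * ||w|| / (||g|| + weight_decay * ||w||)."""
    param_norm = np.sqrt(np.sum(np.square(param)))
    grad_norm = np.sqrt(np.sum(np.square(grad)))
    return global_lr * param_norm / (grad_norm + weight_decay * param_norm)

# Usage: a layer whose gradients are small relative to its weights
# receives a proportionally larger local step.
w = np.random.randn(256, 128).astype("float32")
g = 1e-3 * np.random.randn(256, 128).astype("float32")
print(lars_local_lr(0.1, w, g, weight_decay=1e-4))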
@@ -381,50 +381,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
     return decayed_lr
 
 
-def append_LARS(params_grads, learning_rate, weight_decay):
-    """
-    Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
-    each layer.
-
-    Args:
-        learning_rate: A learning rate Variable. This
-          is the global learning rate for LARS.
-        weight_decay: A Python `float` number.
-
-    Returns:
-        The decayed learning rate
-    Examples:
-        .. code-block:: python
-
-            learning_rate *= local_gw_ratio * sqrt(sumsq(param))
-                / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
-    """
-
-    assert not imperative_base.enabled(
-    ), "append_LARS is NOT supported in dygraph mode now"
-
-    def _balanced_weight(param_norm, grad_norm):
-        if weight_decay == 1.0:
-            return grad_norm + param_norm
-        else:
-            return grad_norm + weight_decay * param_norm
-
-    for param, grad in params_grads:
-        with param.block.program.optimized_guard(
-                [param, grad]), name_scope("optimizer"):
-            param_lr = param.optimize_attr['learning_rate']
-            param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
-            grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
-            if type(param_lr) == float and param_lr == 1.0:
-                decayed_lr = learning_rate * param_norm \
-                    / _balanced_weight(param_norm, grad_norm)
-            else:
-                decayed_lr = learning_rate * param_lr * param_norm \
-                    / _balanced_weight(param_norm, grad_norm)
-            # set back param local learning rate
-            param.optimize_attr['learning_rate'] = decayed_lr
-
-
 def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
     """
     Applies linear learning rate warmup before the normal learning rate