@@ -14,6 +14,7 @@
 
 from .optimizer import Optimizer
 from .adam import Adam
+from ..fluid import core
 from ..fluid import framework
 from ..fluid.dygraph import base as imperative_base
 import paddle
@@ -182,8 +183,16 @@ class AdamW(Adam):
                 decay_coeff = 1.0 - learning_rate * self._coeff
                 self._lr_to_coeff[learning_rate] = decay_coeff
 
-            scaled_param = param * decay_coeff
-            paddle.fluid.layers.assign(input=scaled_param, output=param)
+            find_master = (self._multi_precision and
+                           param.dtype == core.VarDesc.VarType.FP16)
+            if find_master:
+                master_weight = self._master_weights[param.name]
+                scaled_param = master_weight * decay_coeff
+                paddle.fluid.layers.assign(
+                    input=scaled_param, output=master_weight)
+            else:
+                scaled_param = param * decay_coeff
+                paddle.fluid.layers.assign(input=scaled_param, output=param)
 
     def _append_optimize_op(self, block, param_and_grad):
         self._append_decoupled_weight_decay(block, param_and_grad)
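
For context, the new branch applies the decoupled weight-decay scaling to the FP32 master copy of an FP16 parameter rather than to the FP16 parameter itself, so the small decay step is not lost to half-precision rounding. Below is a minimal NumPy sketch of that idea; the function and variable names (`decoupled_weight_decay`, `master_weight`, `multi_precision`) mirror the diff but the code is illustrative only, not the Paddle implementation.

```python
# Illustrative sketch (NumPy), not Paddle API: decoupled weight decay scales
# the parameter by (1 - lr * coeff); with multi_precision enabled, the
# scaling is applied to the FP32 master weight and mirrored back to FP16.
import numpy as np

def decoupled_weight_decay(param, lr, coeff, multi_precision=False,
                           master_weight=None):
    """Return (decayed param, decayed master weight or None)."""
    decay_coeff = 1.0 - lr * coeff  # same formula as in the diff
    if multi_precision and param.dtype == np.float16:
        # Decay the FP32 master copy, then cast it back to FP16 for the param.
        master_weight = master_weight * decay_coeff
        return master_weight.astype(np.float16), master_weight
    return param * decay_coeff, None

# Usage: an FP16 parameter paired with its FP32 master copy.
master = np.array([0.12345678], dtype=np.float32)
param = master.astype(np.float16)
param, master = decoupled_weight_decay(param, lr=1e-3, coeff=0.01,
                                        multi_precision=True,
                                        master_weight=master)
```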