Use correct master weights in AdamW. (#30895)

* Use correct master weights in AdamW.

* Just modify the master weight.

* Update for CI Coverage.
revert-31068-fix_conv3d_windows
Zhen Wang committed 4 years ago (via GitHub)
parent 79fa8fb0df
commit 71acde9afc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -97,7 +97,7 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
test_program = train_program.clone(for_test=True)
if use_adam:
optimizer = paddle.optimizer.Adam(
optimizer = paddle.optimizer.AdamW(
learning_rate=0.001,
epsilon=1e-8,
weight_decay=0.0,

@ -14,6 +14,7 @@
from .optimizer import Optimizer
from .adam import Adam
from ..fluid import core
from ..fluid import framework
from ..fluid.dygraph import base as imperative_base
import paddle
@ -182,8 +183,16 @@ class AdamW(Adam):
decay_coeff = 1.0 - learning_rate * self._coeff
self._lr_to_coeff[learning_rate] = decay_coeff
scaled_param = param * decay_coeff
paddle.fluid.layers.assign(input=scaled_param, output=param)
find_master = (self._multi_precision and
param.dtype == core.VarDesc.VarType.FP16)
if find_master:
master_weight = self._master_weights[param.name]
scaled_param = master_weight * decay_coeff
paddle.fluid.layers.assign(
input=scaled_param, output=master_weight)
else:
scaled_param = param * decay_coeff
paddle.fluid.layers.assign(input=scaled_param, output=param)
def _append_optimize_op(self, block, param_and_grad):
self._append_decoupled_weight_decay(block, param_and_grad)

Loading…
Cancel
Save