@@ -2333,10 +2333,10 @@ class ExponentialMovingAverage(object):
 
         \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t
 
-    The average results will be saved in temporary variables which are created
-    and maintained by the object, and can be applied to parameters of current
-    model by calling **apply()** method. And the **restore()** method is used to
-    restore the parameters.
+    The average results calculated by the **update()** method will be saved in
+    temporary variables, which are created and maintained by the object, and can
+    be applied to the parameters of the current model by calling the **apply()**
+    method. The **restore()** method is used to restore the parameters.
 
     **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be
     zero biased, which can be corrected by dividing by a factor
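A minimal NumPy sketch of the bias-corrected EMA that the docstring above describes (names like `thetas` are illustrative and not part of the patch):

    import numpy as np

    decay = 0.999
    ema = 0.0                      # EMA_0 = 0, hence the zero bias
    thetas = np.random.randn(100)  # one observed parameter value per step

    for t, theta in enumerate(thetas, start=1):
        ema = decay * ema + (1 - decay) * theta  # EMA_t = decay * EMA_{t-1} + (1 - decay) * theta_t
        ema_corrected = ema / (1 - decay ** t)   # bias correction: divide by (1 - decay^t)
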
@@ -2382,6 +2382,7 @@ class ExponentialMovingAverage(object):
 
             global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter()
             ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps)
+            ema.update()
 
             # pseudo code
             for pass_id in range(args.pass_num):
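The docstring example is cut off at the hunk boundary; a hedged sketch of how the loop might continue, using only the update()/apply()/restore() API visible in this patch (`exe`, `train_reader`, `test_reader`, `args`, and `inference_program` are assumed to exist):

    # pseudo code (continued); assumes exe, readers and programs are defined
    for pass_id in range(args.pass_num):
        for data in train_reader():
            exe.run(fluid.default_main_program(), feed=data)  # EMA ops added by ema.update() run here

        # evaluate with averaged parameters; apply() is a context manager
        # that restores the original parameters on exit (need_restore=True)
        with ema.apply(exe):
            for data in test_reader():
                exe.run(inference_program, feed=data)
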
@@ -2407,7 +2408,7 @@ class ExponentialMovingAverage(object):
         self._name = name if name is not None else ''
         self._decay_var = self._get_ema_decay()
 
-        self.params_tmps = []
+        self._params_tmps = []
         for param in default_main_program().global_block().all_parameters():
             if param.do_model_average != False:
                 tmp = param.block.create_var(
@@ -2416,22 +2417,22 @@ class ExponentialMovingAverage(object):
                     dtype=param.dtype,
                     persistable=False,
                     stop_gradient=True)
-                self.params_tmps.append((param, tmp))
+                self._params_tmps.append((param, tmp))
 
-        ema_vars = {}
-        for param, tmp in self.params_tmps:
+        self._ema_vars = {}
+        for param, tmp in self._params_tmps:
             with param.block.program._optimized_guard(
                 [param, tmp]), name_scope('moving_average'):
-                ema_vars[param.name] = self._append_ema_ops(param)
+                self._ema_vars[param.name] = self._create_ema_vars(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
         with program_guard(main_program=self.apply_program):
             decay_pow = self._get_decay_pow(block)
-            for param, tmp in self.params_tmps:
+            for param, tmp in self._params_tmps:
                 param = block._clone_variable(param)
                 tmp = block._clone_variable(tmp)
-                ema = block._clone_variable(ema_vars[param.name])
+                ema = block._clone_variable(self._ema_vars[param.name])
                 layers.assign(input=param, output=tmp)
                 # bias correction
                 ema = ema / (1.0 - decay_pow)
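As a sanity check on the `# bias correction` line above: writing d for the decay and unrolling the recurrence with EMA_0 = 0,

    \text{EMA}_t = (1-d)\sum_{i=1}^{t} d^{\,t-i}\,\theta_i
                 \overset{\theta_i \equiv \theta}{=} \theta\,(1-d)\sum_{k=0}^{t-1} d^{k}
                 = \theta\,(1 - d^{t}),

so for a constant parameter the division by (1 - d^t) recovers \theta exactly. This is what `ema / (1.0 - decay_pow)` computes, with decay_pow = d^t accumulated by `_get_decay_pow()` (which uses `global_steps + 1`, presumably because the step counter is zero-based).
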
@@ -2440,7 +2441,7 @@ class ExponentialMovingAverage(object):
         self.restore_program = Program()
         block = self.restore_program.global_block()
         with program_guard(main_program=self.restore_program):
-            for param, tmp in self.params_tmps:
+            for param, tmp in self._params_tmps:
                 tmp = block._clone_variable(tmp)
                 param = block._clone_variable(param)
                 layers.assign(input=tmp, output=param)
@@ -2472,7 +2473,7 @@ class ExponentialMovingAverage(object):
         decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
         return decay_pow_acc
 
-    def _append_ema_ops(self, param):
+    def _create_ema_vars(self, param):
         param_ema = layers.create_global_var(
             name=unique_name.generate(self._name + param.name + '_ema'),
             shape=param.shape,
@@ -2480,10 +2481,21 @@ class ExponentialMovingAverage(object):
             dtype=param.dtype,
             persistable=True)
 
-        ema_t = param_ema * self._decay_var + param * (1 - self._decay_var)
-        layers.assign(input=ema_t, output=param_ema)
         return param_ema
 
+    def update(self):
+        """
+        Update the Exponential Moving Average. This method should only be
+        called in a train program.
+        """
+        for param, tmp in self._params_tmps:
+            with param.block.program._optimized_guard(
+                [param, tmp]), name_scope('moving_average'):
+                param_ema = self._ema_vars[param.name]
+                ema_t = param_ema * self._decay_var + param * (1 -
+                                                               self._decay_var)
+                layers.assign(input=ema_t, output=param_ema)
+
     @signature_safe_contextmanager
     def apply(self, executor, need_restore=True):
         """
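For intuition about the three programs this class builds, a framework-free Python sketch of the same mechanics (all names here are hypothetical; the real class assembles Paddle ops instead of mutating dicts):

    class EMASketch:
        # Hypothetical, framework-free model of update()/apply()/restore().
        def __init__(self, params, decay=0.999):
            self.decay = decay
            self.t = 0
            self.params = params                 # name -> value (the "model")
            self.ema = {k: 0.0 for k in params}  # EMA_0 = 0
            self.backup = {}                     # plays the role of the tmp vars

        def update(self):                        # one call per training step
            self.t += 1
            for k, v in self.params.items():
                self.ema[k] = self.decay * self.ema[k] + (1 - self.decay) * v

        def apply(self):                         # analogous to apply_program
            self.backup = dict(self.params)      # tmp <- param
            correction = 1.0 - self.decay ** self.t
            for k in self.ema:
                self.params[k] = self.ema[k] / correction  # bias-corrected EMA

        def restore(self):                       # analogous to restore_program
            self.params.update(self.backup)      # param <- tmp

Calling apply() before evaluation and restore() afterwards mirrors the `with ema.apply(exe):` pattern in the docstring example.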