@@ -123,7 +123,7 @@ class Optimizer(object):
         """
         pass

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Finish any custom updates needed
            before completing an optimization step
@@ -132,7 +132,7 @@ class Optimizer(object):
+            parameters: list of parameter variables for the optimizer

         Returns:
-            list of finish ops or None
+            None
         """
         pass
@@ -236,7 +236,8 @@ class Optimizer(object):

         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(loss.block)
+        self._finish_update(loss.block,
+                            [p[0] for p in parameters_and_grads])

         end = len(global_block.ops)
         return global_block.slice_ops(start, end)
@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"

     def __init__(self,
                  learning_rate=0.001,
@@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer):
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)

-        main_block = block.program.global_block()
-        # Create beta1 and beta2 power tensors
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
-        self._beta2_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-
-        self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=Constant(self._beta2))
-
         # Create accumulator tensors for first and second moments
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
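+            # beta1^t and beta2^t are kept as per-parameter accumulators
+            # instead of shared global variables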
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta2,
+                shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer):
                                         param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
         # create the adam optimize op
         adam_op = block.append_op(
             type=self.type,
@@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
-                "Beta1Pow": self._beta1_pow_acc,
-                "Beta2Pow": self._beta2_pow_acc
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -566,24 +564,27 @@ class AdamOptimizer(Optimizer):

         return adam_op

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        scale_beta2 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta2_pow_acc},
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"scale": self._beta2})
-
-        return [scale_beta1, scale_beta2]
+        for param in parameters:
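+            # scale each parameter's beta1^t and beta2^t accumulators in place;
+            # nothing is returned to the caller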
+            beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                  param)
+            beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                  param)
+            main_block.append_op(
+                type="scale",
+                inputs={"X": beta1_pow_acc},
+                outputs={"Out": beta1_pow_acc},
+                attrs={"scale": self._beta1})
+
+            main_block.append_op(
+                type="scale",
+                inputs={"X": beta2_pow_acc},
+                outputs={"Out": beta2_pow_acc},
+                attrs={"scale": self._beta2})


 class AdamaxOptimizer(Optimizer):
@@ -626,6 +627,7 @@ class AdamaxOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"

     def __init__(self,
                  learning_rate=0.001,
@@ -645,21 +647,16 @@ class AdamaxOptimizer(Optimizer):
         self._epsilon = epsilon

     def _create_accumulators(self, block, parameters):
-        # Create beta1 power accumulator tensor
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
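+            # beta1^t is likewise kept as a per-parameter accumulator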
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -667,6 +664,8 @@ class AdamaxOptimizer(Optimizer):
         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                          param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
         # create the adamax optimize op
         adamax_op = block.append_op(
             type=self.type,
@@ -676,7 +675,7 @@ class AdamaxOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
-                "Beta1Pow": self._beta1_pow_acc
+                "Beta1Pow": beta1_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -691,18 +690,19 @@ class AdamaxOptimizer(Optimizer):

         return adamax_op

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        return [scale_beta1]
+        for param in parameters:
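+            # scale each parameter's beta1^t accumulator in place after the step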
+            beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                  param)
+            main_block.append_op(
+                type="scale",
+                inputs={"X": beta1_pow_acc},
+                outputs={"Out": beta1_pow_acc},
+                attrs={"scale": self._beta1})


 class DecayedAdagradOptimizer(Optimizer):