@@ -24,9 +24,9 @@ _ftrl_opt = C.MultitypeFuncGraph("ftrl_opt")
 _ftrl_push_pull_opt = C.MultitypeFuncGraph("ftrl_opt")
 
 
-@_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "IndexedSlices", "Tensor",
+@_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "Tensor", "IndexedSlices", "Tensor",
                     "Tensor", "Bool")
-def _tensor_run_opt_with_sparse(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment,
+def _tensor_run_opt_with_sparse(opt, spars_opt, l1, l2, lr_power, learning_rate, linear, gradient, weight, moment,
                                 ps_parameter):
     """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
     success = True
@@ -43,9 +43,9 @@ def _tensor_run_opt_with_sparse(opt, spars_opt, learning_rate, l1, l2, lr_power,
     return success
 
 
-@_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor",
+@_ftrl_opt.register("Function", "Function", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor",
                     "Tensor", "Bool")
-def _tensor_run_opt(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment, ps_parameter):
+def _tensor_run_opt(opt, spars_opt, l1, l2, lr_power, learning_rate, linear, gradient, weight, moment, ps_parameter):
     """Apply ftrl optimizer to the weight parameter."""
     success = True
     if ps_parameter:
@@ -83,7 +83,7 @@ def _tensor_run_push_pull_opt_with_one_number(push, pull, learning_rate, l1, l2,
     return success
 
 
-def _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay=0.0, prim_name=None):
+def _check_param(initial_accum, lr_power, l1, l2, use_locking, prim_name=None):
     """Check param."""
     validator.check_value_type("initial_accum", initial_accum, [float], prim_name)
     validator.check_number("initial_accum", initial_accum, 0.0, Rel.GE, prim_name)
@@ -99,9 +99,6 @@ def _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay=0.0,
     validator.check_value_type("use_locking", use_locking, [bool], prim_name)
 
-    validator.check_value_type("weight_decay", weight_decay, [float], prim_name)
-    validator.check_number("weight_decay", weight_decay, 0.0, Rel.GE, prim_name)
-
 
 
 class FTRL(Optimizer):
     """
@@ -113,15 +110,34 @@ class FTRL(Optimizer):
     <https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf>`_ for engineering document.
 
     Note:
+        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
+        weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
+        on all of the parameters.
+
+        To improve parameter groups performance, the customized order of parameters can be supported.
+
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network.
-        The sparse feature is under continuous development. The sparse
-        behavior is currently performed on the CPU.
+        The sparse feature is under continuous development. The sparse behavior is currently performed on the CPU.
 
     Args:
-        params (list[Parameter]): A list of parameter, which will be updated. The element in `params`
-            should be Parameter.
+        params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
+            the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
+            "lr", "weight_decay" and "order_params" are the keys can be parsed.
+
+            - params: Required. The value should be a list of `Parameter`.
+
+            - lr: Using different learning rate by separating parameters is currently not supported.
+
+            - weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
+              will be used. If not, the `weight_decay` in the API will be used.
+
+            - order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
+              the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
+              in the value of 'order_params' should be in one of group parameters.
+
         initial_accum (float): The starting value for accumulators, must be zero or positive values. Default: 0.1.
-        learning_rate (float): The learning rate value, should be positive. Default: 0.001.
+        learning_rate (float): The learning rate value, should be zero or positive, dynamic learning rate is currently
+            not supported. Default: 0.001.
         lr_power (float): Learning rate power controls how the learning rate decreases during training, must be less
             than or equal to zero. Use fixed learning rate if lr_power is zero. Default: -0.5.
         l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0.
@@ -139,23 +155,36 @@ class FTRL(Optimizer):
     Examples:
         >>> net = Net()
+        >>> #1) All parameters use the same learning rate and weight decay
+        >>> optim = nn.FTRL(params=net.trainable_params())
+        >>>
+        >>> #2) Use parameter groups and set different values
+        >>> conv_params = list(filter(lambda x: 'conv' in x.name, net.trainable_params()))
+        >>> no_conv_params = list(filter(lambda x: 'conv' not in x.name, net.trainable_params()))
+        >>> group_params = [{'params': conv_params, 'weight_decay': 0.01},
+        >>>                 {'params': no_conv_params},
+        >>>                 {'order_params': net.trainable_params()}]
+        >>> optim = nn.FTRL(group_params, learning_rate=0.1, weight_decay=0.0)
+        >>> # The conv_params's parameters will use weight decay of 0.01.
+        >>> # The no_conv_params's parameters will use default weight decay of 0.0.
+        >>> # The final parameters order in which the optimizer will be followed is the value of 'order_params'.
+        >>>
         >>> loss = nn.SoftmaxCrossEntropyWithLogits()
-        >>> opt = nn.FTRL(net.trainable_params())
-        >>> model = Model(net, loss_fn=loss, optimizer=opt, metrics=None)
+        >>> model = Model(net, loss_fn=loss, optimizer=optim)
     """
 
     def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0,
                  use_locking=False, loss_scale=1.0, weight_decay=0.0):
-        super(FTRL, self).__init__(learning_rate, params, loss_scale=loss_scale)
-        if self.is_group:
-            raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
-        _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay, self.cls_name)
+        super(FTRL, self).__init__(learning_rate, params, weight_decay, loss_scale=loss_scale)
+        if self.dynamic_lr or self.is_group_lr:
+            raise ValueError('Dynamic learning rate or group learning rate is currently not supported.')
+        _check_param(initial_accum, lr_power, l1, l2, use_locking, self.cls_name)
         self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
         self.linear = self.parameters.clone(prefix="linear", init='zeros')
        self.l1 = l1
         self.l2 = l2
         self.lr_power = lr_power
-        self.weight_decay = weight_decay
-        self.decay_tf = tuple((lambda: True)() for x in self.parameters)
+        if not self.is_group:
+            self.decay_flags = tuple((lambda: True)() for x in self.parameters)
         self.hyper_map = C.HyperMap()
         self.opt = P.ApplyFtrl(use_locking=use_locking)
         self.sparse_opt = P.FusedSparseFtrl(learning_rate, l1, l2, lr_power, use_locking=use_locking)
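A side note on the rewritten `__init__` above: the new `decay_flags` line uses an immediately-invoked lambda, `tuple((lambda: True)() for x in self.parameters)`, which is simply a constant all-True mask with one entry per parameter; the base `Optimizer` presumably consults it to decide which parameters receive weight decay when parameters are not grouped. A minimal plain-Python sketch (the `parameters` list here is a stand-in, not MindSpore's `ParameterTuple`):

```python
# Sketch of the decay-flag construction in the new FTRL.__init__.
# `parameters` stands in for self.parameters (a ParameterTuple in MindSpore).
parameters = ["conv.weight", "conv.bias", "fc.weight"]

# The immediately-called lambda yields True once per parameter ...
decay_flags = tuple((lambda: True)() for _ in parameters)

# ... which is equivalent to a constant all-True mask:
assert decay_flags == (True,) * len(parameters)
print(decay_flags)  # (True, True, True)
```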
@@ -164,12 +193,11 @@ class FTRL(Optimizer):
         params = self.parameters
         moments = self.moments
         linear = self.linear
-        lr = self.learning_rate
-        if self.weight_decay > 0.0:
-            grads = self.map_(F.partial(_apply_decay, self.weight_decay), self.decay_tf, params, grads)
-
+        grads = self.decay_weight(grads)
         grads = self.scale_grad(grads)
-        success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, lr, self.l1, self.l2, self.lr_power),
+        lr = self.get_lr()
+
+        success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, self.l1, self.l2, self.lr_power, lr),
                             linear, grads, params, moments, self.ps_parameters)
         return success
 
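The reordered `F.partial` call above mirrors the reordered `@_ftrl_opt.register` signatures from the first two hunks: the `Number` hyper-parameters (`l1`, `l2`, `lr_power`) are now bound first and the learning-rate tensor returned by `self.get_lr()` last, so the bound arguments still line up positionally with `_tensor_run_opt`/`_tensor_run_opt_with_sparse` before `map_` appends the per-parameter tensors. A rough stand-alone illustration, using `functools.partial` as a plain-Python stand-in for `F.partial` and the MultitypeFuncGraph dispatch (the operator arguments are dummies):

```python
# Illustration only: functools.partial used as a stand-in for F.partial /
# C.MultitypeFuncGraph; opt and spars_opt are dummy placeholders.
from functools import partial

def _tensor_run_opt(opt, spars_opt, l1, l2, lr_power, learning_rate,
                    linear, gradient, weight, moment, ps_parameter):
    # New parameter order: scalar hyper-parameters first, then the learning
    # rate, then the per-parameter tensors that map_ supplies one by one.
    return (l1, l2, lr_power, learning_rate)

opt = spars_opt = object()                  # dummy operator handles
l1, l2, lr_power, lr = 0.0, 0.0, -0.5, 0.001

# Bind the shared arguments exactly as the new construct() does:
ftrl_step = partial(_tensor_run_opt, opt, spars_opt, l1, l2, lr_power, lr)

# The remaining positional slots are filled per parameter:
print(ftrl_step("linear", "grad", "weight", "moment", False))
# -> (0.0, 0.0, -0.5, 0.001)
```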
@@ -180,7 +208,7 @@ class PSFTRL(Optimizer):
         super(PSFTRL, self).__init__(learning_rate, params, loss_scale=loss_scale)
         if self.is_group:
             raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
-        _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay, self.cls_name)
+        _check_param(initial_accum, lr_power, l1, l2, use_locking, self.cls_name)
         self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
         self.linear = self.parameters.clone(prefix="linear", init='zeros')
         self.l1 = l1
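Taken together, the hunks move weight decay out of FTRL itself: the local `weight_decay` checks, `self.weight_decay`, `self.decay_tf` and the explicit `_apply_decay` mapping are dropped, and `construct()` now calls `self.decay_weight(grads)` on the base `Optimizer`, which also receives `weight_decay` in `super().__init__`. The base-class implementation is not part of this diff; as an assumption, it applies the usual L2-style decay of adding `weight_decay * weight` to the gradient of each flagged parameter, which is what the removed `_apply_decay` call did. A rough NumPy sketch of that step:

```python
# Rough sketch of the weight-decay step that construct() now delegates to the
# base Optimizer via self.decay_weight(grads).  This mirrors the removed
# _apply_decay mapping and is an assumption about the base class, not a copy.
import numpy as np

def decay_weight(weight_decay, decay_flags, params, grads):
    """Add weight_decay * w to the gradient of every flagged parameter."""
    return tuple(g + weight_decay * w if flag else g
                 for flag, w, g in zip(decay_flags, params, grads))

params = (np.ones(3), np.full(3, 2.0))
grads = (np.zeros(3), np.zeros(3))
decay_flags = (True, False)        # e.g. skip decay for a bias parameter

print(decay_weight(0.01, decay_flags, params, grads))
# (array([0.01, 0.01, 0.01]), array([0., 0., 0.]))
```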