@@ -27,6 +27,8 @@ from mindspore._checkparam import Rel
 from .optimizer import Optimizer

 _adam_opt = C.MultitypeFuncGraph("adam_opt")
+_scaler_one = Tensor(1, mstype.int32)
+_scaler_ten = Tensor(10, mstype.float32)


 @_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor",
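For readers new to the MultitypeFuncGraph pattern used here: each register call attaches an overload that is selected by the runtime types of the arguments, which is how a single _adam_opt call site dispatches either to the dense update or to the sparse one below. A minimal, self-contained sketch of the mechanism (the _demo graph and its overloads are illustrative only, not part of this patch):

    from mindspore.ops import composite as C

    _demo = C.MultitypeFuncGraph("demo")

    @_demo.register("Tensor")
    def _demo_dense(x):
        # overload chosen when the argument is a dense Tensor
        return x

    @_demo.register("RowTensor")
    def _demo_sparse(x):
        # overload chosen when the argument is a sparse RowTensor
        return x.values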
@@ -85,31 +87,80 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, d
     return gradient


-@_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
-                    "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
-def _run_opt_with_sparse(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr,
-                         gradient, params, moment1, moment2, ps_parameter):
+@_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                    "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
+                         beta2_power, beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter):
     """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
     success = True
     indices = gradient.indices
     values = gradient.values
     if ps_parameter:
         op_shape = P.Shape()
-        shapes = (op_shape(params), op_shape(moment1), op_shape(moment2),
+        shapes = (op_shape(params), op_shape(m), op_shape(v),
                   op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                   op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
         success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                                eps, values, indices), shapes), params))
-    else:
-        success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
-                                               eps, values, indices))
+        return success
+
+    if not target:
+        success = F.depend(success, sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
+                                               eps, values, indices))
+    else:
+        op_mul = P.Mul()
+        op_square = P.Square()
+        op_sqrt = P.Sqrt()
+        scatter_add = P.ScatterAdd(use_locking)
+
+        assign_m = F.assign(m, op_mul(beta1, m))
+        assign_v = F.assign(v, op_mul(beta2, v))
+
+        grad_indices = gradient.indices
+        grad_value = gradient.values
+
+        next_m = scatter_add(m,
+                             grad_indices,
+                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
+
+        next_v = scatter_add(v,
+                             grad_indices,
+                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))
+
+        if use_nesterov:
+            m_temp = next_m * _scaler_ten
+            assign_m_nesterov = F.assign(m, op_mul(beta1, next_m))
+            div_value = scatter_add(m,
+                                    op_mul(grad_indices, _scaler_one),
+                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
+            param_update = div_value / (op_sqrt(next_v) + eps)
+
+            m_recover = F.assign(m, m_temp / _scaler_ten)
+
+            F.control_depend(m_temp, assign_m_nesterov)
+            F.control_depend(assign_m_nesterov, div_value)
+            F.control_depend(param_update, m_recover)
+        else:
+            param_update = next_m / (op_sqrt(next_v) + eps)
+
+        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
+
+        next_param = params - lr_t * param_update
+
+        F.control_depend(assign_m, next_m)
+        F.control_depend(assign_v, next_v)
+
+        success = F.depend(success, F.assign(params, next_param))
+        success = F.depend(success, F.assign(m, next_m))
+        success = F.depend(success, F.assign(v, next_v))
+
     return success


-@_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
-                    "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
-def _run_opt_with_one_number(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient,
-                             params, moment1, moment2, ps_parameter):
+@_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                    "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_one_number(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
+                             beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2, ps_parameter):
     """Apply adam optimizer to the weight parameter using Tensor."""
     success = True
     if ps_parameter:
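For reference, the host-executed branch added to _run_opt_with_sparse above is the standard Adam recurrence, with the sparse gradient scattered into the rows selected by indices and the bias correction folded into the step size. Restated from the code (g_t is zero outside those rows; beta1_power and beta2_power are \beta_1^t and \beta_2^t):

    m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
    lr_t = lr \cdot \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t}
    \theta_t = \theta_{t-1} - lr_t \cdot \frac{m_t}{\sqrt{v_t} + \epsilon}

When use_nesterov is set, the numerator m_t in the last line is replaced by \beta_1 m_t + (1 - \beta_1) g_t, which is what the extra scatter_add into div_value computes.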
@@ -161,8 +212,8 @@ class Adam(Optimizer):
     To improve parameter groups performance, the customized order of parameters is supported.

     The sparse strategy is applied while the SparseGatherV2 operator is used for forward network.
-    The sparse feature is under continuous development. The sparse
-    behavior is currently performed on the CPU.
+    The sparse feature is under continuous development. If the sparse strategy wants to be executed on the host,
+    set the target to the CPU.

     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
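A minimal usage sketch of the behaviour the revised note describes (the toy network is hypothetical and not from this patch; only the target assignment is introduced by this change):

    import mindspore.nn as nn

    # Toy parameterised network; in practice the sparse path is taken when the
    # forward pass uses SparseGatherV2 and the gradient arrives as a RowTensor.
    net = nn.Embedding(vocab_size=1000, embedding_size=16)
    optim = nn.Adam(net.trainable_params(), learning_rate=1e-3)

    # New in this change: run the sparse update on the host rather than the device.
    # Any value other than 'CPU' or 'Ascend' raises a ValueError.
    optim.target = "CPU"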
@@ -242,14 +293,16 @@ class Adam(Optimizer):
         self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
         self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
         self.eps = Tensor(eps, mstype.float32)
+        self.use_nesterov = use_nesterov
+        self.use_locking = use_locking
         self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
         self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')
+        self._is_device = True
         self.hyper_map = C.HyperMap()
         self.opt = P.Adam(use_locking, use_nesterov)
         self.sparse_opt = P.FusedSparseAdam(use_locking, use_nesterov)
+        self.sparse_opt.add_prim_attr("primitive_target", "CPU")
         self._ps_pull = P.Pull()
         self._ps_push = P.Push("Adam", [0, 1, 2])
         self._ps_push.add_prim_attr("use_nesterov", use_nesterov)
@@ -260,6 +313,7 @@ class Adam(Optimizer):
         moment2 = self.moment2
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
+        gradients = self._grad_sparse_indices_deduplicate(gradients)
         lr = self.get_lr()

         beta1_power = self.beta1_power * self.beta1
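The call added above delegates to a helper presumably defined on the Optimizer base class (it is not part of this hunk); judging by its name, it merges repeated row indices in a RowTensor gradient so that each row is accumulated exactly once before the sparse update. A hypothetical NumPy illustration of that merge (names and data are made up):

    import numpy as np

    indices = np.array([2, 0, 2])                      # row 2 appears twice
    values = np.array([[1., 1.], [3., 3.], [5., 5.]])

    uniq, inverse = np.unique(indices, return_inverse=True)
    merged = np.zeros((uniq.size, values.shape[1]))
    np.add.at(merged, inverse, values)                 # sum values of duplicate rows

    # uniq   -> [0, 2]
    # merged -> [[3., 3.], [6., 6.]]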
@@ -268,14 +322,26 @@ class Adam(Optimizer):
         self.beta2_power = beta2_power
         if self.is_group_lr:
             success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.use_locking, self.use_nesterov, self._is_device,
                                           beta1_power, beta2_power, self.beta1, self.beta2, self.eps),
                                 lr, gradients, params, moment1, moment2, self.ps_parameters)
         else:
             success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.use_locking, self.use_nesterov, self._is_device,
                                           beta1_power, beta2_power, self.beta1, self.beta2, self.eps, lr),
                                 gradients, params, moment1, moment2, self.ps_parameters)
         return success

+    @Optimizer.target.setter
+    def target(self, value):
+        """If the input value is set to "CPU", the parameters will be updated on the host using the Fused
+           optimizer operation."""
+        if value not in ('CPU', 'Ascend'):
+            raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(value))
+
+        self._is_device = (value != 'CPU')
+        self._target = value
+

 class AdamWeightDecay(Optimizer):
     """