@@ -27,6 +27,7 @@ from mindspore._checkparam import Rel
from .optimizer import Optimizer

_adam_opt = C.MultitypeFuncGraph("adam_opt")
_adam_push_pull_opt = C.MultitypeFuncGraph("_adam_push_pull_opt")


@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
@@ -129,6 +130,31 @@ def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, b
eps, gradient))
    return success


@_adam_push_pull_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
"Tensor", "Tuple", "Tensor", "Tensor", "Tensor")
def _run_push_pull_opt_with_sparse(push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params,
moment1, moment2):
"""Apply sparse adam optimizer by push and pull to the weight parameter when the gradient is sparse."""
success = True
op_shape = P.Shape()
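    # The sparse gradient arrives as a tuple: gradient[0] holds the indices and gradient[1]
    # holds the corresponding values, which is why the shapes and the push call below pass
    # gradient[1] (values) ahead of gradient[0] (indices).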
shapes = (op_shape(params), op_shape(moment1), op_shape(moment2),
op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
op_shape(beta2), op_shape(eps), op_shape(gradient[1]), op_shape(gradient[0]))
success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
eps, gradient[1], gradient[0]), shapes), params))
    return success


@_adam_push_pull_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
"Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
def _run_push_pull_opt_with_one_number(push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params,
moment1, moment2):
"""Apply adam optimizer by push and pull to the weight parameter using Tensor."""
success = True
op_shape = P.Shape()
success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient),
(op_shape(params), op_shape(moment1), op_shape(moment2))), params))
return success
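
# For reference, a minimal NumPy sketch of the dense Adam step that the "Adam" push kernel is
# expected to perform on the parameter server, assuming the standard bias-corrected update
# (beta1_power and beta2_power hold beta1**t and beta2**t). The helper below is illustrative
# only and not part of this change.
import numpy as np

def _adam_reference(param, m, v, grad, lr, beta1, beta2, eps, beta1_power, beta2_power):
    """Return updated (param, m, v) after one dense Adam step (illustrative sketch)."""
    m = beta1 * m + (1 - beta1) * grad                        # first-moment estimate
    v = beta2 * v + (1 - beta2) * grad * grad                 # second-moment estimate
    lr_t = lr * np.sqrt(1 - beta2_power) / (1 - beta1_power)  # bias-corrected step size
    param = param - lr_t * m / (np.sqrt(v) + eps)
    return param, m, v
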
class Adam(Optimizer):
r"""
@@ -274,6 +300,51 @@ class Adam(Optimizer):
gradients, params, moment1, moment2)
        return success


class PSAdam(Optimizer):
    '''The same usage as the Adam optimizer, except that the parameters are set to parameter server (PS) mode.'''
def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False,
use_nesterov=False, weight_decay=0.0, loss_scale=1.0):
super(PSAdam, self).__init__(learning_rate, params, weight_decay, loss_scale)
_check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
validator.check_value_type("use_locking", use_locking, [bool], self.cls_name)
validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name)
|
|
|
|
|
|
|
|
|
|
self.beta1 = Tensor(beta1, mstype.float32)
self.beta2 = Tensor(beta2, mstype.float32)
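        # beta1_power and beta2_power accumulate beta1**t and beta2**t: construct() multiplies
        # them by beta1/beta2 on every step, and they are forwarded to the push kernel for the
        # standard Adam bias correction.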
self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
        self.eps = Tensor(eps, mstype.float32)

        self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')
self.hyper_map = C.HyperMap()
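        # Push sends the optimizer inputs and gradients to the parameter server, where the "Adam"
        # kernel applies the update; Pull then fetches the refreshed weights back into the
        # parameters. Both primitives are pinned to the CPU via "primitive_target" so that they
        # run on the host side.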
self.push = P.Push("Adam", [0, 1, 2])
self.push.add_prim_attr("primitive_target", "CPU")
self.pull = P.Pull()
        self.pull.add_prim_attr("primitive_target", "CPU")

    def construct(self, gradients):
params = self.parameters
moment1 = self.moment1
moment2 = self.moment2
gradients = self.decay_weight(gradients)
gradients = self.scale_grad(gradients)
        lr = self.get_lr()

        beta1_power = self.beta1_power * self.beta1
self.beta1_power = beta1_power
beta2_power = self.beta2_power * self.beta2
self.beta2_power = beta2_power
if self.is_group_lr:
success = self.map_(F.partial(_adam_push_pull_opt, self.push, self.pull, beta1_power, beta2_power,
self.beta1, self.beta2, self.eps),
lr, gradients, params, moment1, moment2)
else:
success = self.map_(F.partial(_adam_push_pull_opt, self.push, self.pull, beta1_power, beta2_power,
self.beta1, self.beta2, self.eps, lr),
gradients, params, moment1, moment2)
return success
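
# A hypothetical usage sketch: PSAdam is constructed exactly like Adam, with the update itself
# offloaded to the parameter server through the Push/Pull pair above. `Net` and `loss_fn` stand
# in for a user-defined nn.Cell and loss function.
#
#     net = Net()
#     opt = PSAdam(net.trainable_params(), learning_rate=1e-3, weight_decay=0.0)
#     train_net = nn.TrainOneStepCell(nn.WithLossCell(net, loss_fn), opt)
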
class AdamWeightDecay(Optimizer):
"""