diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py
index da57aaa88e..2b2d2439c9 100755
--- a/mindspore/nn/optim/adam.py
+++ b/mindspore/nn/optim/adam.py
@@ -27,6 +27,8 @@ from mindspore._checkparam import Rel
 from .optimizer import Optimizer
 
 _adam_opt = C.MultitypeFuncGraph("adam_opt")
+_scaler_one = Tensor(1, mstype.int32)
+_scaler_ten = Tensor(10, mstype.float32)
 
 
 @_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor",
@@ -85,31 +87,80 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, d
     return gradient
 
 
-@_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
-                    "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
-def _run_opt_with_sparse(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr,
-                         gradient, params, moment1, moment2, ps_parameter):
+@_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                    "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
+                         beta2_power, beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter):
     """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
     success = True
     indices = gradient.indices
     values = gradient.values
     if ps_parameter:
         op_shape = P.Shape()
-        shapes = (op_shape(params), op_shape(moment1), op_shape(moment2),
+        shapes = (op_shape(params), op_shape(m), op_shape(v),
                   op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                   op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
         success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices),
                                               shapes), params))
-    else:
-        success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
+        return success
+
+    if not target:
+        success = F.depend(success, sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                                eps, values, indices))
+    else:
+        op_mul = P.Mul()
+        op_square = P.Square()
+        op_sqrt = P.Sqrt()
+        scatter_add = P.ScatterAdd(use_locking)
+
+        assign_m = F.assign(m, op_mul(beta1, m))
+        assign_v = F.assign(v, op_mul(beta2, v))
+
+        grad_indices = gradient.indices
+        grad_value = gradient.values
+
+        next_m = scatter_add(m,
+                             grad_indices,
+                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
+
+        next_v = scatter_add(v,
+                             grad_indices,
+                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))
+
+        if use_nesterov:
+            m_temp = next_m * _scaler_ten
+            assign_m_nesterov = F.assign(m, op_mul(beta1, next_m))
+            div_value = scatter_add(m,
+                                    op_mul(grad_indices, _scaler_one),
+                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
+            param_update = div_value / (op_sqrt(next_v) + eps)
+
+            m_recover = F.assign(m, m_temp / _scaler_ten)
+
+            F.control_depend(m_temp, assign_m_nesterov)
+            F.control_depend(assign_m_nesterov, div_value)
+            F.control_depend(param_update, m_recover)
+        else:
+            param_update = next_m / (op_sqrt(next_v) + eps)
+
+        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
+
+        next_param = params - lr_t * param_update
+
+        F.control_depend(assign_m, next_m)
+        F.control_depend(assign_v, next_v)
+
+        success = F.depend(success, F.assign(params, next_param))
+        success = F.depend(success, F.assign(m, next_m))
+        success = F.depend(success, F.assign(v, next_v))
+
     return success
 
 
-@_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
-                    "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
-def _run_opt_with_one_number(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient,
-                             params, moment1, moment2, ps_parameter):
+@_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                    "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_one_number(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
+                             beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2, ps_parameter):
     """Apply adam optimizer to the weight parameter using Tensor."""
     success = True
     if ps_parameter:
@@ -161,8 +212,8 @@ class Adam(Optimizer):
         To improve parameter groups performance, the customized order of parameters is supported.
 
         The sparse strategy is applied while the SparseGatherV2 operator is used for forward network.
-        The sparse feature is under continuous development. The sparse
-        behavior is currently performed on the CPU.
+        The sparse feature is under continuous development. If the sparse strategy is to be executed on the host,
+        set the target to the CPU.
 
     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
@@ -242,14 +293,16 @@ class Adam(Optimizer):
         self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
         self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
         self.eps = Tensor(eps, mstype.float32)
-
+        self.use_nesterov = use_nesterov
+        self.use_locking = use_locking
         self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
         self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')
 
+        self._is_device = True
         self.hyper_map = C.HyperMap()
         self.opt = P.Adam(use_locking, use_nesterov)
         self.sparse_opt = P.FusedSparseAdam(use_locking, use_nesterov)
-
+        self.sparse_opt.add_prim_attr("primitive", "CPU")
         self._ps_pull = P.Pull()
         self._ps_push = P.Push("Adam", [0, 1, 2])
         self._ps_push.add_prim_attr("use_nesterov", use_nesterov)
@@ -260,6 +313,7 @@ class Adam(Optimizer):
         moment2 = self.moment2
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
+        gradients = self._grad_sparse_indices_deduplicate(gradients)
         lr = self.get_lr()
 
         beta1_power = self.beta1_power * self.beta1
@@ -268,14 +322,26 @@ class Adam(Optimizer):
         self.beta2_power = beta2_power
         if self.is_group_lr:
             success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.use_locking, self.use_nesterov, self._is_device,
                                           beta1_power, beta2_power, self.beta1, self.beta2, self.eps),
                                 lr, gradients, params, moment1, moment2, self.ps_parameters)
         else:
             success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.use_locking, self.use_nesterov, self._is_device,
                                           beta1_power, beta2_power, self.beta1, self.beta2, self.eps, lr),
                                 gradients, params, moment1, moment2, self.ps_parameters)
         return success
 
+    @Optimizer.target.setter
+    def target(self, value):
+        """If the input value is set to "CPU", the parameters will be updated on the host using the Fused
+           optimizer operation."""
+        if value not in ('CPU', 'Ascend'):
+            raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(value))
+
+        self._is_device = (value != 'CPU')
+        self._target = value
+
 
 class AdamWeightDecay(Optimizer):
     """
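Note on the device branch added to _run_opt_with_sparse above: when `target` is true (parameters stay on the device), the moments are first decayed in place and the sparse gradient rows are folded in with ScatterAdd before a dense parameter update. The following is a rough NumPy sketch of the same math, for illustration only — the helper name `sparse_adam_step` is made up and is not part of MindSpore, and the indices are assumed to already be unique (which is what the deduplication pass added in optimizer.py further below provides).

# Illustration only: dense NumPy equivalent of the scatter-based sparse Adam step above.
import numpy as np

def sparse_adam_step(param, m, v, indices, values, lr, beta1, beta2, eps,
                     beta1_power, beta2_power, use_nesterov=False):
    # Decay the moments for every row, then add the sparse gradient rows in,
    # mirroring F.assign + ScatterAdd in the device branch.
    m *= beta1
    v *= beta2
    np.add.at(m, indices, (1.0 - beta1) * values)
    np.add.at(v, indices, (1.0 - beta2) * values * values)
    if use_nesterov:
        # Nesterov variant: bias the update with the fresh gradient rows.
        update = beta1 * m
        np.add.at(update, indices, (1.0 - beta1) * values)
    else:
        update = m
    # Bias-corrected learning rate, then a dense update of every row.
    lr_t = lr * np.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)
    param -= lr_t * update / (np.sqrt(v) + eps)
    return param, m, v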
diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py
index 8667a0fa2e..d193581d24 100644
--- a/mindspore/nn/optim/ftrl.py
+++ b/mindspore/nn/optim/ftrl.py
@@ -89,7 +89,8 @@ class FTRL(Optimizer):
         To improve parameter groups performance, the customized order of parameters can be supported.
 
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network.
-        The sparse feature is under continuous development. The sparse behavior is currently performed on the CPU.
+        The sparse feature is under continuous development. If the sparse strategy is to be executed on the host,
+        set the target to the CPU.
 
     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
@@ -154,12 +155,14 @@ class FTRL(Optimizer):
         self.linear = self.parameters.clone(prefix="linear", init='zeros')
         self.l1 = l1
         self.l2 = l2
+        self.lr = learning_rate
         self.lr_power = lr_power
         if not self.is_group:
             self.decay_flags = tuple((lambda: True)() for x in self.parameters)
         self.hyper_map = C.HyperMap()
         self.opt = P.ApplyFtrl(use_locking=use_locking)
-        self.sparse_opt = P.FusedSparseFtrl(learning_rate, l1, l2, lr_power, use_locking=use_locking)
+        self.use_locking = use_locking
+        self.sparse_opt = P.SparseApplyFtrl(learning_rate, l1, l2, lr_power, use_locking=use_locking)
         self._ps_pull = P.Pull()
         self._ps_push = P.Push("Ftrl", [0, 1, 2])
         self._ps_push.add_prim_attr("init_accum", initial_accum)
@@ -174,9 +177,26 @@ class FTRL(Optimizer):
         linear = self.linear
         grads = self.decay_weight(grads)
         grads = self.scale_grad(grads)
+        grads = self._grad_sparse_indices_deduplicate(grads)
         lr = self.get_lr()
 
         success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
                                       self.l1, self.l2, self.lr_power, lr),
                             linear, grads, params, moments, self.ps_parameters)
         return success
+
+    @Optimizer.target.setter
+    def target(self, value):
+        """If the input value is set to "CPU", the parameters will be updated on the host using the Fused
+           optimizer operation."""
+
+        if value not in ('CPU', 'Ascend'):
+            raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(value))
+
+        if value == 'CPU':
+            self.sparse_opt = P.FusedSparseFtrl(self.lr, self.l1, self.l2, self.lr_power, self.use_locking)
+            self.sparse_opt.add_prim_attr("primitive", "CPU")
+        else:
+            self.sparse_opt = P.SparseApplyFtrl(self.lr, self.l1, self.l2, self.lr_power, self.use_locking)
+
+        self._target = value
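Usage-wise, the new `target` setter replaces the old trick of reaching into `sparse_opt` with `add_prim_attr`. A minimal sketch based on the unit tests added later in this patch — the toy `SparseNet` below is made up for illustration and is not part of the patch:

# Usage sketch of the new `target` property (mirrors the unit tests added in this patch).
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor, Parameter, context
from mindspore.nn import TrainOneStepCell
from mindspore.nn.optim import Adam
from mindspore.ops import operations as P

context.set_context(enable_sparse=True)

class SparseNet(nn.Cell):
    """Toy network with a sparse forward pass, modelled on the tests in this patch."""
    def __init__(self):
        super(SparseNet, self).__init__()
        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1")
        self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="weight2")
        self.gather = P.SparseGatherV2()

    def construct(self, indices, label):
        return self.gather(self.weight1, indices, 0) + self.weight2

net = SparseNet()
net.set_train()
optimizer = Adam(net.trainable_params(), learning_rate=0.1)
optimizer.target = 'CPU'      # host-side fused sparse kernel; 'Ascend' keeps the update on the device
train_network = TrainOneStepCell(net, optimizer)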
diff --git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py
index a08b809ac0..93b5011494 100644
--- a/mindspore/nn/optim/lazyadam.py
+++ b/mindspore/nn/optim/lazyadam.py
@@ -27,31 +27,57 @@ from .optimizer import Optimizer
 _lazy_adam_opt = C.MultitypeFuncGraph("lazy_adam_opt")
 
 
-@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor",
-                         "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
-def _run_opt_with_sparse(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps,
-                         lr, gradient, params, moment1, moment2, ps_parameter):
+@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power, beta2_power,
+                         beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter):
     """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
     success = True
     indices = gradient.indices
     values = gradient.values
     if ps_parameter:
         op_shape = P.Shape()
-        shapes = (op_shape(params), op_shape(moment1), op_shape(moment2),
+        shapes = (op_shape(params), op_shape(m), op_shape(v),
                   op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                   op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
         success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices),
                                               shapes), params))
-    else:
-        success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
+        return success
+
+    if not target:
+        success = F.depend(success, sparse_opt(params, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                                eps, values, indices))
+    else:
+        op_gather = P.GatherV2()
+        op_sqrt = P.Sqrt()
+        scatter_add = P.ScatterAdd(use_locking)
+        scatter_update = P.ScatterUpdate(use_locking)
+
+        m_slice = op_gather(m, indices, 0)
+        v_slice = op_gather(v, indices, 0)
+
+        next_m = m_slice * beta1 + values * (1 - beta1)
+        next_v = v_slice * beta2 + values * values * (1 - beta2)
+
+        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
+
+        if use_nesterov:
+            m_temp = beta1 * next_m + values * (1 - beta1)
+            param_update = m_temp / (op_sqrt(next_v) + eps)
+        else:
+            param_update = next_m / (op_sqrt(next_v) + eps)
+
+        success = F.depend(success, scatter_add(params, indices, - lr_t * param_update))
+        success = F.depend(success, scatter_update(m, indices, next_m))
+        success = F.depend(success, scatter_update(v, indices, next_v))
+
     return success
 
 
-@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor",
-                         "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
-def _run_opt_with_one_number(opt, sparse_opt, push, pull, beta1_power, beta2_power, beta1, beta2, eps,
-                             lr, gradient, params, moment1, moment2, ps_parameter):
+@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
+                         "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool")
+def _run_opt_with_one_number(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
+                             beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2, ps_parameter):
     """Apply lazy adam optimizer to the weight parameter using Tensor."""
     success = True
     if ps_parameter:
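For reference, the gather/scatter branch above only reads and writes the rows addressed by `indices`, which is what makes the update "lazy". A rough NumPy sketch of the same per-row math, for illustration only — the helper name `lazy_adam_step` is made up, and the indices are assumed unique thanks to the deduplication added in optimizer.py:

# Illustration only: NumPy equivalent of the lazy Adam device branch above.
import numpy as np

def lazy_adam_step(param, m, v, indices, values, lr, beta1, beta2, eps,
                   beta1_power, beta2_power, use_nesterov=False):
    # Only the rows selected by `indices` are touched; all other rows keep stale moments.
    m_slice = m[indices]
    v_slice = v[indices]
    next_m = m_slice * beta1 + values * (1 - beta1)
    next_v = v_slice * beta2 + values * values * (1 - beta2)
    lr_t = lr * np.sqrt(1 - beta2_power) / (1 - beta1_power)
    if use_nesterov:
        update = beta1 * next_m + values * (1 - beta1)
    else:
        update = next_m
    # With unique indices, ScatterAdd of -lr_t * update equals an indexed subtraction.
    param[indices] -= lr_t * update / (np.sqrt(next_v) + eps)
    m[indices] = next_m
    v[indices] = next_v
    return param, m, v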
@@ -108,7 +134,7 @@ class LazyAdam(Optimizer):
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network.
         The sparse behavior, to be notice, is not equivalent to the original Adam algorithm, as only the current
         indices parames will be updated. The sparse feature is under
-        continuous development. The sparse behavior is currently performed on the CPU.
+        continuous development. If the sparse strategy is to be executed on the host, set the target to the CPU.
 
     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
@@ -191,14 +217,14 @@ class LazyAdam(Optimizer):
         self.eps = Tensor(eps, mstype.float32)
         self.use_nesterov = use_nesterov
         self.use_locking = use_locking
-
+        self._is_device = True
         self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
         self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')
 
         self.hyper_map = C.HyperMap()
         self.opt = P.Adam(use_locking, use_nesterov)
         self.sparse_opt = P.FusedSparseLazyAdam(use_locking, use_nesterov)
-
+        self.sparse_opt.add_prim_attr("primitive", "CPU")
         self._ps_pull = P.Pull()
         self._ps_push = P.Push("Adam", [0, 1, 2])
         self._ps_push.add_prim_attr("use_nesterov", use_nesterov)
@@ -206,6 +232,7 @@ class LazyAdam(Optimizer):
     def construct(self, gradients):
         gradients = self.decay_weight(gradients)
         gradients = self.scale_grad(gradients)
+        gradients = self._grad_sparse_indices_deduplicate(gradients)
         lr = self.get_lr()
 
         self.beta1_power = self.beta1_power * self.beta1
@@ -213,10 +240,22 @@
         if self.is_group_lr:
             success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.use_locking, self.use_nesterov, self._is_device,
                                           self.beta1_power, self.beta2_power, self.beta1, self.beta2, self.eps),
                                 lr, gradients, self.parameters, self.moment1, self.moment2, self.ps_parameters)
         else:
             success = self.map_(F.partial(_lazy_adam_opt, self.opt, self.sparse_opt, self._ps_push, self._ps_pull,
+                                          self.use_locking, self.use_nesterov, self._is_device,
                                           self.beta1_power, self.beta2_power, self.beta1, self.beta2, self.eps, lr),
                                 gradients, self.parameters, self.moment1, self.moment2, self.ps_parameters)
         return success
+
+    @Optimizer.target.setter
+    def target(self, value):
+        """If the input value is set to "CPU", the parameters will be updated on the host using the Fused
+           optimizer operation."""
+        if value not in ('CPU', 'Ascend'):
+            raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(value))
+
+        self._is_device = (value != 'CPU')
+        self._target = value
diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index 5af4392c61..e89574eed4 100755
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -26,7 +26,6 @@ from mindspore.common.initializer import initializer
 from mindspore.common.tensor import Tensor, RowTensor
 import mindspore.common.dtype as mstype
 from mindspore._checkparam import Validator as validator
-from mindspore._checkparam import Rel
 from mindspore import log as logger
 from mindspore.parallel._utils import _get_global_rank, _get_device_num, _get_parallel_mode
 from mindspore.context import ParallelMode
@@ -105,6 +104,8 @@ class Optimizer(Cell):
 
         weight_decay = self._preprocess_weight_decay(weight_decay)
 
+        self._unique = True
+        self._target = 'Ascend'
         self.dynamic_lr = False
         self.assignadd = None
         self.global_step = None
@@ -173,6 +174,30 @@ class Optimizer(Cell):
         else:
             self.optim_filter = (True,) * self.param_length
 
+    @property
+    def unique(self):
+        """Whether the gradient indices are made unique before the update. This property is read-only."""
+        return self._unique
+
+    @unique.setter
+    def unique(self, value):
+        """Set whether the gradient indices are to be made unique."""
+        if not isinstance(value, bool):
+            raise TypeError("The value type must be bool, but got value type is {}".format(type(value)))
+        self._unique = value
+
+    @property
+    def target(self):
+        """The value of `target` determines whether the parameter update is performed on the host
+           or on the device. This property is read-only."""
+        return self._target
+
+    @target.setter
+    def target(self, value):
+        """If the input value is set to "CPU", the parameters will be updated on the host using the Fused
+           optimizer operation."""
+        raise NotImplementedError
+
     def decay_weight(self, gradients):
         """
         Weight decay.
@@ -217,6 +242,12 @@ class Optimizer(Cell):
 
         return gradients
 
+    def _grad_sparse_indices_deduplicate(self, gradients):
+        """In the case of using big operators, deduplicate the 'indices' in gradients."""
+        if self._target != 'CPU' and self._unique:
+            gradients = self.map_(F.partial(_indices_deduplicate), gradients)
+        return gradients
+
     def _preprocess_weight_decay(self, weight_decay):
         """Check weight decay, and convert int to float."""
         if isinstance(weight_decay, (float, int)):
@@ -514,7 +545,7 @@ def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
 
 
 _grad_scale = C.MultitypeFuncGraph("grad_scale")
-
+_indices_deduplicate = C.MultitypeFuncGraph("indices_deduplicate")
 
 @_grad_scale.register("Number", "Tensor")
 def tensor_grad_scale(scale, grad):
@@ -532,6 +563,24 @@ def tensor_grad_scale_with_sparse(scale, grad):
     return RowTensor(grad.indices, grad.values * scale, grad.dense_shape)
 
 
+@_indices_deduplicate.register("RowTensor")
+def rowtensor_deduplicate_indices_slices(grad):
+    """Uniquify the indices and sum the 'values' corresponding to the duplicate indices."""
+    indices = grad.indices
+    values = grad.values
+
+    unique_indices, index_position = P.Unique()(indices)
+    summed_values = P.UnsortedSegmentSum()(values, index_position, P.DynamicShape()(unique_indices)[0])
+
+    return RowTensor(unique_indices, summed_values, grad.dense_shape)
+
+
+@_indices_deduplicate.register("Tensor")
+def tensor_deduplicate_indice_slices(grad):
+    """Return the input gradient directly in the dense scenario."""
+    return grad
+
+
 class _ConvertToCell(LearningRateSchedule):
     """Inner api, convert learning rate of scalar to LearningRateSchedule."""
     def __init__(self, learning_rate):
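A small NumPy illustration of what `rowtensor_deduplicate_indices_slices` above computes with Unique and UnsortedSegmentSum — illustration only: `deduplicate_rows` is a made-up name, and np.unique additionally sorts the indices, which the MindSpore op does not guarantee. The pass only runs when `target` is not 'CPU' and the `unique` flag keeps its default of True.

# Illustration only: NumPy equivalent of the indices deduplication added above.
import numpy as np

def deduplicate_rows(indices, values):
    # Duplicate row indices are merged and their gradient rows summed,
    # matching Unique + UnsortedSegmentSum (up to index ordering).
    unique_indices, index_position = np.unique(indices, return_inverse=True)
    summed_values = np.zeros((unique_indices.shape[0],) + values.shape[1:], dtype=values.dtype)
    np.add.at(summed_values, index_position, values)
    return unique_indices, summed_values

# Example: the two rows with index 0 are merged and their values summed.
idx = np.array([0, 2, 0])
val = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
print(deduplicate_rows(idx, val))   # -> (array([0, 2]), array([[4., 4.], [2., 2.]]))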
diff --git a/mindspore/nn/optim/proximal_ada_grad.py b/mindspore/nn/optim/proximal_ada_grad.py
index cfdc3a5820..4529b201c5 100644
--- a/mindspore/nn/optim/proximal_ada_grad.py
+++ b/mindspore/nn/optim/proximal_ada_grad.py
@@ -17,7 +17,6 @@ from mindspore.ops import functional as F, composite as C, operations as P
 from mindspore.common import Tensor
 import mindspore.common.dtype as mstype
 from mindspore._checkparam import Validator as validator
-from mindspore._checkparam import Rel
 from .optimizer import Optimizer
 
 _proximal_ada_grad_opt = C.MultitypeFuncGraph("proximal_ada_grad_opt")
@@ -66,8 +65,8 @@ class ProximalAdagrad(Optimizer):
         To improve parameter groups performance, the customized order of parameters can be supported.
 
         The sparse strategy is applied while the SparseGatherV2 operator being used for forward network.
-        The sparse feature is under continuous development. The sparse
-        behavior is currently performed on the CPU.
+        The sparse feature is under continuous development. If the sparse strategy is to be executed on the host,
+        set the target to the CPU.
 
     Args:
         params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
@@ -136,14 +135,16 @@ class ProximalAdagrad(Optimizer):
         self.l1 = Tensor(l1, mstype.float32)
         self.l2 = Tensor(l2, mstype.float32)
         self.hyper_map = C.HyperMap()
+        self.use_locking = use_locking
         self.opt = P.ApplyProximalAdagrad(use_locking=use_locking)
-        self.sparse_opt = P.FusedSparseProximalAdagrad(use_locking=use_locking)
+        self.sparse_opt = P.SparseApplyProximalAdagrad(use_locking=use_locking)
 
     def construct(self, grads):
         params = self.parameters
         accum = self.accum
         grads = self.decay_weight(grads)
         grads = self.scale_grad(grads)
+        grads = self._grad_sparse_indices_deduplicate(grads)
         lr = self.get_lr()
         if self.is_group_lr:
             success = self.map_(F.partial(_proximal_ada_grad_opt, self.opt, self.sparse_opt, self.l1, self.l2), lr,
@@ -152,3 +153,18 @@
             success = self.map_(F.partial(_proximal_ada_grad_opt, self.opt, self.sparse_opt, self.l1, self.l2, lr),
                                 grads, params, accum)
         return success
+
+    @Optimizer.target.setter
+    def target(self, value):
+        """If the input value is set to "CPU", the parameters will be updated on the host using the Fused
+           optimizer operation."""
+
+        if value not in ('CPU', 'Ascend'):
+            raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(value))
+
+        if value == 'CPU':
+            self.sparse_opt = P.FusedSparseProximalAdagrad(self.use_locking).add_prim_attr("primitive", "CPU")
+        else:
+            self.sparse_opt = P.SparseApplyProximalAdagrad(self.use_locking)
+
+        self._target = value
diff --git a/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py b/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
index f7b8914b90..f07e60fee9 100644
--- a/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
+++ b/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
@@ -345,8 +345,8 @@ class TrainStepWrap(nn.Cell):
                 self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
             self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                     l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
-            self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
-            self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
+            self.optimizer_w.target = "CPU"
+            self.optimizer_d.target = "CPU"
         else:
             self.optimizer_d = Adam(
                 self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
diff --git a/tests/st/ps/cmp_sparse_embedding/test_cmp_sparse_embedding.py b/tests/st/ps/cmp_sparse_embedding/test_cmp_sparse_embedding.py
index 1a4b0765cb..c95bd582d7 100644
--- a/tests/st/ps/cmp_sparse_embedding/test_cmp_sparse_embedding.py
+++ b/tests/st/ps/cmp_sparse_embedding/test_cmp_sparse_embedding.py
@@ -74,7 +74,7 @@ def do_sparse_embedding(ps=False):
         net.embedding.embedding_table.set_param_ps()
 
     optimizer = Adam(filter(lambda x: x.requires_grad, net.get_parameters()))
-    optimizer.sparse_opt.add_prim_attr("primitive_target", "CPU")
+    optimizer.target = 'CPU'
     criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
     net_with_criterion = WithLossCell(net, criterion)
     train_network = TrainOneStepCell(net_with_criterion, optimizer)
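The call-site updates above and in the tests below all follow the same migration; a hedged before/after sketch for downstream code, assuming `optimizer` is one of the sparse-capable optimizers touched in this patch (Adam, LazyAdam, FTRL, ProximalAdagrad):

# Migration sketch for downstream call sites.
# Before: reach into the fused kernel and pin it to the host.
#     optimizer.sparse_opt.add_prim_attr("primitive_target", "CPU")
# After: use the public property, which validates the value and swaps or
# configures the sparse kernel internally.
optimizer.target = "CPU"       # host update via the Fused* kernels
# optimizer.target = "Ascend"  # device update (the default)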
diff --git a/tests/ut/python/ir/test_row_tensor.py b/tests/ut/python/ir/test_row_tensor.py
index f0fbcdce3e..1f36817077 100644
--- a/tests/ut/python/ir/test_row_tensor.py
+++ b/tests/ut/python/ir/test_row_tensor.py
@@ -465,7 +465,7 @@ def test_embedding_lookup_with_mix_precision():
     criterion = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
     optimizer = nn.Adam(params=net.trainable_params(), learning_rate=0.1)
-    optimizer.sparse_opt.add_prim_attr("primitive_target", "CPU")
+    optimizer.target = 'CPU'
     train_network = ms.amp.build_train_network(net, optimizer, criterion, level="O2")
     train_network.set_train()
     for _ in range(2):
diff --git a/tests/ut/python/nn/optim/test_adam.py b/tests/ut/python/nn/optim/test_adam.py
index bebbc00880..e8e6bd9cac 100644
--- a/tests/ut/python/nn/optim/test_adam.py
+++ b/tests/ut/python/nn/optim/test_adam.py
@@ -109,6 +109,19 @@ def test_sparse_adam_compile():
     net = NetWithSparseGatherV2()
     net.set_train()
 
+    optimizer = Adam(net.trainable_params(), learning_rate=0.1, loss_scale=1024.0, weight_decay=0.9)
+    optimizer.target = 'CPU'
+    train_network = TrainOneStepCell(net, optimizer)
+    _executor.compile(train_network, indices, label)
+
+
+def test_sparse_adam():
+    """ test_sparse_adam """
+    indices = Tensor(np.array([0, 1]).astype(np.int32))
+    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
     optimizer = Adam(net.trainable_params(), learning_rate=0.1, loss_scale=1024.0, weight_decay=0.9)
     train_network = TrainOneStepCell(net, optimizer)
     _executor.compile(train_network, indices, label)
diff --git a/tests/ut/python/nn/optim/test_ftrl.py b/tests/ut/python/nn/optim/test_ftrl.py
index 670bebc92d..2b5f1ef481 100644
--- a/tests/ut/python/nn/optim/test_ftrl.py
+++ b/tests/ut/python/nn/optim/test_ftrl.py
@@ -72,5 +72,19 @@ def test_spares_ftrl_compile():
     net.set_train()
 
     optimizer = FTRL(net.trainable_params(), weight_decay=0.9, loss_scale=2.0)
+    optimizer.target = 'CPU'
+    train_network = TrainOneStepCell(net, optimizer)
+    _executor.compile(train_network, indices, label)
+
+
+def test_spares_ftrl():
+    """ test sparse ftrl """
+    indices = Tensor(np.array([0, 1]).astype(np.int32))
+    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = FTRL(net.trainable_params(), weight_decay=0.9, loss_scale=2.0)
+    optimizer.target = 'Ascend'
     train_network = TrainOneStepCell(net, optimizer)
     _executor.compile(train_network, indices, label)
diff --git a/tests/ut/python/nn/optim/test_lazyadam.py b/tests/ut/python/nn/optim/test_lazyadam.py
index 7769597140..f97b22f9da 100644
--- a/tests/ut/python/nn/optim/test_lazyadam.py
+++ b/tests/ut/python/nn/optim/test_lazyadam.py
@@ -76,6 +76,20 @@ def test_spares_lazy_adam_compile():
     net.set_train()
 
     optimizer = LazyAdam(net.trainable_params(), learning_rate=0.1, weight_decay=0.9, loss_scale=2.0)
+    optimizer.target = 'CPU'
+    train_network = TrainOneStepCell(net, optimizer)
+    _executor.compile(train_network, indices, label)
+
+
+def test_spares_lazy_adam():
+    """ test sparse lazy adam """
+    indices = Tensor(np.array([0, 1]).astype(np.int32))
+    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = LazyAdam(net.trainable_params(), learning_rate=0.1, weight_decay=0.9, loss_scale=2.0)
+    optimizer.target = 'Ascend'
     train_network = TrainOneStepCell(net, optimizer)
     _executor.compile(train_network, indices, label)
diff --git a/tests/ut/python/nn/optim/test_proximal_ada_grad.py b/tests/ut/python/nn/optim/test_proximal_ada_grad.py
index d88c55fd70..d8a8198072 100644
--- a/tests/ut/python/nn/optim/test_proximal_ada_grad.py
+++ b/tests/ut/python/nn/optim/test_proximal_ada_grad.py
@@ -71,6 +71,19 @@ def test_spares_proximal_ada_grad_compile():
     net = NetWithSparseGatherV2()
     net.set_train()
 
+    optimizer = ProximalAdagrad(net.trainable_params(), weight_decay=0.9, loss_scale=1024.0)
+    optimizer.target = 'CPU'
+    train_network = TrainOneStepCell(net, optimizer)
+    _executor.compile(train_network, indices, label)
+
+
+def test_spares_proximal_ada_grad():
+    """ test sparse proximal_ada_grad """
+    indices = Tensor(np.array([0, 1]).astype(np.int32))
+    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
     optimizer = ProximalAdagrad(net.trainable_params(), weight_decay=0.9, loss_scale=1024.0)
     train_network = TrainOneStepCell(net, optimizer)
     _executor.compile(train_network, indices, label)
diff --git a/tests/ut/python/nn/optim/test_target.py b/tests/ut/python/nn/optim/test_target.py
new file mode 100644
index 0000000000..f518fedc6d
--- /dev/null
+++ b/tests/ut/python/nn/optim/test_target.py
@@ -0,0 +1,75 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+""" test optimizer target """
+import numpy as np
+from mindspore.nn.optim import LazyAdam, FTRL, Adam, ProximalAdagrad
+import mindspore.nn as nn
+from mindspore import Tensor, Parameter, context
+from mindspore.ops import operations as P
+
+context.set_context(enable_sparse=True)
+
+
+class NetWithSparseGatherV2(nn.Cell):
+    """ NetWithSparseGatherV2 definition """
+    def __init__(self):
+        super(NetWithSparseGatherV2, self).__init__()
+        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1")
+        self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
+        self.axis = 0
+        self.gather = P.SparseGatherV2()
+
+    def construct(self, indices, label):
+        return self.gather(self.weight1, indices, self.axis) + self.weight2
+
+
+def test_ftrl_target():
+    """ test_ftrl_target """
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = FTRL(net.trainable_params(), weight_decay=0.9, loss_scale=2.0)
+    if optimizer.target not in ('CPU', 'Ascend'):
+        raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(optimizer.target))
+
+
+def test_lazyadam_target():
+    """ test_lazyadam_target """
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = LazyAdam(net.trainable_params(), learning_rate=0.1, weight_decay=0.9, loss_scale=2.0)
+    if optimizer.target not in ('CPU', 'Ascend'):
+        raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(optimizer.target))
+
+
+def test_adam_target():
+    """ test_adam_target """
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = Adam(net.trainable_params(), learning_rate=0.1, loss_scale=1024.0, weight_decay=0.9)
+    if optimizer.target not in ('CPU', 'Ascend'):
+        raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(optimizer.target))
+
+
+def test_proximal_target():
+    """ test_proximal_target """
+    net = NetWithSparseGatherV2()
+    net.set_train()
+
+    optimizer = ProximalAdagrad(net.trainable_params(), weight_decay=0.9, loss_scale=1024.0)
+    if optimizer.target not in ('CPU', 'Ascend'):
+        raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(optimizer.target))