From 52790b74e685da24ea25fc47351da08591f581cf Mon Sep 17 00:00:00 2001 From: liuxiao Date: Tue, 9 Jun 2020 21:01:41 +0800 Subject: [PATCH] Add some description to API about optimizer. --- mindspore/nn/optim/adam.py | 17 +++++++++-------- mindspore/nn/optim/lamb.py | 10 ++++++---- mindspore/nn/optim/lars.py | 4 ++-- mindspore/nn/optim/sgd.py | 9 +++++---- mindspore/ops/operations/nn_ops.py | 4 ++-- 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py index 40237a22d7..8844fc415e 100755 --- a/mindspore/nn/optim/adam.py +++ b/mindspore/nn/optim/adam.py @@ -162,13 +162,14 @@ class Adam(Optimizer): in the value of 'order_params' but not in any group will use default learning rate and default weight decay. - learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is - Iterable or a Tensor and the dims of the Tensor is 1, - use dynamic learning rate, then the i-th step will - take the i-th value as the learning rate. - When the learning_rate is float or learning_rate is a Tensor - but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Default: 1e-3. + learning_rate (Union[int, float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is + Iterable or a Tensor and the dims of the Tensor is 1, + use dynamic learning rate, then the i-th step will + take the i-th value as the learning rate. + When the learning_rate is float or learning_rate is a + Tensor but the dims of the Tensor is 0, use fixed learning + rate. Other cases are not supported. It should be equal to + or greater than 0. Default: 1e-3. beta1 (float): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). Default: 0.9. beta2 (float): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). 
Default: @@ -181,7 +182,7 @@ class Adam(Optimizer): use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients. If True, updates the gradients using NAG. If False, updates the gradients without using NAG. Default: False. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0. Inputs: diff --git a/mindspore/nn/optim/lamb.py b/mindspore/nn/optim/lamb.py index d8cc5b4ce4..83a299b742 100755 --- a/mindspore/nn/optim/lamb.py +++ b/mindspore/nn/optim/lamb.py @@ -143,10 +143,12 @@ class Lamb(Optimizer): params (list[Parameter]): A list of parameter, which will be updated. The element in `params` should be class mindspore.Parameter. decay_steps (int): The steps of the lr decay. Should be equal to or greater than 1. - warmup_steps (int): The steps of lr warm up. Default: 0. - start_learning_rate (float): A floating point value for the learning rate. Default: 0.1. - end_learning_rate (float): A floating point value for the end learning rate. Default: 0.0001. - power (float): The power of the polynomial. Default: 1.0. + warmup_steps (int): The steps of lr warm up. Should be equal to or greater than 0. Default: 0. + start_learning_rate (float): A floating point value for the learning rate. Should be equal to + or greater than 0. Default: 0.1. + end_learning_rate (float): A floating point value for the end learning rate. Should be equal to + or greater than 0. Default: 0.0001. + power (float): The power of the polynomial. It must be positive. Default: 1.0. beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9. Should be in range (0.0, 1.0). beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999. 
diff --git a/mindspore/nn/optim/lars.py b/mindspore/nn/optim/lars.py index 001a578ffe..ca554788a4 100755 --- a/mindspore/nn/optim/lars.py +++ b/mindspore/nn/optim/lars.py @@ -59,13 +59,13 @@ class LARS(Optimizer): optimizer (Optimizer): MindSpore optimizer for which to wrap and modify gradients. epsilon (float): Term added to the denominator to improve numerical stability. Default: 1e-05. hyperpara (float): Trust coefficient for calculating the local learning rate. Default: 0.001. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. use_clip (bool): Whether to use clip operation for calculating the local learning rate. Default: False. decay_filter (Function): A function to determine whether apply weight decay on parameters. Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. lars_filter (Function): A function to determine whether apply lars algorithm. Default: lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. - loss_scale (float): A floating point value for the loss scale. Default: 1.0. + loss_scale (float): A floating point value for the loss scale. It should be greater than 0. Default: 1.0. Inputs: - **gradients** (tuple[Tensor]) - The gradients of `params` in optimizer, the shape is diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py index a7493400f8..d138fea529 100755 --- a/mindspore/nn/optim/sgd.py +++ b/mindspore/nn/optim/sgd.py @@ -73,10 +73,11 @@ class SGD(Optimizer): take the i-th value as the learning rate. When the learning_rate is float or learning_rate is a Tensor but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Default: 0.1. - momentum (float): A floating point value the momentum. Default: 0.0. - dampening (float): A floating point value of dampening for momentum. Default: 0.0. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. 
+ Other cases are not supported. It should be equal to or + greater than 0. Default: 0.1. + momentum (float): A floating point value of the momentum. Should be at least 0.0. Default: 0.0. + dampening (float): A floating point value of dampening for momentum. Should be at least 0.0. Default: 0.0. + weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0. nesterov (bool): Enables the Nesterov momentum. Default: False. loss_scale (float): A floating point value for the loss scale, which should be larger than 0.0. Default: 1.0. diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index 91b09d2553..f3e1392553 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -3159,7 +3159,7 @@ class SparseApplyFtrl(PrimitiveWithInfer): validator.check_value_type("l1", l1, [float], self.name) validator.check_value_type("l2", l2, [float], self.name) validator.check_value_type("lr_power", lr_power, [float], self.name) - self.lr = validator.check_number_range("lr", lr, 0.0, float("inf"), Rel.INC_LEFT, self.name) + self.lr = validator.check_number_range("lr", lr, 0.0, float("inf"), Rel.INC_NEITHER, self.name) self.l1 = validator.check_number("l1", l1, 0.0, Rel.GE, self.name) self.l2 = validator.check_number("l2", l2, 0.0, Rel.GE, self.name) self.lr_power = validator.check_number("lr_power", lr_power, 0, Rel.LE, self.name) @@ -3350,7 +3350,7 @@ class CTCLoss(PrimitiveWithInfer): """ @prim_attr_register - def __init__(self, preprocess_collapse_repeated=False, ctc_merge_repeated=False, + def __init__(self, preprocess_collapse_repeated=False, ctc_merge_repeated=True, ignore_longer_outputs_than_inputs=False): self.init_prim_io_names(inputs=["inputs", "labels_indices", "labels_values", "sequence_length"], outputs=["loss", "gradient"])