!6514 fix nn & operations api comments

Merge pull request !6514 from panfengfeng/fix_api
pull/6514/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 89cd6bf07a

@ -614,7 +614,7 @@ class Cell(Cell_):
"""
Defines the computation to be performed.
This method should be overridden by all subclasses.
This method must be overridden by all subclasses.
Note:
The inputs of the top cell only allow Tensor.
@ -748,7 +748,7 @@ class Cell(Cell_):
Yields parameters of this cell. If `expand` is True, yield parameters of this cell and all subcells.
Args:
expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, only yield parameters
expand (bool): If true, yields parameters of this cell and all subcells. Otherwise, only yield parameters
that are direct members of this cell. Default: True.
Examples:
@ -775,7 +775,7 @@ class Cell(Cell_):
Args:
name_prefix (str): Namespace. Default: ''.
expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, only yield parameters
expand (bool): If true, yields parameters of this cell and all subcells. Otherwise, only yield parameters
that are direct members of this cell. Default: True.
Examples:
@ -990,7 +990,7 @@ class Cell(Cell_):
Set the cell backward hook function. Note that this function is only supported in Pynative Mode.
Note:
fn should be defined as the following code. `cell_name` is the name of registered cell.
fn must be defined as the following code. `cell_name` is the name of registered cell.
`grad_input` is the gradient passed to the cell. `grad_output` is the gradient computed and passed to the
next cell or primitive, which may be modified and returned.
>>> hook_fn(cell_name, grad_input, grad_output) -> Tensor or None

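A minimal sketch of the hook contract documented in this hunk, for context: it assumes PyNative mode, and the `ReLU` cell and print-only hook body are illustrative, not part of the change.

    import numpy as np
    import mindspore.nn as nn
    from mindspore import Tensor, context
    from mindspore.ops import composite as C

    context.set_context(mode=context.PYNATIVE_MODE)

    def hook_fn(cell_name, grad_input, grad_output):
        # grad_input: gradient passed to the cell; grad_output: gradient
        # computed and passed to the next cell or primitive. Returning
        # None leaves it unchanged; returning a Tensor replaces it.
        print("backward through:", cell_name)
        return None

    net = nn.ReLU()
    net.register_backward_hook(hook_fn)
    x = Tensor(np.array([[1.0, -2.0]], np.float32))
    C.GradOperation()(net)(x)  # the backward pass triggers hook_fn
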
@ -90,7 +90,7 @@ def exponential_decay_lr(learning_rate, decay_rate, total_step, step_per_epoch,
total_step (int): The total number of steps.
step_per_epoch (int): The number of steps per epoch.
decay_epoch (int): A value used to calculate decayed learning rate.
is_stair (bool): If True, learning rate is decayed once every `decay_epoch` times. Default: False.
is_stair (bool): If true, learning rate is decayed once every `decay_epoch` times. Default: False.
Returns:
list[float]. The size of list is `total_step`.
@ -132,7 +132,7 @@ def natural_exp_decay_lr(learning_rate, decay_rate, total_step, step_per_epoch,
total_step (int): The total number of steps.
step_per_epoch (int): The number of steps per epoch.
decay_epoch (int): A value used to calculate decayed learning rate.
is_stair (bool): If True, learning rate is decayed once every `decay_epoch` times. Default: False.
is_stair (bool): If true, learning rate is decayed once every `decay_epoch` times. Default: False.
Returns:
list[float]. The size of list is `total_step`.
@ -175,7 +175,7 @@ def inverse_decay_lr(learning_rate, decay_rate, total_step, step_per_epoch, deca
total_step (int): The total number of steps.
step_per_epoch (int): The number of steps per epoch.
decay_epoch (int): A value used to calculate decayed learning rate.
is_stair (bool): If True, learning rate is decayed once every `decay_epoch` times. Default: False.
is_stair (bool): If true, learning rate is decayed once every `decay_epoch` times. Default: False.
Returns:
list[float]. The size of list is `total_step`.
@ -283,7 +283,7 @@ def polynomial_decay_lr(learning_rate, end_learning_rate, total_step, step_per_e
total_step (int): The total number of steps.
step_per_epoch (int): The number of steps per epoch.
decay_epoch (int): A value used to calculate decayed learning rate.
power (float): A value used to calculate decayed learning rate. This parameter should be greater than 0.
power (float): A value used to calculate decayed learning rate. This parameter must be greater than 0.
update_decay_epoch (bool): If true, update `decay_epoch`. Default: False.
Returns:

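The decay functions above all share the `is_stair` semantics; here is a plain-Python sketch of the documented behaviour for `exponential_decay_lr` (the helper name is mine, not the library's):

    import math

    def exponential_decay_lr_sketch(learning_rate, decay_rate, total_step,
                                    step_per_epoch, decay_epoch, is_stair=False):
        # is_stair floors the exponent, so the rate only drops once
        # every `decay_epoch` epochs instead of decaying continuously.
        lrs = []
        for i in range(total_step):
            p = math.floor(i / step_per_epoch) / decay_epoch
            if is_stair:
                p = math.floor(p)
            lrs.append(learning_rate * decay_rate ** p)
        return lrs  # length equals total_step, matching the Returns section

    print(exponential_decay_lr_sketch(0.1, 0.9, 6, 2, 1, is_stair=True))
    # [0.1, 0.1, 0.09, 0.09, 0.081, 0.081]
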
@ -106,11 +106,11 @@ class MinimumGrad(GraphKernel):
"""
Backprop function for Minimum operator.
Compares x and y elementwise, dout should has the same shape with x and y.
Compares x and y elementwise; dout must have the same shape as x and y.
Inputs:
- **x** (Tensor) - The first input
- **y** (Tensor) - x and y should have same shape
- **y** (Tensor) - x and y must have the same shape
- **dout** (Tensor) - Has the same shape as x and y, next operator's backprop output
Outputs:
@ -274,7 +274,7 @@ class EqualCount(GraphKernel):
"""
Computes the number of the same elements of two tensors.
The two input tensors should have the same shape and data type.
The two input tensors must have the same shape and data type.
Inputs:
x (Tensor): the first input tensor.
@ -309,8 +309,8 @@ class ReduceMean(GraphKernel):
The dtype of the tensor to be reduced is number.
Args:
keep_dims (bool): If True, keep these reduced dimensions and the length is 1.
If False, don't keep these dimensions. Default: False.
keep_dims (bool): If true, keep these reduced dimensions and the length is 1.
If false, don't keep these dimensions. Default: False.
Inputs:
- **input_x** (Tensor[Number]) - The input tensor.
@ -1000,10 +1000,10 @@ class LayerNorm(Cell):
normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
`begin_norm_axis ... R - 1`.
begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
`begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
`begin_norm_axis: rank(inputs)`, the value must be in [-1, rank(input)). Default: -1.
begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
the normalized inputs accordingly, the value must be in [-1, rank(input)). Default: -1.
gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'ones'.

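To make the `begin_norm_axis` constraint concrete, here is a NumPy reading of the normalization axes (gamma/beta scaling omitted; a sketch, not the MindSpore kernel):

    import numpy as np

    def layer_norm_sketch(x, begin_norm_axis=-1, eps=1e-7):
        # Normalize over dimensions begin_norm_axis .. rank-1; the axis
        # value must lie in [-1, rank(x)), as the docstring requires.
        axes = tuple(range(begin_norm_axis % x.ndim, x.ndim))
        mean = x.mean(axis=axes, keepdims=True)
        var = x.var(axis=axes, keepdims=True)
        return (x - mean) / np.sqrt(var + eps)

    x = np.random.randn(2, 3, 4).astype(np.float32)
    print(layer_norm_sketch(x, begin_norm_axis=1).shape)  # (2, 3, 4)
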
@ -279,7 +279,7 @@ class ClipByNorm(Cell):
where :math:`L_2(X)` is the :math:`L_2`-norm of :math:`X`.
Inputs:
- **input** (Tensor) - Tensor of shape N-D. The type should be float32 or float16.
- **input** (Tensor) - Tensor of shape N-D. The type must be float32 or float16.
- **clip_norm** (Tensor) - A scalar Tensor of shape :math:`()` or :math:`(1)`.
Outputs:
@ -336,7 +336,7 @@ class Norm(Cell):
Args:
axis (Union[tuple, int]): The axis over which to compute vector norms. Default: ().
keep_dims (bool): If True, the axis indicated in `axis` are kept with size 1. Otherwise,
keep_dims (bool): If true, the axes indicated in `axis` are kept with size 1. Otherwise,
the dimensions in `axis` are removed from the output shape. Default: False.
Inputs:
@ -507,12 +507,12 @@ class Unfold(Cell):
The input tensor must be a 4-D tensor and the data format is NCHW.
Args:
ksizes (Union[tuple[int], list[int]]): The size of sliding window, should be a tuple or a list of integers,
ksizes (Union[tuple[int], list[int]]): The size of sliding window, must be a tuple or a list of integers,
and the format is [1, ksize_row, ksize_col, 1].
strides (Union[tuple[int], list[int]]): Distance between the centers of the two consecutive patches,
should be a tuple or list of int, and the format is [1, stride_row, stride_col, 1].
must be a tuple or list of int, and the format is [1, stride_row, stride_col, 1].
rates (Union[tuple[int], list[int]]): In each extracted patch, the gap between the corresponding dimension
pixel positions, should be a tuple or a list of integers, and the format is [1, rate_row, rate_col, 1].
pixel positions, must be a tuple or a list of integers, and the format is [1, rate_row, rate_col, 1].
padding (str): The type of padding algorithm, is a string whose value is "same" or "valid",
not case sensitive. Default: "valid".
@ -575,7 +575,7 @@ class MatrixDiag(Cell):
float32, float16, int32, int8, and uint8.
Outputs:
Tensor, has the same type as input `x`. The shape should be x.shape + (x.shape[-1], ).
Tensor, has the same type as input `x`. The shape must be x.shape + (x.shape[-1], ).
Examples:
>>> x = Tensor(np.array([1, -1]), mstype.float32)
@ -606,7 +606,7 @@ class MatrixDiagPart(Cell):
float32, float16, int32, int8, and uint8.
Outputs:
Tensor, has the same type as input `x`. The shape should be x.shape[:-2] + [min(x.shape[-2:])].
Tensor, has the same type as input `x`. The shape must be x.shape[:-2] + [min(x.shape[-2:])].
Examples:
>>> x = Tensor([[[-1, 0], [0, 1]], [[-1, 0], [0, 1]], [[-1, 0], [0, 1]]], mindspore.float32)

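The ClipByNorm formula quoted in this hunk reduces to a two-line computation; a NumPy sketch (function name is mine):

    import numpy as np

    def clip_by_norm_sketch(x, clip_norm):
        # out = x * clip_norm / max(L2(x), clip_norm): the input is
        # rescaled only when its L2-norm exceeds clip_norm.
        l2 = np.sqrt(np.sum(x.astype(np.float32) ** 2))
        return x * clip_norm / max(l2, clip_norm)

    x = np.array([3.0, 4.0], dtype=np.float32)  # L2-norm is 5.0
    print(clip_by_norm_sketch(x, 1.0))          # [0.6 0.8], unit norm
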
@ -160,7 +160,7 @@ class Conv2d(_Conv):
must be 0.
- pad: Implicit paddings on both sides of the input. The number of `padding` will be padded to the input
Tensor borders. `padding` should be greater than or equal to 0.
Tensor borders. `padding` must be greater than or equal to 0.
padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
the paddings of top, bottom, left and right are the same, equal to padding. If `padding` is a tuple
@ -168,10 +168,10 @@ class Conv2d(_Conv):
padding[1], padding[2], and padding[3] accordingly. Default: 0.
dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
to use for dilated convolution. If set to be :math:`k > 1`, there will
be :math:`k - 1` pixels skipped for each sampling location. Its value should
be :math:`k - 1` pixels skipped for each sampling location. Its value must
be greater than or equal to 1 and bounded by the height and width of the
input. Default: 1.
group (int): Split filter into groups, `in_ channels` and `out_channels` should be
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. If the group is equal to `in_channels` and `out_channels`,
this 2D convolution layer also can be called 2D depthwise convolution layer. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
@ -239,7 +239,7 @@ class Conv2d(_Conv):
self.bias_add = P.BiasAdd()
def _init_depthwise_conv2d(self):
"""Init depthwise conv2d op"""
"""Initialize depthwise conv2d op"""
if context.get_context("device_target") == "Ascend" and self.group > 1:
self.dilation = self._dilation
validator.check_integer('group', self.group, self.in_channels, Rel.EQ)
@ -335,15 +335,15 @@ class Conv1d(_Conv):
must be 0.
- pad: Implicit paddings on both sides of the input. The number of `padding` will be padded to the input
Tensor borders. `padding` should be greater than or equal to 0.
Tensor borders. `padding` must be greater than or equal to 0.
padding (int): Implicit paddings on both sides of the input. Default: 0.
dilation (int): The data type is int. Specifies the dilation rate
to use for dilated convolution. If set to be :math:`k > 1`, there will
be :math:`k - 1` pixels skipped for each sampling location. Its value should
be :math:`k - 1` pixels skipped for each sampling location. Its value must
be greater than or equal to 1 and bounded by the height and width of the
input. Default: 1.
group (int): Split filter into groups, `in_ channels` and `out_channels` should be
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): An initializer for the convolution kernel.
@ -481,7 +481,7 @@ class Conv2dTranspose(_Conv):
width of the kernel.
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Its value should be equal to or greater than 1.
represent height and width of movement respectively. Its value must be equal to or greater than 1.
Default: 1.
pad_mode (str): Select the mode of the pad. The optional values are
"pad", "same", "valid". Default: "same".
@ -497,10 +497,10 @@ class Conv2dTranspose(_Conv):
padding[1], padding[2], and padding[3] accordingly. Default: 0.
dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
to use for dilated convolution. If set to be :math:`k > 1`, there will
be :math:`k - 1` pixels skipped for each sampling location. Its value should
be :math:`k - 1` pixels skipped for each sampling location. Its value must
be greater than or equal to 1 and bounded by the height and width of the
input. Default: 1.
group (int): Split filter into groups, `in_channels` and `out_channels` should be
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. This is not supported on Davinci devices when group > 1. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
@ -662,10 +662,10 @@ class Conv1dTranspose(_Conv):
padding (int): Implicit paddings on both sides of the input. Default: 0.
dilation (int): The data type is int. Specifies the dilation rate
to use for dilated convolution. If set to be :math:`k > 1`, there will
be :math:`k - 1` pixels skipped for each sampling location. Its value should
be :math:`k - 1` pixels skipped for each sampling location. Its value must
be greater than or equal to 1 and bounded by the width of the
input. Default: 1.
group (int): Split filter into groups, `in_channels` and `out_channels` should be
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. This is not supported on Davinci devices when group > 1. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.

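The `padding`/`dilation`/`stride` constraints in these Conv hunks follow from the usual output-size arithmetic; a sketch for the explicit-padding ("pad") case, under the assumption that the standard formula applies here:

    import math

    def conv_out_size_sketch(size, kernel, stride=1, padding=0, dilation=1):
        # The dilated kernel spans dilation * (kernel - 1) + 1 input
        # pixels, which is why dilation must be >= 1 and bounded by
        # the input's height/width.
        effective = dilation * (kernel - 1) + 1
        return math.floor((size + 2 * padding - effective) / stride) + 1

    print(conv_out_size_sketch(32, 3, stride=2, padding=1))  # 16
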
@ -36,7 +36,7 @@ class Embedding(Cell):
the corresponding word embeddings.
Note:
When 'use_one_hot' is set to True, the type of the input should be mindspore.int32.
When 'use_one_hot' is set to True, the type of the input must be mindspore.int32.
Args:
vocab_size (int): Size of the dictionary of embeddings.
@ -49,7 +49,7 @@ class Embedding(Cell):
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(\text{batch_size}, \text{input_length})`. The elements of
the Tensor should be integer and not larger than vocab_size. Otherwise the corresponding embedding vector will
the Tensor must be integers and not larger than vocab_size. Otherwise the corresponding embedding vector will
be zero.
Outputs:
@ -120,7 +120,7 @@ class EmbeddingLookup(Cell):
specified 'offset = 0' to lookup table.
When 'target' is set to 'DEVICE', this module will use P.GatherV2() which
specified 'axis = 0' to lookup table.
In field slice mode, the manual_shapes should be given. It is a tuple ,where
In field slice mode, the manual_shapes must be given. It is a tuple, where
the i-th element vocab[i] is the number of rows for the i-th
part.
@ -128,16 +128,16 @@ class EmbeddingLookup(Cell):
vocab_size (int): Size of the dictionary of embeddings.
embedding_size (int): The size of each embedding vector.
param_init (str): The initialize way of embedding table. Default: 'normal'.
target (str): Specify the target where the op is executed. The value should in
target (str): Specifies the target where the op is executed. The value must be in
['DEVICE', 'CPU']. Default: 'CPU'.
slice_mode (str): The slicing way in semi_auto_parallel/auto_parallel. The value should get through
slice_mode (str): The slicing way in semi_auto_parallel/auto_parallel. The value must be obtained through
nn.EmbeddingLookup. Default: nn.EmbeddingLookup.BATCH_SLICE.
manual_shapes (tuple): The accompaniment array in field slice mode.
Inputs:
- **input_indices** (Tensor) - The shape of tensor is :math:`(y_1, y_2, ..., y_S)`.
Specifies the indices of elements of the original Tensor. Values can be out of range of embedding_table,
and the exceeding part will be filled with 0 in the output. Input_indices should only be a 2d tensor in
and the exceeding part will be filled with 0 in the output. Input_indices must be a 2-D tensor in
this interface.
Outputs:

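The out-of-range behaviour described for EmbeddingLookup can be mimicked in NumPy; a sketch (names are mine, and the real op also handles the offset/slice modes listed above):

    import numpy as np

    def embedding_lookup_sketch(table, indices):
        # Rows for indices outside [0, vocab_size) are filled with
        # zeros; `indices` is a 2-D integer array, per the docstring.
        out = np.zeros(indices.shape + (table.shape[1],), table.dtype)
        valid = (indices >= 0) & (indices < table.shape[0])
        out[valid] = table[indices[valid]]
        return out

    table = np.arange(8.0).reshape(4, 2)
    idx = np.array([[0, 3], [5, 1]])  # 5 is out of range -> zero row
    print(embedding_lookup_sketch(table, idx))
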
@ -193,8 +193,8 @@ class SSIM(Cell):
k2 (float): The constant used to generate c2 in the contrast comparison function. Default: 0.03.
Inputs:
- **img1** (Tensor) - The first image batch with format 'NCHW'. It should be the same shape and dtype as img2.
- **img2** (Tensor) - The second image batch with format 'NCHW'. It should be the same shape and dtype as img1.
- **img1** (Tensor) - The first image batch with format 'NCHW'. It must be the same shape and dtype as img2.
- **img2** (Tensor) - The second image batch with format 'NCHW'. It must be the same shape and dtype as img1.
Outputs:
Tensor, has the same dtype as img1. It is a 1-D tensor with shape N, where N is the batch num of img1.
@ -267,8 +267,8 @@ class MSSSIM(Cell):
k2 (float): The constant used to generate c2 in the contrast comparison function. Default: 0.03.
Inputs:
- **img1** (Tensor) - The first image batch with format 'NCHW'. It should be the same shape and dtype as img2.
- **img2** (Tensor) - The second image batch with format 'NCHW'. It should be the same shape and dtype as img1.
- **img1** (Tensor) - The first image batch with format 'NCHW'. It must be the same shape and dtype as img2.
- **img2** (Tensor) - The second image batch with format 'NCHW'. It must be the same shape and dtype as img1.
Outputs:
Tensor, has the same dtype as img1. It is a 1-D tensor with shape N, where N is the batch num of img1.
@ -352,8 +352,8 @@ class PSNR(Cell):
Default: 1.0.
Inputs:
- **img1** (Tensor) - The first image batch with format 'NCHW'. It should be the same shape and dtype as img2.
- **img2** (Tensor) - The second image batch with format 'NCHW'. It should be the same shape and dtype as img1.
- **img1** (Tensor) - The first image batch with format 'NCHW'. It must be the same shape and dtype as img2.
- **img2** (Tensor) - The second image batch with format 'NCHW'. It must be the same shape and dtype as img1.
Outputs:
Tensor, with dtype mindspore.float32. It is a 1-D tensor with shape N, where N is the batch num of img1.

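For the PSNR hunk, the documented 1-D output of shape N corresponds to a per-image reduction over the C, H, W axes; a NumPy sketch of that reading:

    import numpy as np

    def psnr_sketch(img1, img2, max_val=1.0):
        # PSNR = 10 * log10(max_val**2 / MSE), one value per batch
        # element of an 'NCHW' batch; img1 and img2 must share shape
        # and dtype, as the hunk above requires.
        mse = np.mean((img1 - img2) ** 2, axis=(1, 2, 3))
        return 10.0 * np.log10(max_val ** 2 / mse)

    a = np.random.rand(2, 3, 8, 8).astype(np.float32)
    print(psnr_sketch(a, np.clip(a + 0.01, 0.0, 1.0)))  # shape (2,)
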
@ -78,7 +78,7 @@ class LSTM(Cell):
- **input** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`).
- **hx** (tuple) - A tuple of two Tensors (h_0, c_0) both of data type mindspore.float32 or
mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
Data type of `hx` should be the same as `input`.
Data type of `hx` must be the same as `input`.
Outputs:
Tuple, a tuple containing (`output`, (`h_n`, `c_n`)).
@ -208,7 +208,7 @@ class LSTMCell(Cell):
mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
- **c** - data type mindspore.float32 or
mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
Data type of `h' and 'c' should be the same of `input`.
Data type of `h` and `c` must be the same as `input`.
Outputs:
`output`, `h_n`, `c_n`, `reserve`, `state`.

@ -36,8 +36,8 @@ class ReduceLogSumExp(Cell):
The dtype of the tensor to be reduced is number.
Args:
keep_dims (bool): If True, keep these reduced dimensions and the length is 1.
If False, don't keep these dimensions.
keep_dims (bool): If true, keep these reduced dimensions and the length is 1.
If false, don't keep these dimensions.
Default: False.
Inputs:
@ -357,16 +357,16 @@ class MatMul(Cell):
will be broadcasted and must be broadcastable.
Args:
transpose_x1 (bool): If True, `a` is transposed before multiplication. Default: False.
transpose_x2 (bool): If True, `b` is transposed before multiplication. Default: False.
transpose_x1 (bool): If true, `a` is transposed before multiplication. Default: False.
transpose_x2 (bool): If true, `b` is transposed before multiplication. Default: False.
Inputs:
- **input_x1** (Tensor) - The first tensor to be multiplied. The shape of the tensor is :math:`(*A, N, C)`,
where :math:`*A` represents the batch size of `x1` which can be multidimensional.
If `transpose_a` is True, its shape should be :math:`(*A, N, C)` after transposing.
If `transpose_a` is True, its shape must be :math:`(*A, N, C)` after transposing.
- **input_x2** (Tensor) - The second tensor to be multiplied. The shape of the tensor is :math:`(*B, C, M)`,
where :math:`*B` represents the batch size of `x2` which can be multidimensional.
If `transpose_b` is True, its shape should be :math:`(*B, C, M)` after transposing.
If `transpose_b` is True, its shape must be :math:`(*B, C, M)` after transposing.
Outputs:
Tensor, the shape of the output tensor is :math:`(*L, N, M)`. :math:`*L` is the batch size after broadcasting.

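ReduceLogSumExp, mentioned at the top of this hunk, is usually computed in the max-shifted form for numerical stability; a NumPy sketch of that computation and of `keep_dims`:

    import numpy as np

    def reduce_logsumexp_sketch(x, axis, keep_dims=False):
        # Subtracting the per-axis max keeps exp() from overflowing;
        # keep_dims retains the reduced axis with length 1.
        m = np.max(x, axis=axis, keepdims=True)
        out = m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))
        return out if keep_dims else np.squeeze(out, axis=axis)

    x = np.array([[1000.0, 1000.0]])
    print(reduce_logsumexp_sketch(x, axis=1))  # [1000.6931...], no overflow
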
@ -159,7 +159,7 @@ class AvgPool2d(_PoolNd):
Args:
kernel_size (Union[int, tuple[int]]): The size of kernel used to take the average value.
The data type of kernel_size should be int and the value represents the height and width,
The data type of kernel_size must be int and the value represents the height and width,
or a tuple of two int numbers that represent height and width respectively.
Default: 1.
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents

@ -65,14 +65,14 @@ class Conv2dBnAct(Cell):
and width of the 2D convolution window. Single int means the value is for both height and width of
the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
width of the kernel.
stride (int): Specifies stride for all spatial dimensions with the same value. The value of stride should be
stride (int): Specifies stride for all spatial dimensions with the same value. The value of stride must be
greater than or equal to 1 and lower than any one of the height and width of the input. Default: 1.
pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
padding (int): Implicit paddings on both sides of the input. Default: 0.
dilation (int): Specifying the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
there will be :math:`k - 1` pixels skipped for each sampling location. Its value should be greater than
dilation (int): Specifies the dilation rate to use for dilated convolution. If set to be :math:`k > 1`,
there will be :math:`k - 1` pixels skipped for each sampling location. Its value must be greater than
or equal to 1 and lower than any one of the height and width of the input. Default: 1.
group (int): Split filter into groups, `in_ channels` and `out_channels` should be
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
@ -85,14 +85,14 @@ class Conv2dBnAct(Cell):
Initializer and string are the same as 'weight_init'. Refer to the values of
Initializer for more details. Default: 'zeros'.
has_bn (bool): Specifies whether to use batchnorm. Default: False.
momentum (float): Momentum for moving average.Momentum value should be [0, 1].Default:0.9
momentum (float): Momentum for moving average. Momentum value must be in [0, 1]. Default: 0.9.
eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default:
1e-5.
activation (Cell): Specifies activation type. The optional values are as following:
'softmax', 'logsoftmax', 'relu', 'relu6', 'tanh', 'gelu', 'sigmoid',
'prelu', 'leakyrelu', 'hswish', 'hsigmoid'. Default: None.
alpha (float): Slope of the activation function at x < 0. Default: 0.2.
after_fake(bool): Determin whether there should be a fake quantization operation after Cond2dBnAct.
after_fake (bool): Determines whether there must be a fake quantization operation after Conv2dBnAct.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
@ -176,7 +176,7 @@ class DenseBnAct(Cell):
activation (string): Specifies activation type. The optional values are as following:
'Softmax', 'LogSoftmax', 'ReLU', 'ReLU6', 'Tanh', 'GELU', 'Sigmoid',
'PReLU', 'LeakyReLU', 'h-Swish', and 'h-Sigmoid'. Default: None.
after_fake(bool): Determin whether there should be a fake quantization operation after DenseBnAct.
after_fake (bool): Determines whether there must be a fake quantization operation after DenseBnAct.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, in\_channels)`.
@ -227,7 +227,7 @@ class BatchNormFoldCell(Cell):
Batch normalization folded.
Args:
momentum (float): Momentum value should be [0, 1]. Default: 0.9.
momentum (float): Momentum value must be in [0, 1]. Default: 0.9.
epsilon (float): A small float number to avoid dividing by 0. 1e-5 if dtype in
float32 else 1e-3. Default: 1e-5.
freeze_bn (int): Delay in steps at which computation switches from regular batch
@ -250,7 +250,7 @@ class BatchNormFoldCell(Cell):
"""
def __init__(self, momentum=0.9, epsilon=1e-5, freeze_bn=0):
"""init batch norm fold layer"""
"""Initialize batch norm fold layer"""
super(BatchNormFoldCell, self).__init__()
self.epsilon = epsilon
self.is_gpu = context.get_context('device_target') == "GPU"
@ -323,7 +323,7 @@ class FakeQuantWithMinMax(Cell):
symmetric=False,
narrow_range=False,
quant_delay=0):
"""init FakeQuantWithMinMax layer"""
"""Initialize FakeQuantWithMinMax layer"""
super(FakeQuantWithMinMax, self).__init__()
validator.check_type("min_init", min_init, [int, float])
validator.check_type("max_init", max_init, [int, float])
@ -470,7 +470,7 @@ class Conv2dBnFoldQuant(Cell):
narrow_range=False,
quant_delay=0,
freeze_bn=100000):
"""init Conv2dBnFoldQuant layer"""
"""Initialize Conv2dBnFoldQuant layer"""
super(Conv2dBnFoldQuant, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
@ -611,8 +611,8 @@ class Conv2dBnWithoutFoldQuant(Cell):
stride (int): Specifies stride for all spatial dimensions with the same value. Default: 1.
pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
padding (int): Implicit paddings on both sides of the input. Default: 0.
dilation (int): Specifying the dilation rate to use for dilated convolution. Default: 1.
group (int): Split filter into groups, `in_ channels` and `out_channels` should be
dilation (int): Specifies the dilation rate to use for dilated convolution. Default: 1.
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
eps (float): Parameters for BatchNormal. Default: 1e-5.
@ -743,8 +743,8 @@ class Conv2dQuant(Cell):
stride (int): Specifies stride for all spatial dimensions with the same value. Default: 1.
pad_mode (str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
padding (int): Implicit paddings on both sides of the input. Default: 0.
dilation (int): Specifying the dilation rate to use for dilated convolution. Default: 1.
group (int): Split filter into groups, `in_ channels` and `out_channels` should be
dilation (int): Specifies the dilation rate to use for dilated convolution. Default: 1.
group (int): Splits filter into groups, `in_channels` and `out_channels` must be
divisible by the number of groups. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.

@ -32,10 +32,10 @@ class LearningRateSchedule(Cell):
"""
Defines the computation to get the current learning rate.
This method should be overridden by all subclasses.
This method must be overridden by all subclasses.
Note:
The output should be a Tensor of scalar.
The output must be a scalar Tensor.
Inputs:
Tensor. The current step number.
@ -73,7 +73,7 @@ class ExponentialDecayLR(LearningRateSchedule):
learning_rate (float): The initial value of learning rate.
decay_rate (float): The decay rate.
decay_steps (int): A value used to calculate decayed learning rate.
is_stair (bool): If True, learning rate is decayed once every `decay_steps` time. Default: False.
is_stair (bool): If true, learning rate is decayed once every `decay_steps` times. Default: False.
Inputs:
Tensor. The current step number.
@ -127,7 +127,7 @@ class NaturalExpDecayLR(LearningRateSchedule):
learning_rate (float): The initial value of learning rate.
decay_rate (float): The decay rate.
decay_steps (int): A value used to calculate decayed learning rate.
is_stair (bool): If True, learning rate is decayed once every `decay_steps` time. Default: False.
is_stair (bool): If true, learning rate is decayed once every `decay_steps` times. Default: False.
Inputs:
Tensor. The current step number.
@ -292,8 +292,8 @@ class PolynomialDecayLR(LearningRateSchedule):
learning_rate (float): The initial value of learning rate.
end_learning_rate (float): The end value of learning rate.
decay_steps (int): A value used to calculate decayed learning rate.
power (float): A value used to calculate decayed learning rate. This parameter should be greater than 0.
update_decay_steps (bool): If True, learning rate is decayed once every `decay_steps` time. Default: False.
power (float): A value used to calculate decayed learning rate. This parameter must be greater than 0.
update_decay_steps (bool): If true, learning rate is decayed once every `decay_steps` time. Default: False.
Inputs:
Tensor. The current step number.

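The LearningRateSchedule contract in this hunk (override the computation; return a scalar Tensor) is easiest to see in a subclass; a hypothetical linear warm-up schedule, sketched against the documented interface:

    import mindspore.common.dtype as mstype
    import mindspore.ops.operations as P
    from mindspore import Tensor
    from mindspore.nn.learning_rate_schedule import LearningRateSchedule

    class LinearWarmUpLR(LearningRateSchedule):
        """Hypothetical schedule: ramps from 0 up to learning_rate."""
        def __init__(self, learning_rate, warmup_steps):
            super(LinearWarmUpLR, self).__init__()
            self.learning_rate = learning_rate
            self.warmup_steps = Tensor(warmup_steps, mstype.float32)
            self.min = P.Minimum()
            self.cast = P.Cast()

        def construct(self, global_step):
            # Input: a Tensor holding the current step number.
            # Output: a scalar Tensor, as the Note above requires.
            step = self.cast(global_step, mstype.float32)
            return self.learning_rate * self.min(step, self.warmup_steps) / self.warmup_steps
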
@ -83,9 +83,9 @@ class L1Loss(_Loss):
Default: "mean".
Inputs:
- **input_data** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`. The data type should be float16 or
- **input_data** (Tensor) - Tensor of shape :math:`(x_1, x_2, ..., x_R)`. The data type must be float16 or
float32.
- **target_data** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. The data type should be float16 or
- **target_data** (Tensor) - Tensor of shape :math:`(y_1, y_2, ..., y_S)`. The data type must be float16 or
float32.
Outputs:
@ -344,14 +344,14 @@ class CosineEmbeddingLoss(_Loss):
Args:
margin (float): Should be in [-1.0, 1.0]. Default 0.0.
reduction (str): Specifies which reduction to be applied to the output. It should be one of
reduction (str): Specifies which reduction to be applied to the output. It must be one of
"none", "mean", and "sum", meaning no reduction, reduce mean and sum on output, respectively. Default "mean".
Inputs:
- **input_x1** (Tensor) - Input tensor.
- **input_x2** (Tensor) - Its shape and data type should be the same as `input_x1`'s shape and data type.
- **input_x2** (Tensor) - Its shape and data type must be the same as `input_x1`'s shape and data type.
- **y** (Tensor) - Contains value 1 or -1. Suppose the shape of `input_x1` is
:math:`(x_1, x_2, x_3,..., x_R)`, then the shape of `target` should be :math:`(x_1, x_3, x_4, ..., x_R)`.
:math:`(x_1, x_2, x_3,..., x_R)`, then the shape of `target` must be :math:`(x_1, x_3, x_4, ..., x_R)`.
Outputs:
- **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, otherwise a scalar value

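The CosineEmbeddingLoss margin/reduction semantics match the standard formulation; a NumPy sketch under that assumption:

    import numpy as np

    def cosine_embedding_loss_sketch(x1, x2, y, margin=0.0):
        # loss_i = 1 - cos(x1_i, x2_i)              if y_i == 1
        #          max(0, cos(x1_i, x2_i) - margin) if y_i == -1
        cos = np.sum(x1 * x2, axis=-1) / (
            np.linalg.norm(x1, axis=-1) * np.linalg.norm(x2, axis=-1))
        loss = np.where(y == 1, 1.0 - cos, np.maximum(0.0, cos - margin))
        return loss.mean()  # reduction="mean"

    x1 = np.array([[1.0, 0.0], [0.0, 1.0]])
    x2 = np.array([[1.0, 0.0], [1.0, 0.0]])
    y = np.array([1, -1])
    print(cosine_embedding_loss_sketch(x1, x2, y))  # 0.0
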
@ -77,7 +77,7 @@ class EvaluationBase(Metric):
An interface that describes the behavior of clearing the internal evaluation result.
Note:
All subclasses should override this interface.
All subclasses must override this interface.
"""
raise NotImplementedError
@ -86,7 +86,7 @@ class EvaluationBase(Metric):
An interface that describes the behavior of updating the internal evaluation result.
Note:
All subclasses should override this interface.
All subclasses must override this interface.
Args:
inputs: The first item is predicted array and the second item is target array.
@ -98,6 +98,6 @@ class EvaluationBase(Metric):
An interface that describes the behavior of computing the evaluation result.
Note:
All subclasses should override this interface.
All subclasses must override this interface.
"""
raise NotImplementedError

@ -46,7 +46,7 @@ class Loss(Metric):
Args:
inputs: Inputs contain only one element, which is the loss. The dimension of
loss should be 0 or 1.
loss must be 0 or 1.
Raises:
ValueError: If the length of inputs is not 1.

@ -85,7 +85,7 @@ class Metric(metaclass=ABCMeta):
An interface that describes the behavior of clearing the internal evaluation result.
Note:
All subclasses should override this interface.
All subclasses must override this interface.
"""
raise NotImplementedError('Must define clear function to use this base class')
@ -95,7 +95,7 @@ class Metric(metaclass=ABCMeta):
An interface that describes the behavior of computing the evaluation result.
Note:
All subclasses should override this interface.
All subclasses must override this interface.
"""
raise NotImplementedError('Must define eval function to use this base class')
@ -105,7 +105,7 @@ class Metric(metaclass=ABCMeta):
An interface that describes the behavior of updating the internal evaluation result.
Note:
All subclasses should override this interface.
All subclasses must override this interface.
Args:
inputs: A variable-length input argument list.

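The three interfaces every Metric subclass must override (clear, update, eval) fit together as below; a minimal mean-absolute-error sketch, assuming the base class's `_convert_data` helper behaves as it does for MindSpore's built-in metrics:

    from mindspore.nn.metrics import Metric

    class MAESketch(Metric):
        """Minimal Metric subclass: clear/update/eval all overridden."""
        def __init__(self):
            super(MAESketch, self).__init__()
            self.clear()

        def clear(self):
            # Reset the internal evaluation result.
            self._abs_error_sum = 0.0
            self._samples_num = 0

        def update(self, *inputs):
            # inputs[0] is the predicted array, inputs[1] the target.
            y_pred = self._convert_data(inputs[0])
            y = self._convert_data(inputs[1])
            self._abs_error_sum += abs(y_pred - y).sum()
            self._samples_num += y.size

        def eval(self):
            # Compute and return the accumulated evaluation result.
            return self._abs_error_sum / self._samples_num
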
@ -34,7 +34,7 @@ class Precision(EvaluationBase):
\text{precision} = \frac{\text{true_positive}}{\text{true_positive} + \text{false_positive}}
Note:
In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` should be 0 or 1.
In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` must be 0 or 1.
Args:
eval_type (str): Metric to calculate accuracy over a dataset, for classification or

@ -34,7 +34,7 @@ class Recall(EvaluationBase):
\text{recall} = \frac{\text{true_positive}}{\text{true_positive} + \text{false_negative}}
Note:
In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` should be 0 or 1.
In the multi-label cases, the elements of :math:`y` and :math:`y_{pred}` must be 0 or 1.
Args:
eval_type (str): Metric to calculate the recall over a dataset, for classification or

@ -166,10 +166,10 @@ class Adam(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -177,16 +177,16 @@ class Adam(Optimizer):
- weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" is in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
which in the 'order_params' should be in one of group parameters.
that are in 'order_params' must be in one of the group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
Default: 1e-3.
beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
@ -201,7 +201,7 @@ class Adam(Optimizer):
use_nesterov (bool): Whether to use Nesterov Accelerated Gradient (NAG) algorithm to update the gradients.
If true, update the gradients using NAG.
If false, update the gradients without using NAG. Default: False.
weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0.
Inputs:
@ -290,10 +290,10 @@ class AdamWeightDecay(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -301,16 +301,16 @@ class AdamWeightDecay(Optimizer):
- weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" is in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
which in the 'order_params' should be in one of group parameters.
that are in 'order_params' must be in one of the group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
When the learning_rate is an Iterable or a Tensor in a 1D dimension, use the dynamic learning rate, then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
Default: 1e-3.
beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9.
@ -319,7 +319,7 @@ class AdamWeightDecay(Optimizer):
Should be in range (0.0, 1.0).
eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6.
Should be greater than 0.
weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
weight_decay (float): Weight decay (L2 penalty). It must be equal to or greater than 0. Default: 0.0.
Inputs:
- **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`.

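The group-`dict` layout that the Adam hunks describe ("params" required; "lr", "weight_decay", "order_params" optional) looks like this in practice; the weight/bias split below is illustrative only:

    import mindspore.nn as nn

    net = nn.Dense(16, 4)  # placeholder network
    decay_params = [p for p in net.trainable_params() if 'weight' in p.name]
    other_params = [p for p in net.trainable_params() if 'weight' not in p.name]

    group_params = [{'params': decay_params, 'lr': 0.01},
                    {'params': other_params, 'weight_decay': 0.0},
                    {'order_params': net.trainable_params()}]
    optim = nn.Adam(group_params, learning_rate=1e-3, weight_decay=1e-4)
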
@ -93,29 +93,29 @@ class FTRL(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Using different learning rates for different groups of parameters is currently not supported.
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
in the value of 'order_params' should be in one of group parameters.
in the value of 'order_params' must be in one of the group parameters.
initial_accum (float): The starting value for accumulators, must be zero or a positive value. Default: 0.1.
learning_rate (float): The learning rate value, should be zero or positive, dynamic learning rate is currently
learning_rate (float): The learning rate value, must be zero or positive; dynamic learning rate is currently
not supported. Default: 0.001.
lr_power (float): Learning rate power controls how the learning rate decreases during training, must be less
than or equal to zero. Use fixed learning rate if lr_power is zero. Default: -0.5.
l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0.
l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0.
use_locking (bool): If True, use locks for updating operation. Default: False.
loss_scale (float): Value for the loss scale. It should be equal to or greater than 1.0. Default: 1.0.
use_locking (bool): If true, use locks for updating operation. Default: False.
loss_scale (float): Value for the loss scale. It must be equal to or greater than 1.0. Default: 1.0.
weight_decay (float): Weight decay value to multiply the weight, must be zero or a positive value. Default: 0.0.
Inputs:

@ -199,10 +199,10 @@ class Lamb(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -210,16 +210,16 @@ class Lamb(Optimizer):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
in the value of 'order_params' should be in one of group parameters.
in the value of 'order_params' must be in one of the group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9.
Should be in range (0.0, 1.0).

@ -112,10 +112,10 @@ class LazyAdam(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr" and "weight_decay" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -123,16 +123,16 @@ class LazyAdam(Optimizer):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
in the value of 'order_params' should be in one of group parameters.
in the value of 'order_params' must be in one of the group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
Default: 1e-3.
beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).

@ -68,10 +68,10 @@ class Momentum(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -79,21 +79,21 @@ class Momentum(Optimizer):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
in the value of 'order_params' should be in one of group parameters.
in the value of 'order_params' must be in one of the group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
When the learning_rate is an Iterable or a Tensor in a 1D dimension, use dynamic learning rate, then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
momentum (float): Hyperparameter of type float, meaning the momentum for the moving average.
It should be at least 0.0.
weight_decay (int, float): Weight decay (L2 penalty). It should be equal to or greater than 0.0. Default: 0.0.
loss_scale (int, float): A floating point value for the loss scale. It should be greater than 0.0. Default: 1.0.
It must be at least 0.0.
weight_decay (int, float): Weight decay (L2 penalty). It must be equal to or greater than 0.0. Default: 0.0.
loss_scale (int, float): A floating point value for the loss scale. It must be greater than 0.0. Default: 1.0.
use_nesterov (bool): Enable Nesterov momentum. Default: False.
Inputs:

@ -58,13 +58,13 @@ class Optimizer(Cell):
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
parameters (Union[list[Parameter], list[dict]]): When the `parameters` is a list of `Parameter` which will be
updated, the element in `parameters` should be class `Parameter`. When the `parameters` is a list of `dict`,
updated, the element in `parameters` must be class `Parameter`. When the `parameters` is a list of `dict`,
the "params", "lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -72,13 +72,13 @@ class Optimizer(Cell):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
in the value of 'order_params' should be in one of group parameters.
in the value of 'order_params' must be in one of the group parameters.
weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
weight_decay (float): A floating point value for the weight decay. It must be equal to or greater than 0.
If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the
loss_scale (float): A floating point value for the loss scale. It must be greater than 0. If the
type of `loss_scale` input is int, it will be converted to float. Default: 1.0.
Raises:
@ -315,7 +315,7 @@ class Optimizer(Cell):
raise ValueError("The Tensor type dynamic learning rate in group should be the same size.")
def _init_group_params(self, parameters, learning_rate, weight_decay):
"""Init learning rate or weight decay in group params."""
"""Initialize learning rate or weight decay in group params."""
self._parse_group_params(parameters, learning_rate)
default_lr = self._build_single_lr(learning_rate, 'learning_rate')

@ -71,10 +71,10 @@ class ProximalAdagrad(Optimizer):
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
the element in `params` should be class `Parameter`. When the `params` is a list of `dict`, the "params",
the element in `params` must be class `Parameter`. When the `params` is a list of `dict`, the "params",
"lr", "weight_decay" and "order_params" are the keys can be parsed.
- params: Required. The value should be a list of `Parameter`.
- params: Required. The value must be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
@ -82,9 +82,9 @@ class ProximalAdagrad(Optimizer):
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
- order_params: Optional. If "order_params" in the keys, the value must be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
in the value of 'order_params' should be in one of group parameters.
in the value of 'order_params' must be in one of the group parameters.
accum (float): The starting value for accumulators, must be zero or a positive value. Default: 0.1.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or a graph for the learning rate.
@ -92,13 +92,13 @@ class ProximalAdagrad(Optimizer):
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor in a zero
dimension, use fixed learning rate. Other cases are not supported. The float learning rate should be
dimension, use fixed learning rate. Other cases are not supported. The float learning rate must be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
Default: 0.001.
l1 (float): l1 regularization strength, must be greater than or equal to zero. Default: 0.0.
l2 (float): l2 regularization strength, must be greater than or equal to zero. Default: 0.0.
use_locking (bool): If True, use locks for updating operation. Default: False.
loss_scale (float): Value for the loss scale. It should be greater than 0.0. Default: 1.0.
use_locking (bool): If true, use locks for updating operation. Default: False.
loss_scale (float): Value for the loss scale. It must be greater than 0.0. Default: 1.0.
weight_decay (float): Weight decay value to multiply the weight, must be zero or a positive value. Default: 0.0.
Inputs:

Some files were not shown because too many files have changed in this diff.