!11489 add validation of ApplyMomentum and update annotations of Broadcast, DistributedGradReducer, GlobalBatchNorm, and other operators.

From: @wangshuide2020
Reviewed-by: @liangchenghui,@ljl0711
Signed-off-by: @liangchenghui
pull/11489/MERGE
mindspore-ci-bot committed 4 years ago via Gitee
commit 27dc6e19a3

@@ -861,7 +861,7 @@ class MatrixDiag(Cell):
r"""
Returns a batched diagonal tensor with a given batched diagonal values.
- Assume :math:`x` has :math:`k` dimensions :math:`[I, J, K, ..., N]`, then the output is a tensor of rank
+ Assume `x` has :math:`k` dimensions :math:`[I, J, K, ..., N]`, then the output is a tensor of rank
:math:`k+1` with dimensions :math:`[I, J, K, ..., N, N]` where:
.. code-block::
@@ -905,7 +905,7 @@ class MatrixDiagPart(Cell):
Returns the batched diagonal part of a batched tensor.
Assume `x` has :math:`k` dimensions :math:`[I, J, K, ..., M, N]`, then the output is a tensor of rank
- :math:`k-1` with dimensions :math:`[I, J, K, ..., min(M, N]` where:
+ :math:`k-1` with dimensions :math:`[I, J, K, ..., min(M, N)]` where:
.. code-block::
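The shape contracts being corrected in these two docstrings can be sanity-checked directly; a minimal sketch, assuming the nn.MatrixDiag and nn.MatrixDiagPart cells documented above (MindSpore 1.x API):

import numpy as np
from mindspore import Tensor, nn
from mindspore.common import dtype as mstype

# MatrixDiag: rank-k input [I, J, ..., N] -> rank-(k+1) output [I, J, ..., N, N]
x = Tensor(np.array([[1.0, -1.0], [2.0, 3.0]]), mstype.float32)  # shape (2, 2)
print(nn.MatrixDiag()(x).shape)      # (2, 2, 2)

# MatrixDiagPart: rank-k input [I, ..., M, N] -> rank-(k-1) output [I, ..., min(M, N)]
y = Tensor(np.ones((4, 3, 5)), mstype.float32)                   # M = 3, N = 5
print(nn.MatrixDiagPart()(y).shape)  # (4, 3), since min(3, 5) = 3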

@@ -417,7 +417,7 @@ class BatchNorm3d(Cell):
Note:
The implementation of BatchNorm is different in graph mode and pynative mode, therefore that mode can not be
- changed after net was initilized.
+ changed after net was initialized.
Note that the formula for updating the running_mean and running_var is
:math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times x_t + \text{momentum} \times \hat{x}`,
where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value.
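As a worked instance of that update rule, with momentum = 0.9 (a common default):

momentum = 0.9
hat_x, x_t = 1.0, 2.0  # running statistic, new batch statistic
hat_x_new = (1 - momentum) * x_t + momentum * hat_x
print(hat_x_new)       # 1.1 -- the running value drifts slowly toward x_t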
@@ -548,8 +548,8 @@ class GlobalBatchNorm(_BatchNorm):
``Ascend``
Examples:
- >>> # This example should be run with multiple processes. Refer to the run_distribute_train.sh
- >>> import os
+ >>> # This example should be run with multiple processes.
+ >>> # Please refer to the tutorial > Distributed Training on mindspore.cn.
>>> import numpy as np
>>> from mindspore.communication import init
>>> from mindspore import context
@@ -557,9 +557,7 @@ class GlobalBatchNorm(_BatchNorm):
>>> from mindspore import nn, Tensor
>>> from mindspore.common import dtype as mstype
>>>
- >>> device_id = int(os.environ["DEVICE_ID"])
- >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
- ... device_id=int(device_id))
+ >>> context.set_context(mode=context.GRAPH_MODE)
>>> init()
>>> context.reset_auto_parallel_context()
>>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)
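The rest of the example is elided in this hunk; it continues roughly as below, a sketch assuming a two-device group launched per that tutorial, with `device_num_each_group` matching the group size:

gbn = nn.GlobalBatchNorm(num_features=3, device_num_each_group=2)
x = Tensor(np.ones([1, 3, 2, 2]), mstype.float32)
output = gbn(x)
print(output.shape)  # (1, 3, 2, 2); batch statistics are synced across the group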

@@ -235,12 +235,12 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
where X is the input tensor, and :math:`c` is the `ema_decay`.
- The scale s and zero point zp is computed as:
+ The scale and zero point zp is computed as:
.. math::
\begin{array}{ll} \\
- s =
+ scale =
\begin{cases}
\frac{x_{max} - x_{min}}{Q_{max} - Q_{min}}
& \text{ if } symmetric = \text{False} \\
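A worked instance of the asymmetric branch shown above, for an 8-bit range (the zero-point line of the formula is truncated in this hunk, so the zp expression below is the standard asymmetric form, stated as an assumption):

x_min, x_max = -1.0, 3.0
q_min, q_max = 0, 255  # 8-bit unsigned quantization range
scale = (x_max - x_min) / (q_max - q_min)
zp = round(q_min - x_min / scale)  # assumed standard zero-point formula
print(scale, zp)  # ~0.0157, 64: real 0.0 lands on integer 64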
@@ -1375,7 +1375,7 @@ class QuantMindirBlock(Cell):
Args:
core_op (Cell): The operation cell.
- weight (Tensor): The weigth of the cell.
+ weight (Tensor): The weight of the cell.
bias (Tensor): The bias of the cell. Default: None.
activation (str): The regularization function applied to the output of the layer, eg. 'relu'. Default: None.
param_dict (dict): The information of the cell.

@@ -105,6 +105,7 @@ class Momentum(Optimizer):
Raises:
ValueError: If the momentum is less than 0.0.
+ TypeError: If the momentum is not a float or use_nesterov is not a bool.
Supported Platforms:
``Ascend`` ``GPU`` ``CPU``
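A quick sketch of the two documented failure modes (exact error messages vary by version):

from mindspore import nn

net = nn.Dense(2, 2)
try:
    nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=-0.1)
except ValueError as e:
    print("negative momentum rejected:", e)
try:
    nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9,
                use_nesterov=1)  # int, not bool
except TypeError as e:
    print("non-bool use_nesterov rejected:", e)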

@@ -244,11 +244,11 @@ class DistributedGradReducer(Cell):
ValueError: If degree is not a int or less than 0.
Supported Platforms:
- ``Ascend``
+ ``Ascend``, ``GPU``
Examples:
- >>> # This example should be run with multiple processes. Refer to the run_distribute_train.sh
- >>> import os
+ >>> # This example should be run with multiple processes.
+ >>> # Please refer to the tutorial > Distributed Training on mindspore.cn.
>>> import numpy as np
>>> from mindspore.communication import init
>>> from mindspore.ops import composite as C
@@ -261,9 +261,7 @@ class DistributedGradReducer(Cell):
>>> from mindspore.nn.wrap.cell_wrapper import WithLossCell
>>> from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean)
>>>
- >>> device_id = int(os.environ["DEVICE_ID"])
- >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
- ... device_id=int(device_id))
+ >>> context.set_context(mode=context.GRAPH_MODE)
>>> init()
>>> context.reset_auto_parallel_context()
>>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)
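The example goes on to wire the reducer into a training step; condensed, the pattern is roughly the following. This is a sketch under the same multi-process assumption: `network` and `optimizer` are placeholders, and `import mindspore.nn as nn`, `from mindspore.ops import functional as F`, and the DistributedGradReducer import are assumed among the elided lines.

class TrainStep(nn.Cell):
    def __init__(self, network, optimizer):
        super(TrainStep, self).__init__()
        self.network = network
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True)
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

    def construct(self, x, label):
        loss = self.network(x, label)
        grads = self.grad(self.network, self.weights)(x, label)
        grads = self.grad_reducer(grads)  # all-reduce across the data-parallel group
        return F.depend(loss, self.optimizer(grads))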

@@ -23,7 +23,6 @@ inplace_update_op_info = TBERegOp("InplaceUpdate") \
.compute_cost(10) \
.kernel_name("inplace_update_d") \
.partial_flag(True) \
- .need_check_supported(True) \
.attr("indices", "required", "listInt", "all") \
.input(0, "x", False, "required", "all") \
.input(1, "v", False, "required", "all") \

@@ -149,7 +149,7 @@ class AllGather(PrimitiveWithInfer):
``Ascend`` ``GPU``
Examples:
- >>> # This example should be run with two devices. Refer to the tutorial > Distirbuted Training on mindspore.cn.
+ >>> # This example should be run with two devices. Refer to the tutorial > Distributed Training on mindspore.cn.
>>> import numpy as np
>>> import mindspore.ops.operations as ops
>>> import mindspore.nn as nn
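The remainder of the example (truncated here) defines a small Net; condensed, with the shape contract spelled out (two-device job assumed):

class AllGatherNet(nn.Cell):
    def __init__(self):
        super(AllGatherNet, self).__init__()
        self.allgather = ops.AllGather()  # default group: all devices

    def construct(self, x):
        return self.allgather(x)

# With two devices each feeding x of shape (2, 8), every device receives
# shape (4, 8): the local tensors concatenated along axis 0.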
@@ -272,7 +272,7 @@ class ReduceScatter(PrimitiveWithInfer):
``Ascend`` ``GPU``
Examples:
- >>> # This example should be run with two devices. Refer to the tutorial > Distirbuted Training on mindspore.cn.
+ >>> # This example should be run with two devices. Refer to the tutorial > Distributed Training on mindspore.cn.
>>> from mindspore import Tensor, context
>>> from mindspore.communication import init
>>> from mindspore.ops.operations.comm_ops import ReduceOp
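ReduceScatter is the dual of AllGather: inputs are first reduced across the group (here summed), then split evenly along axis 0, one slice per device. A sketch under the same two-device assumption, with ReduceScatter and nn assumed among the elided imports:

class ReduceScatterNet(nn.Cell):
    def __init__(self):
        super(ReduceScatterNet, self).__init__()
        self.reduce_scatter = ReduceScatter(ReduceOp.SUM)

    def construct(self, x):
        return self.reduce_scatter(x)

# With two devices each feeding x of shape (4, 8), every device receives
# the element-wise sum split along axis 0: output shape (2, 8).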
@@ -396,15 +396,19 @@ class Broadcast(PrimitiveWithInfer):
TypeError: If root_rank is not a integer or group is not a string.
Supported Platforms:
- ``Ascend``
+ ``Ascend``, ``GPU``
Examples:
+ >>> # This example should be run with multiple processes.
+ >>> # Please refer to the tutorial > Distributed Training on mindspore.cn.
>>> from mindspore import Tensor
>>> from mindspore import context
>>> from mindspore.communication import init
>>> import mindspore.nn as nn
>>> import mindspore.ops.operations as ops
>>> import numpy as np
>>>
+ >>> context.set_context(mode=context.GRAPH_MODE)
+ >>> init()
>>> class Net(nn.Cell):
... def __init__(self):
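The Net definition is truncated above; completed as a sketch (two-process job assumed; every process receives root rank 0's tensors):

class BroadcastNet(nn.Cell):
    def __init__(self):
        super(BroadcastNet, self).__init__()
        self.broadcast = ops.Broadcast(0)  # root_rank 0 is the source

    def construct(self, x):
        # Broadcast consumes and returns a tuple of tensors
        return self.broadcast((x,))

net = BroadcastNet()
output = net(Tensor(np.ones([2, 4]).astype(np.float32)))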

@@ -2169,6 +2169,8 @@ class ApplyMomentum(PrimitiveWithInfer):
Refer to the paper `On the importance of initialization and momentum in deep
learning <https://dl.acm.org/doi/10.5555/3042817.3043064>`_ for more details.
+ Refer to :class:`mindspore.nn.Momentum` for more details about the formula and usage.
Inputs of `variable`, `accumulation` and `gradient` comply with the implicit type conversion rules
to make the data types consistent.
If they have different data types, lower priority data type will be converted to
@@ -2194,11 +2196,14 @@ class ApplyMomentum(PrimitiveWithInfer):
Outputs:
Tensor, parameters to be updated.
+ Raises:
+ TypeError: If the use_locking or use_nesterov is not a bool or gradient_scale is not a float.
Supported Platforms:
``Ascend`` ``GPU`` ``CPU``
Examples:
- Please refer to the usage in nn.ApplyMomentum.
+ Please refer to the usage in :class:`mindspore.nn.Momentum`.
"""
__mindspore_signature__ = (
sig.make_sig('variable', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
@@ -2210,6 +2215,9 @@ class ApplyMomentum(PrimitiveWithInfer):
@prim_attr_register
def __init__(self, use_nesterov=False, use_locking=False, gradient_scale=1.0):
+ self.use_nesterov = validator.check_bool(use_nesterov)
+ self.use_locking = validator.check_bool(use_locking)
+ validator.check_value_type('gradient_scale', gradient_scale, [float], self.name)
self.init_prim_io_names(inputs=['variable', 'accumulation', 'learning_rate', 'gradient', 'momentum'],
outputs=['output'])
self.is_tbe = context.get_context("device_target") == "Ascend"
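These new checks reject malformed attributes at construction time; a quick sketch of the documented behaviour:

import mindspore.ops.operations as ops

ok = ops.ApplyMomentum(use_nesterov=False, use_locking=False, gradient_scale=1.0)
try:
    ops.ApplyMomentum(gradient_scale=1)  # int where a float is required
except TypeError as e:
    print(e)
try:
    ops.ApplyMomentum(use_locking="no")  # str where a bool is required
except TypeError as e:
    print(e)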
