!11489 add validation of ApplyMomentum and update annotations of Broadcast, DistributedGradReducer, GlobalBatchNorm, and other operators.

From: @wangshuide2020
Reviewed-by: @liangchenghui,@ljl0711
Signed-off-by: @liangchenghui
pull/11489/MERGE
mindspore-ci-bot committed 4 years ago via Gitee
commit 27dc6e19a3

@@ -861,7 +861,7 @@ class MatrixDiag(Cell):
r"""
Returns a batched diagonal tensor with a given batched diagonal values.
- Assume :math:`x` has :math:`k` dimensions :math:`[I, J, K, ..., N]`, then the output is a tensor of rank
+ Assume `x` has :math:`k` dimensions :math:`[I, J, K, ..., N]`, then the output is a tensor of rank
:math:`k+1` with dimensions :math:`[I, J, K, ..., N, N]` where:
.. code-block::
@@ -905,7 +905,7 @@ class MatrixDiagPart(Cell):
Returns the batched diagonal part of a batched tensor.
Assume `x` has :math:`k` dimensions :math:`[I, J, K, ..., M, N]`, then the output is a tensor of rank
- :math:`k-1` with dimensions :math:`[I, J, K, ..., min(M, N]` where:
+ :math:`k-1` with dimensions :math:`[I, J, K, ..., min(M, N)]` where:
.. code-block::
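The shape contracts being corrected in these two docstrings can be sanity-checked directly; a minimal sketch, assuming the nn.MatrixDiag and nn.MatrixDiagPart cells documented above (MindSpore 1.x API):

import numpy as np
from mindspore import Tensor, nn
from mindspore.common import dtype as mstype

# MatrixDiag: rank-k input [I, J, ..., N] -> rank-(k+1) output [I, J, ..., N, N]
x = Tensor(np.array([[1.0, -1.0], [2.0, 3.0]]), mstype.float32)  # shape (2, 2)
print(nn.MatrixDiag()(x).shape)      # (2, 2, 2)

# MatrixDiagPart: rank-k input [I, ..., M, N] -> rank-(k-1) output [I, ..., min(M, N)]
y = Tensor(np.ones((4, 3, 5)), mstype.float32)                   # M = 3, N = 5
print(nn.MatrixDiagPart()(y).shape)  # (4, 3), since min(3, 5) = 3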

@@ -417,7 +417,7 @@ class BatchNorm3d(Cell):
Note:
The implementation of BatchNorm is different in graph mode and pynative mode, therefore that mode can not be
- changed after net was initilized.
+ changed after net was initialized.
Note that the formula for updating the running_mean and running_var is
:math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times x_t + \text{momentum} \times \hat{x}`,
where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value.
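As a worked instance of that update rule, with momentum = 0.9 (a common default):

momentum = 0.9
hat_x, x_t = 1.0, 2.0  # running statistic, new batch statistic
hat_x_new = (1 - momentum) * x_t + momentum * hat_x
print(hat_x_new)       # 1.1 -- the running value drifts slowly toward x_t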
@@ -548,8 +548,8 @@ class GlobalBatchNorm(_BatchNorm):
``Ascend``
Examples:
- >>> # This example should be run with multiple processes. Refer to the run_distribute_train.sh
- >>> import os
+ >>> # This example should be run with multiple processes.
+ >>> # Please refer to the tutorial > Distributed Training on mindspore.cn.
>>> import numpy as np
>>> from mindspore.communication import init
>>> from mindspore import context
@@ -557,9 +557,7 @@ class GlobalBatchNorm(_BatchNorm):
>>> from mindspore import nn, Tensor
>>> from mindspore.common import dtype as mstype
>>>
- >>> device_id = int(os.environ["DEVICE_ID"])
- >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
- ... device_id=int(device_id))
+ >>> context.set_context(mode=context.GRAPH_MODE)
>>> init()
>>> context.reset_auto_parallel_context()
>>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)
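The rest of the example is elided in this hunk; it continues roughly as below, a sketch assuming a two-device group launched per that tutorial, with `device_num_each_group` matching the group size:

gbn = nn.GlobalBatchNorm(num_features=3, device_num_each_group=2)
x = Tensor(np.ones([1, 3, 2, 2]), mstype.float32)
output = gbn(x)
print(output.shape)  # (1, 3, 2, 2); batch statistics are synced across the group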

@@ -235,12 +235,12 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
where X is the input tensor, and :math:`c` is the `ema_decay`.
- The scale s and zero point zp is computed as:
+ The scale and zero point zp is computed as:
.. math::
\begin{array}{ll} \\
- s =
+ scale =
\begin{cases}
\frac{x_{max} - x_{min}}{Q_{max} - Q_{min}}
& \text{ if } symmetric = \text{False} \\
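A worked instance of the asymmetric branch shown above, for an 8-bit range (the zero-point line of the formula is truncated in this hunk, so the zp expression below is the standard asymmetric form, stated as an assumption):

x_min, x_max = -1.0, 3.0
q_min, q_max = 0, 255  # 8-bit unsigned quantization range
scale = (x_max - x_min) / (q_max - q_min)
zp = round(q_min - x_min / scale)  # assumed standard zero-point formula
print(scale, zp)  # ~0.0157, 64: real 0.0 lands on integer 64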
@@ -1375,7 +1375,7 @@ class QuantMindirBlock(Cell):
Args:
core_op (Cell): The operation cell.
- weight (Tensor): The weigth of the cell.
+ weight (Tensor): The weight of the cell.
bias (Tensor): The bias of the cell. Default: None.
activation (str): The regularization function applied to the output of the layer, eg. 'relu'. Default: None.
param_dict (dict): The information of the cell.

@@ -105,6 +105,7 @@ class Momentum(Optimizer):
Raises:
ValueError: If the momentum is less than 0.0.
+ TypeError: If the momentum is not a float or use_nesterov is not a bool.
Supported Platforms:
``Ascend`` ``GPU`` ``CPU``
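A quick sketch of the two documented failure modes (exact error messages vary by version):

from mindspore import nn

net = nn.Dense(2, 2)
try:
    nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=-0.1)
except ValueError as e:
    print("negative momentum rejected:", e)
try:
    nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9,
                use_nesterov=1)  # int, not bool
except TypeError as e:
    print("non-bool use_nesterov rejected:", e)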

@@ -244,11 +244,11 @@ class DistributedGradReducer(Cell):
ValueError: If degree is not a int or less than 0.
Supported Platforms:
- ``Ascend``
+ ``Ascend``, ``GPU``
Examples:
- >>> # This example should be run with multiple processes. Refer to the run_distribute_train.sh
- >>> import os
+ >>> # This example should be run with multiple processes.
+ >>> # Please refer to the tutorial > Distributed Training on mindspore.cn.
>>> import numpy as np
>>> from mindspore.communication import init
>>> from mindspore.ops import composite as C
@@ -261,9 +261,7 @@ class DistributedGradReducer(Cell):
>>> from mindspore.nn.wrap.cell_wrapper import WithLossCell
>>> from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean)
>>>
- >>> device_id = int(os.environ["DEVICE_ID"])
- >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
- ... device_id=int(device_id))
+ >>> context.set_context(mode=context.GRAPH_MODE)
>>> init()
>>> context.reset_auto_parallel_context()
>>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)
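The example goes on to wire the reducer into a training step; condensed, the pattern is roughly the following. This is a sketch under the same multi-process assumption: `network` and `optimizer` are placeholders, and `import mindspore.nn as nn`, `from mindspore.ops import functional as F`, and the DistributedGradReducer import are assumed among the elided lines.

class TrainStep(nn.Cell):
    def __init__(self, network, optimizer):
        super(TrainStep, self).__init__()
        self.network = network
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True)
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

    def construct(self, x, label):
        loss = self.network(x, label)
        grads = self.grad(self.network, self.weights)(x, label)
        grads = self.grad_reducer(grads)  # all-reduce across the data-parallel group
        return F.depend(loss, self.optimizer(grads))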

@@ -23,7 +23,6 @@ inplace_update_op_info = TBERegOp("InplaceUpdate") \
.compute_cost(10) \
.kernel_name("inplace_update_d") \
.partial_flag(True) \
- .need_check_supported(True) \
.attr("indices", "required", "listInt", "all") \
.input(0, "x", False, "required", "all") \
.input(1, "v", False, "required", "all") \

@@ -149,7 +149,7 @@ class AllGather(PrimitiveWithInfer):
``Ascend`` ``GPU``
Examples:
- >>> # This example should be run with two devices. Refer to the tutorial > Distirbuted Training on mindspore.cn.
+ >>> # This example should be run with two devices. Refer to the tutorial > Distributed Training on mindspore.cn.
>>> import numpy as np
>>> import mindspore.ops.operations as ops
>>> import mindspore.nn as nn
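The remainder of the example (truncated here) defines a small Net; condensed, with the shape contract spelled out (two-device job assumed):

class AllGatherNet(nn.Cell):
    def __init__(self):
        super(AllGatherNet, self).__init__()
        self.allgather = ops.AllGather()  # default group: all devices

    def construct(self, x):
        return self.allgather(x)

# With two devices each feeding x of shape (2, 8), every device receives
# shape (4, 8): the local tensors concatenated along axis 0.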
@@ -272,7 +272,7 @@ class ReduceScatter(PrimitiveWithInfer):
``Ascend`` ``GPU``
Examples:
- >>> # This example should be run with two devices. Refer to the tutorial > Distirbuted Training on mindspore.cn.
+ >>> # This example should be run with two devices. Refer to the tutorial > Distributed Training on mindspore.cn.
>>> from mindspore import Tensor, context
>>> from mindspore.communication import init
>>> from mindspore.ops.operations.comm_ops import ReduceOp
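ReduceScatter is the dual of AllGather: inputs are first reduced across the group (here summed), then split evenly along axis 0, one slice per device. A sketch under the same two-device assumption, with ReduceScatter and nn assumed among the elided imports:

class ReduceScatterNet(nn.Cell):
    def __init__(self):
        super(ReduceScatterNet, self).__init__()
        self.reduce_scatter = ReduceScatter(ReduceOp.SUM)

    def construct(self, x):
        return self.reduce_scatter(x)

# With two devices each feeding x of shape (4, 8), every device receives
# the element-wise sum split along axis 0: output shape (2, 8).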
@@ -396,15 +396,19 @@ class Broadcast(PrimitiveWithInfer):
TypeError: If root_rank is not a integer or group is not a string.
Supported Platforms:
- ``Ascend``
+ ``Ascend``, ``GPU``
Examples:
+ >>> # This example should be run with multiple processes.
+ >>> # Please refer to the tutorial > Distributed Training on mindspore.cn.
>>> from mindspore import Tensor
>>> from mindspore import context
>>> from mindspore.communication import init
>>> import mindspore.nn as nn
>>> import mindspore.ops.operations as ops
>>> import numpy as np
>>>
+ >>> context.set_context(mode=context.GRAPH_MODE)
+ >>> init()
>>> class Net(nn.Cell):
... def __init__(self):
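The Net definition is truncated above; completed as a sketch (two-process job assumed; every process receives root rank 0's tensors):

class BroadcastNet(nn.Cell):
    def __init__(self):
        super(BroadcastNet, self).__init__()
        self.broadcast = ops.Broadcast(0)  # root_rank 0 is the source

    def construct(self, x):
        # Broadcast consumes and returns a tuple of tensors
        return self.broadcast((x,))

net = BroadcastNet()
output = net(Tensor(np.ones([2, 4]).astype(np.float32)))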

@@ -2169,6 +2169,8 @@ class ApplyMomentum(PrimitiveWithInfer):
Refer to the paper `On the importance of initialization and momentum in deep
learning <https://dl.acm.org/doi/10.5555/3042817.3043064>`_ for more details.
+ Refer to :class:`mindspore.nn.Momentum` for more details about the formula and usage.
Inputs of `variable`, `accumulation` and `gradient` comply with the implicit type conversion rules
to make the data types consistent.
If they have different data types, lower priority data type will be converted to
@@ -2194,11 +2196,14 @@ class ApplyMomentum(PrimitiveWithInfer):
Outputs:
Tensor, parameters to be updated.
+ Raises:
+ TypeError: If the use_locking or use_nesterov is not a bool or gradient_scale is not a float.
Supported Platforms:
``Ascend`` ``GPU`` ``CPU``
Examples:
- Please refer to the usage in nn.ApplyMomentum.
+ Please refer to the usage in :class:`mindspore.nn.Momentum`.
"""
__mindspore_signature__ = (
sig.make_sig('variable', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
@@ -2210,6 +2215,9 @@ class ApplyMomentum(PrimitiveWithInfer):
@prim_attr_register
def __init__(self, use_nesterov=False, use_locking=False, gradient_scale=1.0):
+ self.use_nesterov = validator.check_bool(use_nesterov)
+ self.use_locking = validator.check_bool(use_locking)
+ validator.check_value_type('gradient_scale', gradient_scale, [float], self.name)
self.init_prim_io_names(inputs=['variable', 'accumulation', 'learning_rate', 'gradient', 'momentum'],
outputs=['output'])
self.is_tbe = context.get_context("device_target") == "Ascend"
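These new checks reject malformed attributes at construction time; a quick sketch of the documented behaviour:

import mindspore.ops.operations as ops

ok = ops.ApplyMomentum(use_nesterov=False, use_locking=False, gradient_scale=1.0)
try:
    ops.ApplyMomentum(gradient_scale=1)  # int where a float is required
except TypeError as e:
    print(e)
try:
    ops.ApplyMomentum(use_locking="no")  # str where a bool is required
except TypeError as e:
    print(e)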
