update annotation of Norm, Range, Ftrl, etc. operators.

4 years ago · 88b5e96d0c
parent 01ea8266f5
commit 88b5e96d0c
5 changed files with 181 additions and 24 deletions
--- a/mindspore/nn/layer/basic.py
+++ b/mindspore/nn/layer/basic.py
@ -427,9 +427,13 @@ class ClipByNorm(Cell):


 class Norm(Cell):
-    """
+    r"""
    Computes the norm of vectors, currently including Euclidean norm, i.e., :math:`L_2`-norm.

+    .. math::
+
+        norm(x) = \sqrt{\sum_{i=1}^{n} (x_i^2)}
+
    Args:
        axis (Union[tuple, int]): The axis over which to compute vector norms. Default: ().
        keep_dims (bool): If true, the axis indicated in `axis` are kept with size 1. Otherwise,
@ -484,6 +488,22 @@ class OneHot(Cell):
        If the input indices is rank :math:`N`, the output will have rank :math:`N+1`. The new
        axis is created at dimension `axis`.

+    If :math:`indices` is a scalar, the output shape will be a vector of length :math:`depth`.
+
+    If :math:`indices` is a vector of length :math:`features`, the output shape will be:
+
+    :math:`features * depth` if :math:`axis == -1`
+
+    :math:`depth * features` if :math:`axis == 0`
+
+    If :math:`indices` is a matrix with shape :math:`[batch, features]`, the output shape will be:
+
+    :math:`batch * features * depth` if :math:`axis == -1`
+
+    :math:`batch * depth * features` if :math:`axis == 1`
+
+    :math:`depth * batch * features` if :math:`axis == 0`
+
    Args:
        axis (int): Features x depth if axis is -1, depth x features
                    if axis is 0. Default: -1.
@ -540,7 +560,11 @@ class Pad(Cell):
        paddings (tuple): The shape of parameter `paddings` is (N, 2). N is the rank of input data. All elements of
            paddings are int type. For `D` th dimension of input, paddings[D, 0] indicates how many sizes to be
            extended ahead of the `D` th dimension of the input tensor, and paddings[D, 1] indicates how many sizes to
-            be extended behind of the `D` th dimension of the input tensor.
+            be extended behind of the `D` th dimension of the input tensor. The padded size of each dimension D of the
+            output is:
+
+            :math:`paddings[D, 0] + input_x.dim_size(D) + paddings[D, 1]`.
+
        mode (str): Specifies padding mode. The optional values are "CONSTANT", "REFLECT", "SYMMETRIC".
            Default: "CONSTANT".

@ -697,8 +721,11 @@ class Unfold(Cell):
          data type is number.

    Outputs:
-        Tensor, a 4-D tensor whose data type is same as 'input_x',
-        and the shape is [out_batch, out_depth, out_row, out_col], the out_batch is the same as the in_batch.
+        Tensor, a 4-D tensor whose data type is same as `input_x`,
+        and the shape is [out_batch, out_depth, out_row, out_col] where `out_batch` is the same as the `in_batch`.
+        :math:`out_depth = ksize_row * ksize_col * in_depth`,
+        :math:`out_row = (in_row - (ksize_row + (ksize_row - 1) * (rate_row - 1))) // stride_row + 1`,
+        :math:`out_col = (in_col - (ksize_col + (ksize_col - 1) * (rate_col - 1))) // stride_col + 1`.

    Supported Platforms:
        ``Ascend``
@ -843,6 +870,11 @@ class MatrixDiag(Cell):
    """
    Returns a batched diagonal tensor with a given batched diagonal values.

+    Assume :math:`x` has :math:`k` dimensions :math:`[I, J, K, ..., N]`, then the output is a tensor of rank
+    :math:`k+1` with dimensions :math:`[I, J, K, ..., N, N]` where:
+
+    :math:`output[i, j, k, ..., m, n] = 1{m=n} * x[i, j, k, ..., n]`.
+
    Inputs:
        - **x** (Tensor) - The diagonal values. It can be one of the following data types:
          float32, float16, int32, int8, and uint8.
@ -879,6 +911,11 @@ class MatrixDiagPart(Cell):
    r"""
    Returns the batched diagonal part of a batched tensor.

+    Assume :math:`x` has :math:`k` dimensions :math:`[I, J, K, ..., M, N]`, then the output is a tensor of rank
+    :math:`k-1` with dimensions :math:`[I, J, K, ..., min(M, N)]` where:
+
+    :math:`output[i, j, k, ..., n] = x[i, j, k, ..., n, n]`.
+
    Inputs:
        - **x** (Tensor) - The batched tensor. It can be one of the following data types:
          float32, float16, int32, int8, and uint8.
@ -916,6 +953,14 @@ class MatrixSetDiag(Cell):
    r"""
    Modifies the batched diagonal part of a batched tensor.

+    Assume :math:`x` has :math:`k+1` dimensions :math:`[I, J, K, ..., M, N]` and :math:`diagonal` has :math:`k`
+    dimensions :math:`[I, J, K, ..., min(M, N)]`. Then the output is a tensor of rank :math:`k+1` with dimensions
+    :math:`[I, J, K, ..., M, N]` where:
+
+        :math:`output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for :math:`m == n`.
+
+        :math:`output[i, j, k, ..., m, n] = x[i, j, k, ..., m, n]` for :math:`m != n`.
+
    Inputs:
        - **x** (Tensor) - The batched tensor. Rank k+1, where k >= 1. It can be one of the following data types:
          float32, float16, int32, int8, and uint8.
--- a/mindspore/nn/layer/math.py
+++ b/mindspore/nn/layer/math.py
@ -105,6 +105,13 @@ class Range(Cell):
    r"""
    Creates a sequence of numbers in range [start, limit) with step size delta.

+    The size of output is :math:`\left \lceil \frac{limit-start}{delta} \right \rceil` and `delta` is the gap
+    between two values in the tensor.
+
+    .. math::
+
+        out_{i+1} = out_{i} + delta
+
    Args:
        start (Union[int, float]): If `limit` is `None`, the value acts as limit in the range and first entry
            defaults to `0`. Otherwise, it acts as first entry in the range.
@ -830,7 +837,9 @@ class MatMul(Cell):
    - If at least one of x1 and x2 is N-dimensional (N>2), the none-matrix dimensions(batch) of inputs will be
      broadcasted and must be broadcastable. Note if one of 'x1' and 'x2' is 1-dimensional, the argument will first be
      expanded to 2 dimension and then the none-matrix dimensions will be broadcasted. After the matrix multiply, the
-      expanded dimension will be removed.
+      expanded dimension will be removed. For example, if `x1` is a :math:`(j \times 1 \times n \times m)` tensor and
+      `x2` is a :math:`(k \times m \times p)` tensor, the output will be a :math:`(j \times k \times n \times p)`
+      tensor.

    Args:
        transpose_x1 (bool): If true, `x1` is transposed before multiplication. Default: False.
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@ -212,6 +212,61 @@ class FakeQuantWithMinMaxObserver(UniformQuantObserver):
    r"""
    Quantization aware operation which provides the fake quantization observer function on data with min and max.

+    The running min/max :math:`x_\text{min}` and :math:`x_\text{max}` are computed as:
+
+    .. math::
+
+        \begin{array}{ll} \\
+            x_\text{min} =
+            \begin{cases}
+                \min(\min(X), 0)
+                  & \text{ if } ema = \text{False} \\
+                \min((1 - c) \min(X) + c \, x_\text{min}, 0)
+                  & \text{ otherwise}
+            \end{cases}\\
+            x_\text{max} =
+            \begin{cases}
+                \max(\max(X), 0)
+                  & \text{ if } ema = \text{False} \\
+                \max((1 - c) \max(X) + c \, x_\text{max}, 0)
+                  & \text{ otherwise}
+            \end{cases}
+        \end{array}
+
+    where X is the input tensor, and :math:`c` is the `ema_decay`.
+
+    The scale and zero point, :math:`scale` and :math:`zp`, are computed as:
+
+    .. math::
+
+        \begin{array}{ll} \\
+            scale =
+            \begin{cases}
+                \frac{x_\text{max} - x_\text{min}}{Q_\text{max} - Q_\text{min}}
+                  & \text{ if } symmetric = \text{False} \\
+                \frac{2\max(x_\text{max}, \left | x_\text{min} \right |) }{Q_\text{max} - Q_\text{min}}
+                  & \text{ otherwise}
+            \end{cases}\\
+            zp\_min = Q_\text{min} - \frac{x_\text{min}}{scale} \\
+            zp = \left \lfloor \min(Q_\text{max}, \max(Q_\text{min}, zp\_min)) + 0.5 \right \rfloor
+        \end{array}
+
+    where :math:`Q_\text{max}` and :math:`Q_\text{min}` are decided by `quant_dtype`, for example, if
+    `quant_dtype` is INT8, then :math:`Q_\text{max} = 127` and :math:`Q_\text{min} = -128`.
+
+    The fake quant output is computed as:
+
+    .. math::
+
+        \begin{array}{ll} \\
+            u_\text{min} = (Q_\text{min} - zp) * scale \\
+            u_\text{max} = (Q_\text{max} - zp) * scale \\
+            u_X = \left \lfloor \frac{\min(u_\text{max}, \max(u_\text{min}, X)) - u_\text{min}}{scale}
+            + 0.5 \right \rfloor \\
+            output = u_X * scale + u_\text{min}
+        \end{array}
+
+
    Args:
        min_init (int, float): The initialized min value. Default: -6.
        max_init (int, float): The initialized max value. Default: 6.
@ -337,7 +392,8 @@ class Conv2dBnFoldQuantOneConv(Cell):
    r"""
    2D convolution which use the convolution layer statistics once to calculate BatchNormal operation folded construct.

-    This part is a more detailed overview of Conv2d operation.
+    This part is a more detailed overview of Conv2d operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@ -365,7 +421,7 @@ class Conv2dBnFoldQuantOneConv(Cell):
        var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the
            variance vector. Default: 'ones'.
        fake (bool): Whether Conv2dBnFoldQuant Cell adds FakeQuantWithMinMaxObserver. Default: True.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@ -537,7 +593,8 @@ class Conv2dBnFoldQuant(Cell):
    r"""
    2D convolution with BatchNormal operation folded construct.

-    This part is a more detailed overview of Conv2d operation.
+    This part is a more detailed overview of Conv2d operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@ -565,7 +622,7 @@ class Conv2dBnFoldQuant(Cell):
        var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the
            variance vector. Default: 'ones'.
        fake (bool): Whether Conv2dBnFoldQuant Cell adds FakeQuantWithMinMaxObserver. Default: True.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@ -725,7 +782,8 @@ class Conv2dBnWithoutFoldQuant(Cell):
    r"""
    2D convolution and batchnorm without fold with fake quantized construct.

-    This part is a more detailed overview of Conv2d operation.
+    This part is a more detailed overview of Conv2d operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@ -743,7 +801,7 @@ class Conv2dBnWithoutFoldQuant(Cell):
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Default: 'zeros'.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@ -840,7 +898,8 @@ class Conv2dQuant(Cell):
    r"""
    2D convolution with fake quantized operation layer.

-    This part is a more detailed overview of Conv2d operation.
+    This part is a more detailed overview of Conv2d operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        in_channels (int): The number of input channel :math:`C_{in}`.
@ -856,7 +915,7 @@ class Conv2dQuant(Cell):
        weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
            Default: 'normal'.
        bias_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the bias vector. Default: 'zeros'.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@ -950,7 +1009,8 @@ class DenseQuant(Cell):
    r"""
    The fully connected layer with fake quantized operation.

-    This part is a more detailed overview of Dense operation.
+    This part is a more detailed overview of Dense operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        in_channels (int): The dimension of the input space.
@ -962,7 +1022,7 @@ class DenseQuant(Cell):
        has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
        activation (Union[str, Cell, Primitive]): The regularization function applied to the output of the layer,
            eg. 'relu'. Default: None.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@ -1066,20 +1126,21 @@ class ActQuant(_QuantActivation):
    Quantization aware training activation function.

    Add the fake quantized operation to the end of activation operation, by which the output of activation operation
-    will be truncated. Please check `FakeQuantWithMinMaxObserver` or other observer for more details.
+    will be truncated. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        activation (Cell): Activation cell.
        ema (bool): The exponential Moving Average algorithm updates min and max. Default: False.
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
        fake_before (bool): Whether add fake quantized operation before activation. Default: False.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.

    Inputs:
-        - **input** (Tensor) - The input of ReLU6Quant.
+        - **input** (Tensor) - The input of ActQuant.

    Outputs:
        Tensor, with the same type and shape as the `input`.
@ -1134,11 +1195,12 @@ class TensorAddQuant(Cell):
    r"""
    Add fake quantized operation after TensorAdd operation.

-    This part is a more detailed overview of TensorAdd operation.
+    This part is a more detailed overview of TensorAdd operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
@ -1186,11 +1248,12 @@ class MulQuant(Cell):
    r"""
    Add fake quantized operation after `Mul` operation.

-    This part is a more detailed overview of `Mul` operation.
+    This part is a more detailed overview of `Mul` operation. For more details about quantization,
+    please refer to :class:`mindspore.nn.FakeQuantWithMinMaxObserver`.

    Args:
        ema_decay (float): Exponential Moving Average algorithm parameter. Default: 0.999.
-        quant_config (QuantConfig): Configs the observer types and quant settings of weight and activation. Can be
+        quant_config (QuantConfig): Configures the observer types and quant settings of weight and activation. Can be
            generated by compression.quant.create_quant_config method.
            Default: both set to default FakeQuantWithMinMaxObserver.
        quant_dtype (QuantDtype): Specifies the FakeQuant datatype. Default: QuantDtype.INT8.
--- a/mindspore/nn/optim/ftrl.py
+++ b/mindspore/nn/optim/ftrl.py
@ -73,7 +73,7 @@ def _check_param(initial_accum, lr_power, l1, l2, use_locking, prim_name=None):


 class FTRL(Optimizer):
-    """
+    r"""
    Implements the FTRL algorithm with ApplyFtrl Operator.

    FTRL is an online convex optimization algorithm that adaptively chooses its regularization function
@ -81,6 +81,26 @@ class FTRL(Optimizer):
    <https://arxiv.org/abs/1002.4908>`_. Refer to paper `Ad Click Prediction: a View from the Trenches
    <https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf>`_ for engineering document.

+    The updating formulas are as follows,
+
+    .. math::
+
+        \begin{array}{ll} \\
+            m_{t+1} = m_{t} + g^2 \\
+            u_{t+1} = u_{t} + g - \frac{m_{t+1}^{-p} - m_{t}^{-p}}{\alpha } * \omega_{t} \\
+            \omega_{t+1} =
+            \begin{cases}
+                \frac{(sign(u_{t+1}) * l1 - u_{t+1})}{\frac{m_{t+1}^{-p}}{\alpha } + 2 * l2 }
+                    & \text{ if } |u_{t+1}| > l1 \\
+                0.0
+                    & \text{ otherwise }
+            \end{cases}
+        \end{array}
+
+    :math:`m` represents `accum`, :math:`g` represents `grads`, :math:`t` represents updating step,
+    :math:`u` represents `linear`, :math:`p` represents `lr_power`, :math:`\alpha` represents `learning_rate`,
+    :math:`\omega` represents `params`.
+
    Note:
        When separating parameter groups, the weight decay in each group will be applied on the parameters if the
        weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
--- a/mindspore/nn/optim/lars.py
+++ b/mindspore/nn/optim/lars.py
@ -47,12 +47,32 @@ def _check_param_value(optimizer, epsilon, coefficient, use_clip, prim_name):


 class LARS(Optimizer):
-    """
+    r"""
    Implements the LARS algorithm with LARSUpdate Operator.

    LARS is an optimization algorithm employing a large batch optimization technique. Refer to paper `LARGE BATCH
    TRAINING OF CONVOLUTIONAL NETWORKS <https://arxiv.org/abs/1708.03888>`_.

+    The updating formulas are as follows,
+
+    .. math::
+
+        \begin{array}{ll} \\
+            \lambda  = \frac{\theta  \text{ * } || \omega  ||  }{|| g_{t} || \text{ + } \delta \text{ * } || \omega  || }  \\
+            \lambda  =
+            \begin{cases}
+                \min(\frac{\lambda}{\alpha }, 1)
+                    & \text{ if } clip = True \\
+                \lambda
+                    & \text{ otherwise }
+            \end{cases}\\
+            g_{t+1} = \lambda * (g_{t} + \delta * \omega)
+        \end{array}
+
+    :math:`\theta` represents `coefficient`, :math:`\omega` represents `parameters`, :math:`g` represents `gradients`,
+    :math:`t` represents updating step, :math:`\delta` represents `weight_decay`,
+    :math:`\alpha` represents `learning_rate`, :math:`clip` represents `use_clip`.
+
    Args:
        optimizer (Optimizer): MindSpore optimizer for which to wrap and modify gradients.
        epsilon (float): Term added to the denominator to improve numerical stability. Default: 1e-05.