@@ -29,7 +29,7 @@ class Adam(Optimizer):
     of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
     it can dynamically adjusts the learning rate of each parameter using
     the 1st moment estimates and the 2nd moment estimates of the gradient.
 
     The parameter ``param_out`` update rule with gradient ``grad``:
 
     .. math::
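The body of the ``.. math::`` directive is not visible in this hunk. For reference, the rule it describes is the standard Adam update from the cited paper, written here in the docstring's own symbols (``moment_1``/``moment_2`` are the first- and second-moment accumulators, ``t`` the step count); this is an editorial reconstruction, not part of the patch:

.. math::

    t = t + 1

    moment\_1 = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad

    moment\_2 = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad

    learning\_rate_t = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}

    param\_out = param - learning\_rate_t * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}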
@@ -68,13 +68,10 @@ class Adam(Optimizer):
             the regularization setting here in optimizer will be ignored for this parameter. \
             Otherwise, the regularization setting here in optimizer will take effect. \
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
             ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
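Because ``weight_decay`` and ``grad_clip`` are only described in prose in this hunk, a minimal usage sketch may help. It is an illustration, not part of the patch, and it assumes the Paddle 2.x spelling ``paddle.nn.ClipGradByGlobalNorm`` for the global-norm clipping strategy referenced above:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # Clip the global gradient norm to 1.0 and apply a float L2-style
    # weight-decay coefficient of 0.01.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    adam = paddle.optimizer.Adam(learning_rate=0.1,
                                 parameters=linear.parameters(),
                                 weight_decay=0.01,
                                 grad_clip=clip)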
@@ -82,17 +79,17 @@ class Adam(Optimizer):
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
             adam = paddle.optimizer.Adam(learning_rate=0.1,
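Since ``lazy_mode`` is only explained in prose above, a short sketch of enabling it follows; this is an illustration, not part of the patch:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # With lazy_mode=True only the accumulator elements whose gradient
    # appears in the current mini-batch are updated, which can be much
    # faster for very large (e.g. sparse embedding) parameters.
    adam = paddle.optimizer.Adam(learning_rate=0.1,
                                 parameters=linear.parameters(),
                                 lazy_mode=True)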
@@ -105,12 +102,9 @@ class Adam(Optimizer):
 
             # Adam with beta1/beta2 as Tensor and weight_decay as float
             import paddle
-            import numpy as np
 
-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
 
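The construction announced by the ``# Adam with beta1/beta2 as Tensor and weight_decay as float`` comment falls outside the lines shown in this hunk. A minimal sketch of what it would look like, assuming the documented float|Tensor overload of ``beta1``/``beta2``, is given below as an illustration only:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    inp = paddle.rand([10, 10], dtype="float32")
    loss = paddle.mean(linear(inp))
    # beta1/beta2 may be passed as 1-element Tensors instead of floats.
    beta1 = paddle.to_tensor([0.9], dtype="float32")
    beta2 = paddle.to_tensor([0.99], dtype="float32")
    adam = paddle.optimizer.Adam(learning_rate=0.1,
                                 parameters=linear.parameters(),
                                 beta1=beta1,
                                 beta2=beta2,
                                 weight_decay=0.01)
    loss.backward()
    adam.step()
    adam.clear_grad()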
@@ -140,8 +134,8 @@ class Adam(Optimizer):
                  parameters=None,
                  weight_decay=None,
                  grad_clip=None,
-                 name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
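Note that this hunk swaps the positions of ``name`` and ``lazy_mode`` in the signature, so any caller that passed them positionally would silently bind the wrong values. A small illustration (not part of the patch) of the robust keyword style:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # Keyword arguments keep call sites correct regardless of where
    # lazy_mode and name sit in the signature.
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 parameters=linear.parameters(),
                                 lazy_mode=False,
                                 name="adam_opt")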
@@ -258,7 +252,7 @@ class Adam(Optimizer):
     def step(self):
         """
         Execute the optimizer and update parameters once.
 
         Returns:
             None
 
@@ -266,13 +260,11 @@ class Adam(Optimizer):
             .. code-block:: python
 
                 import paddle
-                import numpy as np
-                paddle.disable_static()
-                value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.to_tensor(value)
+
+                a = paddle.rand([2,13], dtype="float32")
                 linear = paddle.nn.Linear(13, 5)
                 # This can be any optimizer supported by dygraph.
                 adam = paddle.optimizer.Adam(learning_rate = 0.01,
                                              parameters = linear.parameters())
                 out = linear(a)
                 out.backward()
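The hunk ends at ``out.backward()``, so the call that ``step()`` itself documents is not visible here. A self-contained continuation of the example, shown as an illustration of the intended usage rather than as part of the patch:

.. code-block:: python

    import paddle

    a = paddle.rand([2, 13], dtype="float32")
    linear = paddle.nn.Linear(13, 5)
    adam = paddle.optimizer.Adam(learning_rate=0.01,
                                 parameters=linear.parameters())
    out = linear(a)
    out.backward()
    # Apply one parameter update with the accumulated gradients,
    # then clear them before the next iteration.
    adam.step()
    adam.clear_grad()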