[Speed]implement cudnn sequence softmax cudnn (#8978)

* "add softmax cudnn functor support" * "add testing" * "refine cmakelist" * "sequence softmax forward speed up" * "add softmax grad" * "fix sequence softmax test" * "add double precision' * "fix softmax test" * "add softmax cudnn support" * "fix softmax cudnn test" * "add softmax to nn.py" * "fix compile bug" * "refine cmakelist" * "fix ci" * "fix based on comment" * "fix based on comments" * "fix ci"
7 years ago · 128adf53cb
parent 9b9f3f09b3
commit 128adf53cb
14 changed files with 481 additions and 19 deletions
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@ -14,13 +14,86 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 namespace paddle {
 namespace operators {
 namespace math {
 using Tensor = framework::Tensor;
 using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using DataLayout = platform::DataLayout;
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;
 template <typename T>
 void SoftmaxCUDNNFunctor<T>::operator()(
    const platform::CUDADeviceContext& context, const framework::Tensor* X,
    framework::Tensor* Y) {
  // ------------------- cudnn descriptors ---------------------
  ScopedTensorDescriptor xDesc;
  ScopedTensorDescriptor yDesc;
  std::vector<int> cudnn_tensor_dims = framework::vectorize2int(X->dims());
  DataLayout layout = DataLayout::kNCHW;
  if (cudnn_tensor_dims.size() == 5) {
    layout = DataLayout::kNCDHW;
  }
  // NOTE(*) : cudnn softmax only support >= 4D Tensor,
  // fill 1 at unused dims
  if (cudnn_tensor_dims.size() <= 2) {
    cudnn_tensor_dims.resize(4, 1);
  }
  cudnnTensorDescriptor_t cudnn_x_desc =
      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
  cudnnTensorDescriptor_t cudnn_y_desc =
      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward(
      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
      X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
      Y->mutable_data<T>(context.GetPlace())));
 }
 template <typename T>
 void SoftmaxGradCUDNNFunctor<T>::operator()(
    const platform::CUDADeviceContext& context, const framework::Tensor* Y,
    const framework::Tensor* YGrad, framework::Tensor* XGrad) {
  // ------------------- cudnn descriptors ---------------------
  ScopedTensorDescriptor yDesc;
  ScopedTensorDescriptor dyDesc;
  ScopedTensorDescriptor dxDesc;
  std::vector<int> cudnn_tensor_dims = framework::vectorize2int(Y->dims());
  DataLayout layout = DataLayout::kNCHW;
  if (cudnn_tensor_dims.size() == 5) {
    layout = DataLayout::kNCDHW;
  }
  // NOTE(*) : cudnn softmax only support >= 4D Tensor,
  // fill 1 at unused dims
  if (cudnn_tensor_dims.size() <= 2) {
    cudnn_tensor_dims.resize(4, 1);
  }
  cudnnTensorDescriptor_t cudnn_y_desc =
      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
  cudnnTensorDescriptor_t cudnn_xgrad_desc =
      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
  cudnnTensorDescriptor_t cudnn_ygrad_desc =
      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
      Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
      CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
      XGrad->mutable_data<T>(context.GetPlace())));
 }
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@ -33,6 +33,23 @@ class SoftmaxGradFunctor {
                  const framework::Tensor* y_grad, framework::Tensor* x_grad);
 };
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 class SoftmaxCUDNNFunctor {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor* X, framework::Tensor* Y);
 };
 template <typename T>
 class SoftmaxGradCUDNNFunctor {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor* Y, const framework::Tensor* y_grad,
                  framework::Tensor* x_grad);
 };
 #endif
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
@ -0,0 +1,105 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/softmax.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 template <typename T>
 class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<LoDTensor>("X");
    auto* out = ctx.Output<LoDTensor>("Out");
    auto lod = x->lod();
    auto dims = x->dims();
    const size_t level = lod.size() - 1;
    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
                      "The first dimension of Input(X) should be equal to the "
                      "sum of all sequences' lengths.");
    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
                      "The width of each timestep in Input(X) of "
                      "SequenceSoftmaxOp should be 1.");
    out->mutable_data<T>(ctx.GetPlace());
    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
      int start_pos = static_cast<int>(lod[level][i]);
      int end_pos = static_cast<int>(lod[level][i + 1]);
      Tensor x_i = x->Slice(start_pos, end_pos);
      Tensor out_i = out->Slice(start_pos, end_pos);
      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
      framework::DDim dims_i =
          // framework::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL});
          framework::make_ddim({1UL, end_pos - start_pos});
      x_i.Resize(dims_i);
      out_i.Resize(dims_i);
      math::SoftmaxCUDNNFunctor<T>()(
          ctx.template device_context<platform::CUDADeviceContext>(), &x_i,
          &out_i);
    }
  }
 };
 template <typename T>
 class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<LoDTensor>("Out");
    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
    auto* x = ctx.Input<LoDTensor>("X");
    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
    auto lod = x->lod();
    const size_t level = lod.size() - 1;
    x_grad->mutable_data<T>(ctx.GetPlace());
    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
      int start_pos = static_cast<int>(lod[level][i]);
      int end_pos = static_cast<int>(lod[level][i + 1]);
      Tensor out_i = out->Slice(start_pos, end_pos);
      Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
      Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
      out_i.Resize(dims_i);
      out_grad_i.Resize(dims_i);
      x_grad_i.Resize(dims_i);
      math::SoftmaxGradCUDNNFunctor<T>()(
          ctx.template device_context<platform::CUDADeviceContext>(), &out_i,
          &out_grad_i, &x_grad_i);
    }
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::SequenceSoftmaxCUDNNKernel<float>,
                   ops::SequenceSoftmaxCUDNNKernel<double>)
 REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::SequenceSoftmaxGradCUDNNKernel<float>,
                   ops::SequenceSoftmaxGradCUDNNKernel<double>)
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@ -29,6 +29,29 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ "Out");
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
    bool runtime_cudnn_support = false;
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
    }
 #endif
    framework::LibraryType library_ = framework::LibraryType::kPlain;
    if (use_cudnn && runtime_cudnn_support) {
      library_ = framework::LibraryType::kCUDNN;
    }
    std::string data_format = ctx.Attr<std::string>("data_format");
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
        framework::StringToDataLayout(data_format), library_);
  }
 };
 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
@ -41,6 +64,17 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out",
              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
              "of length 1.");
    AddAttr<bool>(
        "use_cudnn",
        "(bool, default false) Only used in cudnn kernel, need install cudnn")
        .SetDefault(false);
    AddAttr<std::string>(
        "data_format",
        "(string, default NCHW) Only used in "
        "An optional string from: \"NHWC\", \"NCHW\". "
        "Defaults to \"NHWC\". Specify the data format of the output data, "
        "the input will be transformed automatically. ")
        .SetDefault("AnyLayout");
    AddComment(R"DOC(
 Sequence Softmax Operator.
@ -91,6 +125,29 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
    bool runtime_cudnn_support = false;
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
    }
 #endif
    framework::LibraryType library_ = framework::LibraryType::kPlain;
    if (use_cudnn && runtime_cudnn_support) {
      library_ = framework::LibraryType::kCUDNN;
    }
    std::string data_format = ctx.Attr<std::string>("data_format");
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
        framework::StringToDataLayout(data_format), library_);
  }
 };
 }  // namespace operators
@ -102,7 +159,9 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
            ops::SequenceSoftmaxGradOp);
 REGISTER_OP_CPU_KERNEL(
    sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
    sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
@ -17,7 +17,10 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
    sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>)
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>)
 REGISTER_OP_CUDA_KERNEL(
    sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext,
                                   double>);
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@ -0,0 +1,62 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
 class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<Tensor>("X");
    auto* Out = context.Output<Tensor>("Out");
    // allocate memory on device.
    Out->mutable_data<T>(context.GetPlace());
    math::SoftmaxCUDNNFunctor<T>()(
        context.template device_context<platform::CUDADeviceContext>(), X, Out);
  }
 };
 template <typename T>
 class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* Out = context.Input<Tensor>("Out");
    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
    // allocate memory on device.
    dX->mutable_data<T>(context.GetPlace());
    math::SoftmaxGradCUDNNFunctor<T>()(
        context.template device_context<platform::CUDADeviceContext>(), Out,
        dOut, dX);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::SoftmaxCUDNNKernel<float>);
 REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::SoftmaxGradCUDNNKernel<float>);
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@ -33,6 +33,29 @@ class SoftmaxOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", x_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
    bool runtime_cudnn_support = false;
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
    }
 #endif
    framework::LibraryType library_ = framework::LibraryType::kPlain;
    if (use_cudnn && runtime_cudnn_support) {
      library_ = framework::LibraryType::kCUDNN;
    }
    std::string data_format = ctx.Attr<std::string>("data_format");
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
        framework::StringToDataLayout(data_format), library_);
  }
 };
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
@ -43,6 +66,17 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
             "The input tensor of softmax. "
             "2-D with shape [batch_size, input_feature_dimensions].");
    AddOutput("Out", "The normalized values with the same shape as X.");
    AddAttr<bool>(
        "use_cudnn",
        "(bool, default false) Only used in cudnn kernel, need install cudnn")
        .SetDefault(false);
    AddAttr<std::string>(
        "data_format",
        "(string, default NCHW) Only used in "
        "An optional string from: \"NHWC\", \"NCHW\". "
        "Defaults to \"NHWC\". Specify the data format of the output data, "
        "the input will be transformed automatically. ")
        .SetDefault("AnyLayout");
    AddComment(R"DOC(
 Softmax Operator.
@ -80,6 +114,29 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
    bool runtime_cudnn_support = false;
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(ctx.GetPlace())) {
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
    }
 #endif
    framework::LibraryType library_ = framework::LibraryType::kPlain;
    if (use_cudnn && runtime_cudnn_support) {
      library_ = framework::LibraryType::kCUDNN;
    }
    std::string data_format = ctx.Attr<std::string>("data_format");
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
        framework::StringToDataLayout(data_format), library_);
  }
 };
 }  // namespace operators
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@ -289,7 +289,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
  if (use_cudnn) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
  }
 #endif
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@ -132,7 +132,7 @@ def detection_output(loc,
    old_shape = scores.shape
    scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
-    scores = ops.softmax(x=scores)
+    scores = nn.softmax(input=scores)
    scores = ops.reshape(x=scores, shape=old_shape)
    scores = nn.transpose(scores, perm=[0, 2, 1])
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@ -39,6 +39,8 @@ __all__ = [
    'sequence_conv',
    'conv2d',
    'sequence_pool',
    'sequence_softmax',
    'softmax',
    'pool2d',
    'batch_norm',
    'beam_search_decode',
@ -1085,6 +1087,30 @@ def sequence_conv(input,
    return helper.append_activation(pre_act)
 def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
    helper = LayerHelper('sequence_softmax', **locals())
    dtype = helper.input_dtype()
    softmax_out = helper.create_tmp_variable(dtype)
    helper.append_op(
        type="sequence_softmax",
        inputs={"X": input},
        outputs={"Out": softmax_out},
        attrs={"use_cudnn": use_cudnn})
    return softmax_out
 def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
    helper = LayerHelper('softmax', **locals())
    dtype = helper.input_dtype()
    softmax_out = helper.create_tmp_variable(dtype)
    helper.append_op(
        type="softmax",
        inputs={"X": input},
        outputs={"Out": softmax_out},
        attrs={"use_cudnn": use_cudnn})
    return softmax_out
 def conv2d(input,
           num_filters,
           filter_size,
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@ -45,13 +45,30 @@ __activations__ = [
 ]
 __all__ = [
-    'mean', 'mul', 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits',
+    'mean',
-    'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
+    'mul',
-    'elementwise_max', 'elementwise_min', 'elementwise_pow', 'clip',
+    'reshape',
-    'clip_by_norm', 'softmax', 'sequence_softmax', 'logical_and', 'logical_or',
+    'scale',
-    'logical_xor', 'logical_not', 'uniform_random',
+    'sigmoid_cross_entropy_with_logits',
-    'uniform_random_batch_size_like', 'gaussian_random',
+    'elementwise_add',
-    'gaussian_random_batch_size_like', 'cumsum', 'scatter'
+    'elementwise_div',
    'elementwise_sub',
    'elementwise_mul',
    'elementwise_max',
    'elementwise_min',
    'elementwise_pow',
    'clip',
    'clip_by_norm',
    'logical_and',
    'logical_or',
    'logical_xor',
    'logical_not',
    'uniform_random',
    'uniform_random_batch_size_like',
    'gaussian_random',
    'gaussian_random_batch_size_like',
    'cumsum',
    'scatter',
 ] + __activations__
 for _OP in set(__all__):
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@ -220,7 +220,7 @@ class TestBook(unittest.TestCase):
            seq_data = layers.data(
                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
            seq = layers.fc(input=seq_data, size=20)
-            self.assertIsNotNone(layers.sequence_softmax(x=seq))
+            self.assertIsNotNone(layers.sequence_softmax(seq))
        print(str(program))
    def test_softmax(self):
@ -228,7 +228,7 @@ class TestBook(unittest.TestCase):
        with program_guard(program):
            data = layers.data(name='data', shape=[10], dtype='float32')
            hid = layers.fc(input=data, size=20)
-            self.assertIsNotNone(layers.softmax(x=hid))
+            self.assertIsNotNone(layers.softmax(hid))
        print(str(program))
    def test_get_places(self):
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@ -16,11 +16,15 @@ import unittest
 import numpy as np
 from op_test import OpTest
 from test_softmax_op import stable_softmax
 import paddle.fluid.core as core
 class TestSequenceSoftmaxOp(OpTest):
    def setUp(self):
        self.op_type = "sequence_softmax"
        self.use_cudnn = False
        self.init_op_type()
        x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
        lod = [[0, 4, 5, 8, 11]]
@ -34,13 +38,32 @@ class TestSequenceSoftmaxOp(OpTest):
        self.inputs = {"X": (x, lod)}
        self.outputs = {"Out": out}
        self.attrs = {'use_cudnn': self.use_cudnn, }
    def init_op_type(self):
        pass
    def test_check_output(self):
        if self.use_cudnn:
            place = core.CUDAPlace(0)
            self.check_output_with_place(place, atol=1e-5)
        else:
            self.check_output()
    def test_check_grad(self):
        if self.use_cudnn:
            place = core.CUDAPlace(0)
            self.check_grad_with_place(
                place, ["X"], "Out", max_relative_error=0.01)
        else:
            self.check_grad(["X"], "Out", max_relative_error=0.01)
 # ----------------cudnn Sequencesoftmax----------------
 class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
    def init_op_type(self):
        self.use_cudnn = True
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
 import paddle.fluid.core as core
 def stable_softmax(x):
@ -27,18 +28,37 @@ def stable_softmax(x):
 class TestSoftmaxOp(OpTest):
    def setUp(self):
        self.op_type = "softmax"
        self.use_cudnn = False
        self.inputs = {
            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
        }
        self.outputs = {
            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
        }
        self.attrs = {'use_cudnn': self.use_cudnn, }
    def init_op_type(self):
        pass
    def test_check_output(self):
        if self.use_cudnn:
            place = core.CUDAPlace(0)
            self.check_output_with_place(place, atol=1e-5)
        else:
            self.check_output()
    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        if self.use_cudnn:
            place = core.CUDAPlace(0)
            self.check_grad_with_place(
                place, ["X"], "Out", max_relative_error=0.01)
        else:
            self.check_grad(["X"], "Out", max_relative_error=0.01)
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
    def init_op_type(self):
        self.use_cudnn = True
 if __name__ == "__main__":