[ROCM] update fluid operators for rocm (part6), test=develop (#31301)

test_model_benchmark
Qi Li authored 4 years ago; committed by GitHub
parent 1cbccfa594
commit 946dbdae8c

@@ -14,7 +14,11 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_desc.h"
+#else
 #include "paddle/fluid/platform/cudnn_desc.h"
+#endif
 namespace paddle {
 namespace operators {

@@ -14,7 +14,11 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_desc.h"
+#else
 #include "paddle/fluid/platform/cudnn_desc.h"
+#endif
 namespace paddle {
 namespace platform {
@@ -29,35 +33,71 @@ using platform::ActivationDescriptor;
 using platform::TensorDescriptor;
 using platform::CUDADeviceContext;
+#ifdef PADDLE_WITH_HIP
+#define GPUDNN_ACTIVATION_RELU miopenActivationRELU
+#define GPUDNN_ACTIVATION_CLIPPED_RELU miopenActivationCLIPPEDRELU
+#define GPUDNN_ACTIVATION_SIGMOID miopenActivationLOGISTIC
+#define GPUDNN_ACTIVATION_TANH miopenActivationTANH
+#else
+#define GPUDNN_ACTIVATION_RELU CUDNN_ACTIVATION_RELU
+#define GPUDNN_ACTIVATION_CLIPPED_RELU CUDNN_ACTIVATION_CLIPPED_RELU
+#define GPUDNN_ACTIVATION_SIGMOID CUDNN_ACTIVATION_SIGMOID
+#define GPUDNN_ACTIVATION_TANH CUDNN_ACTIVATION_TANH
+#endif
 template <typename T>
 struct CudnnActivationFunctor {
   using ELEMENT_TYPE = T;
+#ifdef PADDLE_WITH_HIP
+  CudnnActivationFunctor(const CUDADeviceContext& ctx, const T& c,
+                         const miopenActivationMode_t& m)
+      : ctx_(ctx), coef_(c), mode_(m) {}
+#else
   CudnnActivationFunctor(const CUDADeviceContext& ctx, const T& c,
                          const cudnnActivationMode_t& m)
       : ctx_(ctx), coef_(c), mode_(m) {}
+#endif
   void operator()(const Tensor& x, Tensor* out) {
     ActivationDescriptor act_desc;
     act_desc.set(mode_, coef_);
     TensorDescriptor x_desc, out_desc;
     x_desc.set(x);
     out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation"));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward(
+        ctx_.cudnn_handle(), act_desc.desc(),
+        platform::CudnnDataType<T>::kOne(), x_desc.desc(), x.data<T>(),
+        platform::CudnnDataType<T>::kZero(), out_desc.desc(),
+        out->mutable_data<T>(ctx_.GetPlace())));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward(
         ctx_.cudnn_handle(), act_desc.desc(),
        platform::CudnnDataType<T>::kOne(), x_desc.desc(), x.data<T>(),
        platform::CudnnDataType<T>::kZero(), out_desc.desc(),
        out->mutable_data<T>(ctx_.GetPlace())));
+#endif
   }
   const CUDADeviceContext& ctx_;
   const T coef_;
+#ifdef PADDLE_WITH_HIP
+  const miopenActivationMode_t mode_;
+#else
   const cudnnActivationMode_t mode_;
+#endif
 };
 template <typename T>
 struct CudnnActivationGradFunctor {
   using ELEMENT_TYPE = T;
+#ifdef PADDLE_WITH_HIP
+  CudnnActivationGradFunctor(const CUDADeviceContext& ctx, const T& c,
+                             const miopenActivationMode_t& m)
+      : ctx_(ctx), coef_(c), mode_(m) {}
+#else
   CudnnActivationGradFunctor(const CUDADeviceContext& ctx, const T& c,
                              const cudnnActivationMode_t& m)
       : ctx_(ctx), coef_(c), mode_(m) {}
+#endif
   void operator()(const Tensor& x, const Tensor& out, const Tensor dout,
                   Tensor* dx) {
     ActivationDescriptor act_desc;
@@ -67,27 +107,40 @@ struct CudnnActivationGradFunctor {
     out_desc.set(out);
     dout_desc.set(dout);
     dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad"));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationBackward(
+        ctx_.cudnn_handle(), act_desc.desc(),
+        platform::CudnnDataType<T>::kOne(), out_desc.desc(), out.data<T>(),
+        dout_desc.desc(), dout.data<T>(), x_desc.desc(), x.data<T>(),
+        platform::CudnnDataType<T>::kZero(), dx_desc.desc(),
+        dx->mutable_data<T>(ctx_.GetPlace())));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward(
         ctx_.cudnn_handle(), act_desc.desc(),
         platform::CudnnDataType<T>::kOne(), out_desc.desc(), out.data<T>(),
         dout_desc.desc(), dout.data<T>(), x_desc.desc(), x.data<T>(),
         platform::CudnnDataType<T>::kZero(), dx_desc.desc(),
         dx->mutable_data<T>(ctx_.GetPlace())));
+#endif
   }
   const CUDADeviceContext& ctx_;
   const T coef_;
+#ifdef PADDLE_WITH_HIP
+  const miopenActivationMode_t mode_;
+#else
   const cudnnActivationMode_t mode_;
+#endif
 };
 template <typename T>
 struct CudnnReluFunctor : public CudnnActivationFunctor<T> {
   explicit CudnnReluFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
+      : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
 };
 template <typename T>
 struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
+      : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
@@ -95,13 +148,13 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
 template <typename T>
 struct CudnnRelu6Functor : public CudnnActivationFunctor<T> {
   explicit CudnnRelu6Functor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {}
+      : CudnnActivationFunctor<T>(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
 };
 template <typename T>
 struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {
-  }
+      : CudnnActivationGradFunctor<T>(ctx, 6.0,
+                                      GPUDNN_ACTIVATION_CLIPPED_RELU) {}
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
@@ -109,12 +162,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
 template <typename T>
 struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> {
   explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
+      : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
 };
 template <typename T>
 struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
+      : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
@@ -122,12 +175,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
 template <typename T>
 struct CudnnTanhFunctor : public CudnnActivationFunctor<T> {
   explicit CudnnTanhFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
+      : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
 };
 template <typename T>
 struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
-      : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
+      : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
@@ -183,6 +236,14 @@ namespace ops = paddle::operators;
   __macro(sigmoid, CudnnSigmoidFunctor, CudnnSigmoidGradFunctor); \
   __macro(tanh, CudnnTanhFunctor, CudnnTanhGradFunctor)
+#ifdef PADDLE_WITH_HIP
+#define REGISTER_ACTIVATION_CUDNN_KERNEL(act_type, functor, grad_functor) \
+  REGISTER_OP_KERNEL(act_type, CUDNN, plat::CUDAPlace,                    \
+                     ops::CudnnActivationKernel<ops::functor<float>>);    \
+  REGISTER_OP_KERNEL(                                                     \
+      act_type##_grad, CUDNN, plat::CUDAPlace,                            \
+      ops::CudnnActivationGradKernel<ops::grad_functor<float>>);
+#else
 #define REGISTER_ACTIVATION_CUDNN_KERNEL(act_type, functor, grad_functor) \
   REGISTER_OP_KERNEL(act_type, CUDNN, plat::CUDAPlace,                    \
                      ops::CudnnActivationKernel<ops::functor<float>>,     \
@@ -191,5 +252,6 @@ namespace ops = paddle::operators;
       act_type##_grad, CUDNN, plat::CUDAPlace,                        \
       ops::CudnnActivationGradKernel<ops::grad_functor<float>>,       \
       ops::CudnnActivationGradKernel<ops::grad_functor<double>>);
+#endif
 FOR_EACH_CUDNN_OP_FUNCTOR(REGISTER_ACTIVATION_CUDNN_KERNEL);
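The GPUDNN_ACTIVATION_* aliases added above let a single functor body name the activation mode without caring whether it resolves to a cuDNN or a MIOpen enumerator. Below is a minimal, self-contained sketch of the same aliasing idea; the enums and the BUILD_WITH_HIP flag are stand-ins for illustration, not Paddle or vendor definitions.

// Sketch only: one symbolic activation name, two possible backends.
#include <cstdio>

enum cudnn_mode_t { CUDNN_RELU_MODE = 1, CUDNN_TANH_MODE = 2 };     // stand-in for cudnnActivationMode_t
enum miopen_mode_t { MIOPEN_RELU_MODE = 3, MIOPEN_TANH_MODE = 4 };  // stand-in for miopenActivationMode_t

#ifdef BUILD_WITH_HIP
using gpudnn_mode_t = miopen_mode_t;
#define GPUDNN_RELU MIOPEN_RELU_MODE
#else
using gpudnn_mode_t = cudnn_mode_t;
#define GPUDNN_RELU CUDNN_RELU_MODE
#endif

// Written once against the alias, this struct compiles for either backend.
struct ReluFunctorSketch {
  gpudnn_mode_t mode = GPUDNN_RELU;
};

int main() {
  ReluFunctorSketch f;
  std::printf("relu mode id: %d\n", static_cast<int>(f.mode));
  return 0;
}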

@@ -24,9 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
 DECLARE_bool(use_mkldnn);

@@ -12,7 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifndef PADDLE_WITH_HIP
+// HIP not support cudnnSpatialTfGridGeneratorForward
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
@@ -121,3 +124,5 @@ REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
 REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNAffineGridGradOpKernel<float>,
                    paddle::operators::CUDNNAffineGridGradOpKernel<double>);
+#endif  // not PADDLE_WITH_HIP

@@ -21,6 +21,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -109,7 +112,7 @@ class AffineGridOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::CanCUDNNBeUsed(ctx)) {
       library = framework::LibraryType::kCUDNN;
     }
@@ -226,7 +229,7 @@ class AffineGridOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     framework::LibraryType library_{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
     }

@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <cuda_runtime.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/allclose_op.h"
@@ -67,7 +66,11 @@ struct AllcloseFunctor<platform::CUDADeviceContext, T> {
     int block = 1024;
     int grid = (block - 1 + num) / block;
     grid = (grid > block) ? block : grid;
+#ifdef PADDLE_WITH_HIP
+    hipMemset(out_data, true, sizeof(bool));
+#else
     cudaMemset(out_data, true, sizeof(bool));
+#endif
     AllcloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
         in_data, other_data, rtol, atol, equal_nan, num, out_data);
   }
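The cudaMemset/hipMemset switch above is the smallest form of this port: the two runtimes expose the same signature under different names. A hedged sketch of hiding that behind a helper follows; gpu_memset is a hypothetical wrapper for illustration, not an existing Paddle utility.

// Sketch: one helper so operator code does not repeat the #ifdef at every call site.
#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
inline void gpu_memset(void* dev_ptr, int value, std::size_t bytes) {
  hipMemset(dev_ptr, value, bytes);   // hipMemset(dst, value, sizeBytes)
}
#else
#include <cuda_runtime.h>
inline void gpu_memset(void* dev_ptr, int value, std::size_t bytes) {
  cudaMemset(dev_ptr, value, bytes);  // cudaMemset(devPtr, value, count)
}
#endif

// Usage mirroring the allclose code above: gpu_memset(out_data, 1, sizeof(bool));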

@@ -14,9 +14,15 @@ limitations under the License. */
 #pragma once
-#ifdef __NVCC__
-#include <cub/cub.cuh>
+#if defined(__NVCC__) || defined(__HIPCC__)
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
 #include <limits>
 #include <string>
 #include <typeinfo>

@@ -16,13 +16,28 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/argsort_op.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#ifdef __HIPCC__
+namespace rocprim {
+namespace detail {
+template <>
+struct radix_key_codec_base<paddle::platform::float16>
+    : radix_key_codec_integral<paddle::platform::float16, uint16_t> {};
+}  // namespace detail
+}  // namespace rocprim
+#else
 // set cub base traits in order to handle float16
 namespace cub {
 template <>
@@ -30,6 +45,7 @@ struct NumericTraits<paddle::platform::float16>
     : BaseTraits<FLOATING_POINT, true, false, uint16_t,
                  paddle::platform::float16> {};
 }  // namespace cub
+#endif
 namespace paddle {
 namespace operators {
@@ -139,7 +155,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
                  cub::CountingInputIterator<IndType>>
       segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
-  cudaError_t err;
+  gpuError_t err;
   if (descending) {
     err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
         nullptr, temp_storage_bytes, inp, sorted_out_ptr,
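Two small portability devices carry this file: `namespace cub = hipcub;` lets the existing cub:: radix-sort call sites compile untouched under ROCm, and the error type becomes `gpuError_t` (in Paddle that alias comes from the platform headers). A sketch of the include block in isolation, with the aliases spelled out explicitly for clarity and not taken from Paddle's headers:

// Sketch: same cub:: spelling, two sorting libraries, one error type name.
#ifdef __NVCC__
#include <cuda_runtime.h>
#include "cub/cub.cuh"
using gpuError_t = cudaError_t;   // Paddle defines this alias in its platform headers
#endif
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;           // cub::DeviceSegmentedRadixSort now means hipcub::...
using gpuError_t = hipError_t;
#endif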

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <cublas.h>
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
@@ -42,7 +41,7 @@ __global__ void add_bias_kernel(T* data, int slot_pairs_num, int ins_num,
 }
 template <typename T>
-void add_bias(cudaStream_t stream, T* data, int slot_pairs_num, int ins_num,
+void add_bias(gpuStream_t stream, T* data, int slot_pairs_num, int ins_num,
               int out_dim, const T* bias) {
   add_bias_kernel<<<GET_BLOCKS(slot_pairs_num * ins_num * out_dim),
                     CUDA_NUM_THREADS, 0, stream>>>(data, slot_pairs_num,
@@ -65,7 +64,7 @@ __global__ void add_bias_grad_kernel(const T* dout_data, int slot_pairs_num,
 }
 template <typename T>
-void add_bias_grad(cudaStream_t stream, const T* dout_data, int slot_pairs_num,
+void add_bias_grad(gpuStream_t stream, const T* dout_data, int slot_pairs_num,
                    int ins_num, int out_dim, T* db_data) {
   add_bias_grad_kernel<<<GET_BLOCKS(slot_pairs_num * out_dim), CUDA_NUM_THREADS,
                          0, stream>>>(dout_data, slot_pairs_num, ins_num,
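Changing the helper signatures from `cudaStream_t` to `gpuStream_t` is the stream-type counterpart of the same idea. The alias itself is provided by Paddle's platform layer; a stand-alone version would look roughly like this:

// Sketch only: a single stream type for host-side launch helpers.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// The add_bias/add_bias_grad declarations above can then be written once:
template <typename T>
void add_bias_sketch(gpuStream_t stream, T* data, int slot_pairs_num, int ins_num,
                     int out_dim, const T* bias);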

(File diff suppressed because it is too large.)

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
-#include "cub/cub.cuh"
 #include "paddle/fluid/operators/bce_loss_op.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -105,7 +105,7 @@ TEST(Seq2BatchPadding, CPU) {
                      128);
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(SequencePadding, CUDA) {
   auto place = paddle::platform::CUDAPlace(0);
   auto *context = static_cast<paddle::platform::CUDADeviceContext *>(

@@ -123,7 +123,7 @@ TEST(SequencePoolingGrad, CPU_SUM) {
                         lod2, 128);
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(SequencePoolingGrad, CUDA_SUM) {
   auto place = paddle::platform::CUDAPlace(0);
   auto *context = static_cast<paddle::platform::CUDADeviceContext *>(

@@ -44,10 +44,18 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
     T* seq_data = seq->mutable_data<T>(context.GetPlace());
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>),
+        dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(),
+        seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
+        scales, seq_width);
+#else
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
         seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
         scales, seq_width);
+#endif
   }
 };
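`hipLaunchKernelGGL` is HIP's spelling of the triple-chevron launch; kernel, grid, block, dynamic shared memory, and stream map one-to-one onto the `<<<...>>>` arguments. A small hedged sketch with a trivial kernel, independent of the Paddle code above:

// Sketch: the same kernel, launched with either syntax.
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

__global__ void ScaleKernelSketch(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

#ifdef __HIPCC__
void LaunchScale(float* data, float factor, int n, hipStream_t stream) {
  int threads = 256, blocks = (n + threads - 1) / threads;
  // kernel, grid dim, block dim, shared memory bytes, stream, then kernel arguments
  hipLaunchKernelGGL(ScaleKernelSketch, dim3(blocks), dim3(threads), 0, stream,
                     data, factor, n);
}
#else
void LaunchScale(float* data, float factor, int n, cudaStream_t stream) {
  int threads = 256, blocks = (n + threads - 1) / threads;
  ScaleKernelSketch<<<blocks, threads, 0, stream>>>(data, factor, n);
}
#endif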

@@ -16,7 +16,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/operators/math/softmax_impl.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#else
 #include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -45,6 +49,16 @@ void SoftmaxCUDNNFunctor<T>::operator()(
   if (cudnn_tensor_dims.size() <= 2) {
     cudnn_tensor_dims.resize(4, 1);
   }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_x_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_y_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward(
+      context.cudnn_handle(), CudnnDataType<T>::kOne(), cudnn_x_desc,
+      X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
+      Y->mutable_data<T>(context.GetPlace())));
+#else
   cudnnTensorDescriptor_t cudnn_x_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_y_desc =
@@ -54,6 +68,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
       X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
       Y->mutable_data<T>(context.GetPlace())));
+#endif
 }
 template <typename T>
@@ -74,6 +89,19 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
   if (cudnn_tensor_dims.size() <= 2) {
     cudnn_tensor_dims.resize(4, 1);
   }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_y_desc =
+      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_xgrad_desc =
+      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_ygrad_desc =
+      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward(
+      context.cudnn_handle(), CudnnDataType<T>::kOne(), cudnn_y_desc,
+      Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
+      CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
+      XGrad->mutable_data<T>(context.GetPlace())));
+#else
   cudnnTensorDescriptor_t cudnn_y_desc =
       yDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_xgrad_desc =
@@ -86,15 +114,20 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
       CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
       XGrad->mutable_data<T>(context.GetPlace())));
+#endif
 }
-template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
-template class SoftmaxCUDNNFunctor<double>;
+template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxGradCUDNNFunctor<float>;
-template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<platform::float16>;
+// MIOPEN do not support double
+#ifndef PADDLE_WITH_HIP
+template class SoftmaxCUDNNFunctor<double>;
+template class SoftmaxGradCUDNNFunctor<double>;
+#endif
 template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
                               false>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
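The forward/backward branches above differ mainly in call shape: cuDNN's softmax takes an explicit algorithm and mode, while MIOpen's basic softmax entry points do not, and MIOpen also lacks double-precision support, which is why the double instantiations are compiled out under ROCm. A hedged sketch of the two forward call shapes, assuming float tensors and already-created handles and descriptors (not the Paddle dynload wrappers):

// Sketch: same math, different argument lists. Descriptor/handle setup omitted.
#ifdef PADDLE_WITH_HIP
#include <miopen/miopen.h>
miopenStatus_t SoftmaxForwardSketch(miopenHandle_t handle,
                                    miopenTensorDescriptor_t x_desc, const void* x,
                                    miopenTensorDescriptor_t y_desc, void* y) {
  const float alpha = 1.0f, beta = 0.0f;
  // No algorithm/mode arguments in the basic MIOpen softmax API.
  return miopenSoftmaxForward(handle, &alpha, x_desc, x, &beta, y_desc, y);
}
#else
#include <cudnn.h>
cudnnStatus_t SoftmaxForwardSketch(cudnnHandle_t handle,
                                   cudnnTensorDescriptor_t x_desc, const void* x,
                                   cudnnTensorDescriptor_t y_desc, void* y) {
  const float alpha = 1.0f, beta = 0.0f;
  // cuDNN asks for the algorithm and mode explicitly.
  return cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE,
                             CUDNN_SOFTMAX_MODE_INSTANCE, &alpha, x_desc, x,
                             &beta, y_desc, y);
}
#endif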

@@ -35,7 +35,7 @@ class SoftmaxGradFunctor {
                   framework::Tensor* x_grad);
 };
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 class SoftmaxCUDNNFunctor {
  public:

@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
-#ifdef __NVCC__
+#if defined(__HIPCC__) || defined(__NVCC__)
 #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
 #endif

@@ -278,6 +278,9 @@ class OpTest(unittest.TestCase):
         def is_mkldnn_op_test():
             return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
+        def is_rocm_op_test():
+            return core.is_compiled_with_rocm()
         if not hasattr(cls, "op_type"):
             raise AssertionError(
                 "This test do not have op_type in class attrs, "
@@ -298,7 +301,8 @@ class OpTest(unittest.TestCase):
                 and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
                 and not hasattr(cls, 'exist_fp64_check_grad') \
                 and not is_xpu_op_test() \
-                and not is_mkldnn_op_test():
+                and not is_mkldnn_op_test() \
+                and not is_rocm_op_test():
             raise AssertionError(
                 "This test of %s op needs check_grad with fp64 precision." %
                 cls.op_type)
