[ROCM] update fluid platform for rocm35 (part1), test=develop (#30639)

* [ROCM] update fluid platform for rocm35 (part1), test=develop

* address review comments, test=develop
revert-31068-fix_conv3d_windows
Qi Li 5 years ago committed by GitHub
parent fc00240575
commit f89da4ab45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -47,7 +47,17 @@ struct PADDLE_ALIGN(2) bfloat16 {
~bfloat16() = default;
HOSTDEVICE inline explicit bfloat16(float val) {
#ifdef PADDLE_WITH_HIP
uint32_t res = 0;
uint32_t* tempRes;
// We should be using memcpy in order to respect the strict aliasing rule
// but it fails in the HIP environment.
tempRes = reinterpret_cast<uint32_t*>(&val);
res = *tempRes;
x = res >> 16;
#else
std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
#endif
}
template <class T>

@ -28,6 +28,11 @@
#include <thrust/complex.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_complex.h>
#include <thrust/complex.h> // NOLINT
#endif
#include <cstring>
#include "paddle/fluid/platform/hostdevice.h"
@ -54,7 +59,7 @@ struct PADDLE_ALIGN(16) complex128 {
~complex128() = default;
HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {}
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
HOSTDEVICE inline explicit complex128(const thrust::complex<double>& c) {
real = c.real();
@ -65,9 +70,15 @@ struct PADDLE_ALIGN(16) complex128 {
return thrust::complex<double>(real, imag);
}
#ifdef PADDLE_WITH_HIP
HOSTDEVICE inline explicit operator hipDoubleComplex() const {
return make_hipDoubleComplex(real, imag);
}
#else
HOSTDEVICE inline explicit operator cuDoubleComplex() const {
return make_cuDoubleComplex(real, imag);
}
#endif
#endif
HOSTDEVICE complex128(const float& val)
@ -202,7 +213,7 @@ struct PADDLE_ALIGN(16) complex128 {
HOSTDEVICE inline complex128 operator+(const complex128& a,
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) +
thrust::complex<double>(b.real, b.imag));
#else
@ -212,7 +223,7 @@ HOSTDEVICE inline complex128 operator+(const complex128& a,
HOSTDEVICE inline complex128 operator-(const complex128& a,
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) -
thrust::complex<double>(b.real, b.imag));
#else
@ -222,7 +233,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a,
HOSTDEVICE inline complex128 operator*(const complex128& a,
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) *
thrust::complex<double>(b.real, b.imag));
#else
@ -233,7 +244,7 @@ HOSTDEVICE inline complex128 operator*(const complex128& a,
HOSTDEVICE inline complex128 operator/(const complex128& a,
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) /
thrust::complex<double>(b.real, b.imag));
#else
@ -244,7 +255,7 @@ HOSTDEVICE inline complex128 operator/(const complex128& a,
}
HOSTDEVICE inline complex128 operator-(const complex128& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(-thrust::complex<double>(a.real, a.imag));
#else
complex128 res;
@ -256,7 +267,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a) {
HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) +=
thrust::complex<double>(b.real, b.imag));
return a;
@ -269,7 +280,7 @@ HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT
HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) -=
thrust::complex<double>(b.real, b.imag));
return a;
@ -282,7 +293,7 @@ HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT
HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) *=
thrust::complex<double>(b.real, b.imag));
return a;
@ -295,7 +306,7 @@ HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT
HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT
const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) /=
thrust::complex<double>(b.real, b.imag));
return a;
@ -339,6 +350,7 @@ HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) {
HOSTDEVICE inline bool(isnan)(const complex128& a) {
#if defined(__CUDA_ARCH__)
// __isnanf not supported on HIP platform
return __isnan(a.real) || __isnan(a.imag);
#else
return std::isnan(a.real) || std::isnan(a.imag);
@ -347,6 +359,7 @@ HOSTDEVICE inline bool(isnan)(const complex128& a) {
HOSTDEVICE inline bool(isinf)(const complex128& a) {
#if defined(__CUDA_ARCH__)
// __isinf not supported on HIP platform
return __isinf(a.real) || __isinf(a.imag);
#else
return std::isinf(a.real) || std::isinf(a.imag);
@ -358,7 +371,7 @@ HOSTDEVICE inline bool(isfinite)(const complex128& a) {
}
HOSTDEVICE inline double(abs)(const complex128& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return thrust::abs(thrust::complex<double>(a.real, a.imag));
#else
return std::abs(std::complex<double>(a.real, a.imag));
@ -366,7 +379,7 @@ HOSTDEVICE inline double(abs)(const complex128& a) {
}
HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::pow(thrust::complex<double>(a.real, a.imag),
thrust::complex<double>(b.real, b.imag)));
#else
@ -375,7 +388,7 @@ HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) {
}
HOSTDEVICE inline complex128(sqrt)(const complex128& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::sqrt(thrust::complex<double>(a.real, a.imag)));
#else
return std::sqrt(std::complex<double>(a));
@ -383,7 +396,7 @@ HOSTDEVICE inline complex128(sqrt)(const complex128& a) {
}
HOSTDEVICE inline complex128(tanh)(const complex128& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::tanh(thrust::complex<double>(a.real, a.imag)));
#else
return std::tanh(std::complex<double>(a));
@ -391,7 +404,7 @@ HOSTDEVICE inline complex128(tanh)(const complex128& a) {
}
HOSTDEVICE inline complex128(log)(const complex128& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::log(thrust::complex<double>(a.real, a.imag)));
#else
return complex128(std::log(std::complex<double>(a)));

@ -27,6 +27,11 @@
#include <thrust/complex.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_complex.h>
#include <thrust/complex.h> // NOLINT
#endif
#include <cstring>
#include "paddle/fluid/platform/complex128.h"
@ -54,7 +59,7 @@ struct PADDLE_ALIGN(8) complex64 {
~complex64() = default;
HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {}
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
HOSTDEVICE inline explicit complex64(const thrust::complex<float>& c) {
real = c.real();
@ -65,9 +70,15 @@ struct PADDLE_ALIGN(8) complex64 {
return thrust::complex<float>(real, imag);
}
#ifdef PADDLE_WITH_HIP
HOSTDEVICE inline explicit operator hipFloatComplex() const {
return make_hipFloatComplex(real, imag);
}
#else
HOSTDEVICE inline explicit operator cuFloatComplex() const {
return make_cuFloatComplex(real, imag);
}
#endif
#endif
HOSTDEVICE complex64(const float& val) : real(val), imag(0) {}
@ -207,7 +218,7 @@ struct PADDLE_ALIGN(8) complex64 {
};
HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) +
thrust::complex<float>(b.real, b.imag));
#else
@ -216,7 +227,7 @@ HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) {
}
HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) -
thrust::complex<float>(b.real, b.imag));
#else
@ -225,7 +236,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) {
}
HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) *
thrust::complex<float>(b.real, b.imag));
#else
@ -235,7 +246,7 @@ HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) {
}
HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) /
thrust::complex<float>(b.real, b.imag));
#else
@ -246,7 +257,7 @@ HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) {
}
HOSTDEVICE inline complex64 operator-(const complex64& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(-thrust::complex<float>(a.real, a.imag));
#else
complex64 res;
@ -258,7 +269,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a) {
HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT
const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) +=
thrust::complex<float>(b.real, b.imag));
return a;
@ -271,7 +282,7 @@ HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT
HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT
const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) -=
thrust::complex<float>(b.real, b.imag));
return a;
@ -284,7 +295,7 @@ HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT
HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT
const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) *=
thrust::complex<float>(b.real, b.imag));
return a;
@ -297,7 +308,7 @@ HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT
HOSTDEVICE inline complex64& operator/=(complex64& a, // NOLINT
const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) /=
thrust::complex<float>(b.real, b.imag));
return a;
@ -341,6 +352,7 @@ HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) {
HOSTDEVICE inline bool(isnan)(const complex64& a) {
#if defined(__CUDA_ARCH__)
// __isnanf not supported on HIP platform
return __isnanf(a.real) || __isnanf(a.imag);
#else
return std::isnan(a.real) || std::isnan(a.imag);
@ -349,6 +361,7 @@ HOSTDEVICE inline bool(isnan)(const complex64& a) {
HOSTDEVICE inline bool(isinf)(const complex64& a) {
#if defined(__CUDA_ARCH__)
// __isinff not supported on HIP platform
return __isinff(a.real) || __isinff(a.imag);
#else
return std::isinf(a.real) || std::isinf(a.imag);
@ -360,7 +373,7 @@ HOSTDEVICE inline bool(isfinite)(const complex64& a) {
}
HOSTDEVICE inline float(abs)(const complex64& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::abs(thrust::complex<float>(a.real, a.imag)));
#else
return std::abs(std::complex<float>(a.real, a.imag));
@ -368,7 +381,7 @@ HOSTDEVICE inline float(abs)(const complex64& a) {
}
HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::pow(thrust::complex<float>(a.real, a.imag),
thrust::complex<float>(b.real, b.imag)));
#else
@ -377,7 +390,7 @@ HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) {
}
HOSTDEVICE inline complex64(sqrt)(const complex64& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::sqrt(thrust::complex<float>(a.real, a.imag)));
#else
return std::sqrt(std::complex<float>(a));
@ -385,7 +398,7 @@ HOSTDEVICE inline complex64(sqrt)(const complex64& a) {
}
HOSTDEVICE inline complex64(tanh)(const complex64& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::tanh(thrust::complex<float>(a.real, a.imag)));
#else
return std::tanh(std::complex<float>(a));
@ -393,7 +406,7 @@ HOSTDEVICE inline complex64(tanh)(const complex64& a) {
}
HOSTDEVICE inline complex64(log)(const complex64& a) {
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::log(thrust::complex<float>(a.real, a.imag)));
#else
return std::log(std::complex<float>(a));

@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#ifndef __NVCC__
#if !defined(__NVCC__) && !defined(__HIPCC__)
#error device_ptr_cast must be include by .cu file
#endif

@ -1,9 +1,9 @@
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
#hip
if (WITH_ROCM_PLATFORM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
if (WITH_ROCM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
endif()
# There is no macOS version of NCCL.
@ -13,7 +13,7 @@ if (NOT APPLE AND NOT WIN32)
if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc)
endif()
if (WITH_ROCM_PLATFORM)
if (WITH_ROCM)
list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
if (WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
@ -29,9 +29,9 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
if(WITH_ROCM_PLATFORM)
if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)

@ -55,7 +55,7 @@ DEFINE_string(miopen_dir, "",
DEFINE_string(rocm_dir, "",
"Specify path for loading rocm library, such as librocblas, "
"libcurand, libcusolver. For instance, /opt/rocm/lib. "
"libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir, "",
@ -264,7 +264,7 @@ void* GetCublasDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true,
{cuda_lib_path});
#elif PADDLE_WITH_HIP
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
@ -292,7 +292,7 @@ void* GetCUDNNDsoHandle() {
"CUDNN version.");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true,
{cuda_lib_path}, win_warn_meg);
#elif PADDLE_WITH_HIP
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
@ -316,7 +316,7 @@ void* GetCurandDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true,
{cuda_lib_path});
#elif PADDLE_WITH_HIP
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
@ -337,8 +337,8 @@ void* GetCusolverDsoHandle() {
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so");
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
@ -347,8 +347,8 @@ void* GetNVRTCDsoHandle() {
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif PADDLE_WITH_HIP
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so");
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
@ -369,15 +369,24 @@ void* GetWarpCTCDsoHandle() {
}
void* GetNCCLDsoHandle() {
#ifdef PADDLE_WITH_HIP
std::string warning_msg(
"You may need to install 'rccl' from ROCM official website: "
"https://rocmdocs.amd.com/en/latest/Installation_Guide/"
"Installation-Guide.html before install PaddlePaddle.");
#else
std::string warning_msg(
"You may need to install 'nccl2' from NVIDIA official website: "
"https://developer.nvidia.com/nccl/nccl-download"
"before install PaddlePaddle.");
#endif
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);

@ -44,6 +44,8 @@ inline const char* miopenGetErrorString(miopenStatus_t status) {
return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError:
default:
return "MIOPEN_STATUS_UNKNOWN_ERROR";
@ -70,6 +72,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
* include all needed miopen functions in HPPL
**/
#define MIOPEN_DNN_ROUTINE_EACH(__macro) \
__macro(miopenGetVersion); \
__macro(miopenSet4dTensorDescriptor); \
__macro(miopenSetTensorDescriptor); \
__macro(miopenInitConvolutionNdDescriptor); \
@ -80,6 +83,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenGetTensorDescriptor); \
__macro(miopenCreateTensorDescriptor); \
__macro(miopenDestroyTensorDescriptor); \
__macro(miopenGetTensorDescriptorSize); \
__macro(miopenSet2dPoolingDescriptor); \
__macro(miopenGet2dPoolingDescriptor); \
__macro(miopenGetPoolingNdForwardOutputDim); \
@ -109,9 +113,12 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenSoftmaxBackward); \
__macro(miopenSoftmaxForward); \
__macro(miopenCreateDropoutDescriptor); \
__macro(miopenDestroyDropoutDescriptor); \
__macro(miopenRestoreDropoutDescriptor); \
__macro(miopenDropoutGetStatesSize); \
__macro(miopenSetDropoutDescriptor); \
__macro(miopenCreateRNNDescriptor); \
__macro(miopenDestroyRNNDescriptor); \
__macro(miopenSetRNNDescriptor); \
__macro(miopenGetRNNParamsSize); \
__macro(miopenGetRNNWorkspaceSize); \
@ -120,8 +127,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenRNNBackwardData); \
__macro(miopenRNNBackwardWeights); \
__macro(miopenRNNForwardInference); \
__macro(miopenDestroyDropoutDescriptor); \
__macro(miopenDestroyRNNDescriptor);
__macro(miopenGetTensorNumBytes);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)

@ -25,6 +25,14 @@ void *rccl_dso_handle;
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if NCCL_VERSION_CODE >= 2212
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle

@ -59,6 +59,18 @@ extern void* rccl_dso_handle;
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle

@ -36,12 +36,11 @@ extern void *rocblas_dso_handle;
*
* note: default dynamic linked libs
*/
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
#define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using rocblas_func = \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
rocblas_status operator()(Args... args) { \
using rocblas_func = decltype(&::__name); \
std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
@ -51,56 +50,65 @@ extern void *rocblas_dso_handle;
}; \
extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(rocblas_saxpy); \
__macro(rocblas_daxpy); \
__macro(rocblas_sscal); \
__macro(rocblas_dscal); \
__macro(rocblas_scopy); \
__macro(rocblas_dcopy); \
__macro(rocblas_sgemv); \
__macro(rocblas_dgemv); \
__macro(rocblas_sgemm); \
__macro(rocblas_dgemm); \
__macro(rocblas_hgemm); \
__macro(rocblas_dgeam); \
/*rocblas_gemm_ex function not support at rocm3.5*/ \
/*__macro(rocblas_gemm_ex); */ \
__macro(rocblas_sgemm_batched); \
__macro(rocblas_dgemm_batched); \
__macro(rocblas_cgemm_batched); \
__macro(rocblas_zgemm_batched); \
__macro(rocblas_create_handle); \
__macro(rocblas_destroy_handle); \
__macro(rocblas_add_stream); \
__macro(rocblas_set_stream); \
__macro(rocblas_get_stream); \
__macro(rocblas_set_pointer_mode); \
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(rocblas_caxpy); \
__macro(rocblas_saxpy); \
__macro(rocblas_daxpy); \
__macro(rocblas_zaxpy); \
__macro(rocblas_sscal); \
__macro(rocblas_dscal); \
__macro(rocblas_scopy); \
__macro(rocblas_dcopy); \
__macro(rocblas_cgemv); \
__macro(rocblas_sgemv); \
__macro(rocblas_zgemv); \
__macro(rocblas_dgemv); \
__macro(rocblas_cgemm); \
__macro(rocblas_sgemm); \
__macro(rocblas_dgemm); \
__macro(rocblas_hgemm); \
__macro(rocblas_zgemm); \
__macro(rocblas_sgeam); \
__macro(rocblas_strsm); \
__macro(rocblas_dtrsm); \
__macro(rocblas_dgeam); \
__macro(rocblas_sgemm_batched); \
__macro(rocblas_dgemm_batched); \
__macro(rocblas_cgemm_batched); \
__macro(rocblas_zgemm_batched); \
__macro(rocblas_create_handle); \
__macro(rocblas_destroy_handle); \
__macro(rocblas_set_stream); \
__macro(rocblas_get_stream); \
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs available after CUDA 8.0
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(rocblas_gemm_ex); \
__macro(rocblas_sgemm_strided_batched); \
__macro(rocblas_dgemm_strided_batched); \
__macro(rocblas_cgemm_strided_batched); \
__macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// HIP not supported in ROCM3.5
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
// __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \
// rocm not support now(rocm3.5)
// __macro(rocblas_gemm_strided_batched_ex);
__macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
#undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle

@ -55,6 +55,7 @@ extern bool HasCUDADriver();
__macro(hipModuleLaunchKernel); \
__macro(hipLaunchKernel); \
__macro(hipGetDevice); \
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);

File diff suppressed because it is too large Load Diff

@ -295,7 +295,7 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
EXPECT_TRUE(caught_eof);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
PADDLE_ENFORCE_CUDA_SUCCESS(value);
@ -312,7 +312,35 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) {
return ex_msg.find(msg) != std::string::npos;
}
}
#ifdef PADDLE_WITH_HIP
// Verifies that PADDLE_ENFORCE_CUDA_SUCCESS correctly classifies ROCm status
// codes: success values pass through, and each failing value produces an
// exception whose message carries the library-specific prefix
// ("Hip error" / "Hiprand error" / "Miopen error" / "Rocblas error" /
// "Rccl error") — see CheckCudaStatusSuccess/CheckCudaStatusFailure above.
TEST(enforce, hip_success) {
// HIP runtime status codes.
EXPECT_TRUE(CheckCudaStatusSuccess(hipSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(hipErrorInvalidValue, "Hip error"));
EXPECT_TRUE(CheckCudaStatusFailure(hipErrorOutOfMemory, "Hip error"));
// hipRAND status codes.
EXPECT_TRUE(CheckCudaStatusSuccess(HIPRAND_STATUS_SUCCESS));
EXPECT_TRUE(
CheckCudaStatusFailure(HIPRAND_STATUS_VERSION_MISMATCH, "Hiprand error"));
EXPECT_TRUE(
CheckCudaStatusFailure(HIPRAND_STATUS_NOT_INITIALIZED, "Hiprand error"));
// MIOpen status codes.
EXPECT_TRUE(CheckCudaStatusSuccess(miopenStatusSuccess));
EXPECT_TRUE(
CheckCudaStatusFailure(miopenStatusNotInitialized, "Miopen error"));
EXPECT_TRUE(CheckCudaStatusFailure(miopenStatusAllocFailed, "Miopen error"));
// rocBLAS status codes.
EXPECT_TRUE(CheckCudaStatusSuccess(rocblas_status_success));
EXPECT_TRUE(
CheckCudaStatusFailure(rocblas_status_invalid_handle, "Rocblas error"));
EXPECT_TRUE(
CheckCudaStatusFailure(rocblas_status_invalid_value, "Rocblas error"));
// RCCL status codes — only when RCCL is built in (and not on macOS).
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error"));
#endif
}
#else
TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
@ -341,6 +369,7 @@ TEST(enforce, cuda_success) {
#endif
}
#endif
#endif
struct CannotToStringType {
explicit CannotToStringType(int num) : num_(num) {}

File diff suppressed because it is too large Load Diff

@ -1,3 +1,3 @@
IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost)
ENDIF()

@ -20,7 +20,11 @@ namespace paddle {
namespace platform {
namespace stream {
#ifdef PADDLE_WITH_HIP
constexpr unsigned int kDefaultFlag = hipStreamDefault;
#else
constexpr unsigned int kDefaultFlag = cudaStreamDefault;
#endif
bool CUDAStream::Init(const Place& place, const Priority& priority) {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
@ -29,11 +33,21 @@ bool CUDAStream::Init(const Place& place, const Priority& priority) {
place_ = place;
CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device);
if (priority == Priority::kHigh) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithPriority(&stream_, kDefaultFlag, -1));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1));
#endif
} else if (priority == Priority::kNormal) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithPriority(&stream_, kDefaultFlag, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0));
#endif
}
callback_manager_.reset(new StreamCallbackManager(stream_));
VLOG(3) << "CUDAStream Init stream: " << stream_
@ -46,12 +60,27 @@ void CUDAStream::Destroy() {
Wait();
WaitCallback();
if (stream_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
#endif
}
stream_ = nullptr;
}
void CUDAStream::Wait() const {
#ifdef PADDLE_WITH_HIP
hipError_t e_sync = hipSuccess;
#if !defined(_WIN32)
e_sync = hipStreamSynchronize(stream_);
#else
while (e_sync = hipStreamQuery(stream_)) {
if (e_sync == hipErrorNotReady) continue;
break;
}
#endif
#else
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
@ -61,6 +90,7 @@ void CUDAStream::Wait() const {
break;
}
#endif
#endif // PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
}

@ -26,7 +26,7 @@ namespace paddle {
namespace platform {
namespace stream {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
enum class Priority : uint8_t {
kNull = 0x0,
@ -51,28 +51,55 @@ class CUDAStream final {
}
template <typename Callback>
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const {
callback();
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev, Callback callback) const {
callback();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
}
#endif
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
}
#endif
#ifdef PADDLE_WITH_HIP
void WaitEvent(hipEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream_, ev, 0));
}
#else
void WaitEvent(cudaEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0));
}
#endif
void Wait() const;
void WaitCallback() const { callback_manager_->Wait(); }
#ifdef PADDLE_WITH_HIP
const hipStream_t& raw_stream() const { return stream_; }
#else
const cudaStream_t& raw_stream() const { return stream_; }
#endif
void Destroy();
private:
Place place_;
#ifdef PADDLE_WITH_HIP
hipStream_t stream_{nullptr};
#else
cudaStream_t stream_{nullptr};
#endif
Priority priority_{Priority::kNormal};
std::unique_ptr<StreamCallbackManager> callback_manager_;

@ -0,0 +1,37 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Portability shim: exposes a single gpu* vocabulary (stream/error/event
// types and the success code) that resolves to the HIP runtime when built
// with PADDLE_WITH_HIP and to the CUDA runtime otherwise, so shared code
// can be written once against the gpu* names.
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
namespace paddle {
#ifdef PADDLE_WITH_HIP
// NOTE: gpuSuccess is a macro rather than a typed constant — presumably
// because hipSuccess and cudaSuccess belong to different enum types; confirm
// before changing.
#define gpuSuccess hipSuccess
using gpuStream_t = hipStream_t;
using gpuError_t = hipError_t;
using gpuEvent_t = hipEvent_t;
#else
#define gpuSuccess cudaSuccess
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;
using gpuEvent_t = cudaEvent_t;
#endif
} // namespace paddle

@ -30,7 +30,7 @@ ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN yum install -y epel-release deltarpm sudo openssh-server openssl-devel gettext-devel sqlite-devel \
RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \
zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
@ -65,6 +65,15 @@ RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo && \
RUN yum install -y rocm-dev rocm-utils rocfft miopen-hip rocblas hipsparse rocrand rccl hipcub rocthrust rocprofiler-dev roctracer-dev
# fix rocthrust
RUN sed -i '21 a #include <thrust/system/hip/config.h>' /opt/rocm/include/thrust/system/hip/detail/error.inl
# export ROCM env
ENV ROCM_PATH=/opt/rocm
ENV HIP_PATH=/opt/rocm/hip
ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
ENV PATH=/opt/rocm/bin:$PATH
ENV PATH=/opt/rocm/hcc/bin:$PATH
ENV PATH=/opt/rocm/hip/bin:$PATH
ENV PATH=/opt/rocm/opencl/bin:$PATH
ENV PATH=/opt/rocm/llvm/bin:$PATH
# git 2.17.1
RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
@ -117,6 +126,13 @@ RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config
# patchelf
RUN yum install -y patchelf && \
yum clean all && \
rm -rf /var/cache/yum && \
rm -rf /var/lib/yum/yumdb && \
rm -rf /var/lib/yum/history
# swig 2.0.12
RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && \

Loading…
Cancel
Save