@@ -22,30 +22,109 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"

#define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+  __global__ void op_type(const half *in1, const half *in2, half *out) { \
    out[0] = in1[0] sign in2[0]; \
  }

#define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+  __global__ void op_type(half *in1, const half *in2) { in1[0] sign in2[0]; }

#define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+  __global__ void op_type(const half *in1, const half *in2, bool *out) { \
    out[0] = in1[0] sign in2[0]; \
  }

+#ifdef PADDLE_WITH_HIP
+#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_out), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<half *>(malloc(size)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
+    hipMemcpy(out, d_out, size, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
+    free(in1); \
+    free(in2); \
+    free(out); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+    hipFree(d_out); \
+  }
+
+#define COMPOUND_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2); \
+    hipMemcpy(in1, d_in1, size, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
+    free(in1); \
+    free(in2); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+  }
+
+#define COMPARISON_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    bool *out, *d_out; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_out), 1); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<bool *>(malloc(1)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
+    hipMemcpy(out, d_out, 1, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(out[0], v_out); \
+    free(in1); \
+    free(in2); \
+    free(out); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+    hipFree(d_out); \
+  }
+#else
#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
  void Test##op_type(float v_in1, float v_in2, float v_out) { \
    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
    half *in1, *in2, *out; \
    half *d_in1, *d_in2, *d_out; \
    int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
-    out = reinterpret_cast<half*>(malloc(size)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_out), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<half *>(malloc(size)); \
    in1[0] = half(float16(v_in1)); \
    in2[0] = half(float16(v_in2)); \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -67,10 +146,10 @@ limitations under the License. */
    half *in1, *in2; \
    half *d_in1, *d_in2; \
    int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
    in1[0] = half(float16(v_in1)); \
    in2[0] = half(float16(v_in2)); \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -91,12 +170,12 @@ limitations under the License. */
    half *d_in1, *d_in2; \
    bool *out, *d_out; \
    int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
-    out = reinterpret_cast<bool*>(malloc(1)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_out), 1); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<bool *>(malloc(1)); \
    in1[0] = half(float16(v_in1)); \
    in2[0] = half(float16(v_in2)); \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -111,12 +190,14 @@ limitations under the License. */
    cudaFree(d_in2); \
    cudaFree(d_out); \
  }
+#endif

#ifdef PADDLE_CUDA_FP16
namespace paddle {
namespace platform {

-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || \
+    (defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000)
ARITHMETIC_KERNEL(Add, +)
ARITHMETIC_KERNEL(Sub, -)
ARITHMETIC_KERNEL(Mul, *)
@@ -128,21 +209,37 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
ARITHMETIC_KERNEL_LAUNCH(Div)

// Negative sign kernel
-__global__ void Neg(half* in) { in[0] = -in[0]; }
+__global__ void Neg(half *in) { in[0] = -in[0]; }

void TestNeg(float v_in, float v_out) {
  LOG(INFO) << "Test Neg on GPU!";
  half *in, *d_in;
  int size = sizeof(half);
-  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
-  in = reinterpret_cast<half*>(malloc(size));
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in), size);
+#else
+  cudaMalloc(reinterpret_cast<void **>(&d_in), size);
+#endif
+  in = reinterpret_cast<half *>(malloc(size));
  in[0] = half(float16(v_in));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
+#else
  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+#endif
  Neg<<<1, 1>>>(d_in);
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(in, d_in, size, hipMemcpyDeviceToHost);
+#else
  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+#endif
  EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
  free(in);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in);
+#else
  cudaFree(d_in);
+#endif
}

COMPOUND_KERNEL(AddAssign, +=)
@@ -221,7 +318,7 @@ TEST(float16, lod_tensor_on_gpu) {
  framework::LoDTensor gpu_tensor;
  framework::LoDTensor dst_tensor;

-  float16* src_ptr = src_tensor.mutable_data<float16>(
+  float16 *src_ptr = src_tensor.mutable_data<float16>(
      framework::make_ddim({2, 2}), CPUPlace());

  float16 arr[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
@@ -238,7 +335,7 @@ TEST(float16, lod_tensor_on_gpu) {

  // Sync before comparing LoDTensors
  gpu_ctx.Wait();
-  const float16* dst_ptr = dst_tensor.data<float16>();
+  const float16 *dst_ptr = dst_tensor.data<float16>();
  ASSERT_NE(src_ptr, dst_ptr);
  for (size_t i = 0; i < 4; ++i) {
    EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
@@ -247,7 +344,7 @@ TEST(float16, lod_tensor_on_gpu) {

template <typename T>
struct Functor {
-  bool operator()(const T& val) {
+  bool operator()(const T &val) {
    return std::type_index(typeid(T)) ==
           std::type_index(typeid(platform::float16));
  }
@@ -304,13 +401,13 @@ TEST(float16, cast) {
  auto b = a;
  {
    // change semantic, keep the same value
-    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
+    float16 c = reinterpret_cast<float16 &>(reinterpret_cast<unsigned &>(b));
    EXPECT_EQ(b, c);
  }

  {
    // use uint32 low 16 bit store float16
-    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    uint32_t c = reinterpret_cast<uint32_t &>(b);
    float16 d;
    d.x = c;
    EXPECT_EQ(b, d);