[ROCM] update fluid platform for rocm39 (part2), test=develop (#30774)

revert-31068-fix_conv3d_windows
Qi Li 4 years ago committed by GitHub
parent 5ded39f226
commit 34f1628ce8

File diff suppressed because it is too large.
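Several hunks below replace cudaStream_t/cudaError_t in public signatures with the backend-neutral aliases gpuStream_t and gpuError_t (and compare results against gpuSuccess). Those aliases come from paddle/fluid/platform/type_defs.h, which the gpu_info.h hunk below starts including and whose own diff is not shown here. The following is only a minimal sketch of that aliasing pattern under the usual CUDA/HIP mapping; the exact contents and namespace placement of the real header are assumptions.

// Illustrative sketch only: the real type_defs.h is not shown in this diff.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

namespace paddle {
#ifdef PADDLE_WITH_HIP
#define gpuSuccess hipSuccess
using gpuStream_t = hipStream_t;
using gpuError_t = hipError_t;
#else
#define gpuSuccess cudaSuccess
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;
#endif
}  // namespace paddle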

@@ -22,30 +22,109 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#define ARITHMETIC_KERNEL(op_type, sign) \
__global__ void op_type(const half *in1, const half *in2, half *out) { \
out[0] = in1[0] sign in2[0]; \
}
#define COMPOUND_KERNEL(op_type, sign) \
__global__ void op_type(half *in1, const half *in2) { in1[0] sign in2[0]; }
#define COMPARISON_KERNEL(op_type, sign) \
__global__ void op_type(const half *in1, const half *in2, bool *out) { \
out[0] = in1[0] sign in2[0]; \
}
#ifdef PADDLE_WITH_HIP
#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \
hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
hipMalloc(reinterpret_cast<void **>(&d_out), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
hipMemcpy(out, d_out, size, hipMemcpyDeviceToHost); \
EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \
free(in2); \
free(out); \
hipFree(d_in1); \
hipFree(d_in2); \
hipFree(d_out); \
}
#define COMPOUND_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2; \
half *d_in1, *d_in2; \
int size = sizeof(half); \
hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2); \
hipMemcpy(in1, d_in1, size, hipMemcpyDeviceToHost); \
EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \
free(in2); \
hipFree(d_in1); \
hipFree(d_in2); \
}
#define COMPARISON_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, bool v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2; \
half *d_in1, *d_in2; \
bool *out, *d_out; \
int size = sizeof(half); \
hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
hipMalloc(reinterpret_cast<void **>(&d_out), 1); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<bool *>(malloc(1)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
hipMemcpy(out, d_out, 1, hipMemcpyDeviceToHost); \
EXPECT_EQ(out[0], v_out); \
free(in1); \
free(in2); \
free(out); \
hipFree(d_in1); \
hipFree(d_in2); \
hipFree(d_out); \
}
#else
#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \
cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
cudaMalloc(reinterpret_cast<void **>(&d_out), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -67,10 +146,10 @@ limitations under the License. */
half *in1, *in2; \
half *d_in1, *d_in2; \
int size = sizeof(half); \
cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -91,12 +170,12 @@ limitations under the License. */
half *d_in1, *d_in2; \
bool *out, *d_out; \
int size = sizeof(half); \
cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
cudaMalloc(reinterpret_cast<void **>(&d_out), 1); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<bool *>(malloc(1)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -111,12 +190,14 @@ limitations under the License. */
cudaFree(d_in2); \
cudaFree(d_out); \
}
#endif
#ifdef PADDLE_CUDA_FP16
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_HIP) || \
(defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000)
ARITHMETIC_KERNEL(Add, +)
ARITHMETIC_KERNEL(Sub, -)
ARITHMETIC_KERNEL(Mul, *)
@@ -128,21 +209,37 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
ARITHMETIC_KERNEL_LAUNCH(Div)
// Negative sign kernel
__global__ void Neg(half *in) { in[0] = -in[0]; }
void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in;
int size = sizeof(half);
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void **>(&d_in), size);
#else
cudaMalloc(reinterpret_cast<void **>(&d_in), size);
#endif
in = reinterpret_cast<half *>(malloc(size));
in[0] = half(float16(v_in));
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
#else
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
#endif
Neg<<<1, 1>>>(d_in);
#ifdef PADDLE_WITH_HIP
hipMemcpy(in, d_in, size, hipMemcpyDeviceToHost);
#else
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
#endif
EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in);
#ifdef PADDLE_WITH_HIP
hipFree(d_in);
#else
cudaFree(d_in);
#endif
}
COMPOUND_KERNEL(AddAssign, +=)
@@ -221,7 +318,7 @@ TEST(float16, lod_tensor_on_gpu) {
framework::LoDTensor gpu_tensor;
framework::LoDTensor dst_tensor;
float16 *src_ptr = src_tensor.mutable_data<float16>(
framework::make_ddim({2, 2}), CPUPlace());
float16 arr[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
@@ -238,7 +335,7 @@ TEST(float16, lod_tensor_on_gpu) {
// Sync before comparing LoDTensors
gpu_ctx.Wait();
const float16 *dst_ptr = dst_tensor.data<float16>();
ASSERT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 4; ++i) {
EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
@@ -247,7 +344,7 @@ TEST(float16, lod_tensor_on_gpu) {
template <typename T>
struct Functor {
bool operator()(const T &val) {
return std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16));
}
@@ -304,13 +401,13 @@ TEST(float16, cast) {
auto b = a;
{
// change semantic, keep the same value
float16 c = reinterpret_cast<float16 &>(reinterpret_cast<unsigned &>(b));
EXPECT_EQ(b, c);
}
{
// use uint32 low 16 bit store float16
uint32_t c = reinterpret_cast<uint32_t &>(b);
float16 d;
d.x = c;
EXPECT_EQ(b, d);

@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h>
@@ -336,7 +337,7 @@ void RecvBroadCastCommID(int server_fd, std::string endpoint,
template void RecvBroadCastCommID<Type>(std::string endpoint, \
std::vector<Type> * nccl_ids);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
INSTANT_TEMPLATE(ncclUniqueId)
#endif
#ifdef PADDLE_WITH_XPU_BKCL

@@ -14,7 +14,8 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
#include <functional>
#include <string>
#include <vector>

File diff suppressed because it is too large.

@@ -15,11 +15,19 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Note: this header is used to simplify the HIP and CUDA type strings
#include <stddef.h>
#include <string>
#include <vector>
#include "paddle/fluid/platform/type_defs.h"
namespace paddle {
namespace platform {
@@ -86,28 +94,36 @@ size_t GpuMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
#ifdef PADDLE_WITH_HIP
enum hipMemcpyKind kind, hipStream_t stream);
#else
enum cudaMemcpyKind kind, cudaStream_t stream);
#endif
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
#ifdef PADDLE_WITH_HIP
enum hipMemcpyKind kind);
#else
enum cudaMemcpyKind kind);
#endif
//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream);
//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count);
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);
//! Blocks until stream has completed all operations.
void GpuStreamSync(gpuStream_t stream);
//! CudaMalloc with recorded info
gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
//! CudaFree with recorded info
void RecordedCudaFree(void *p, size_t size, int dev_id);
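Since the declarations above now take gpuStream_t, call sites can stay largely backend-agnostic; only the memcpy-kind enum still differs between the two runtimes. A minimal usage sketch follows; the helper name CopyHostToDeviceAsync is hypothetical and not part of this commit.

#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace platform {

// Hypothetical helper built on the declarations above; assumes a valid
// stream was created elsewhere (e.g. by a device context).
void CopyHostToDeviceAsync(void *dst, const void *src, size_t n,
                           gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  GpuMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream);
#else
  GpuMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream);
#endif
  GpuStreamSync(stream);  // block until the copy has completed
}

}  // namespace platform
}  // namespace paddle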

@@ -16,9 +16,13 @@
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <stddef.h>
#include <algorithm>
#include <string>

@@ -14,7 +14,7 @@
#pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include <stdio.h>
#include <memory>
#include <string>
@@ -25,7 +25,12 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/collective_helper.h"
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
@@ -81,7 +86,7 @@ struct NCCLContext {
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
int device_id() const {

@@ -154,7 +154,7 @@ struct PlaceVisitorWrapper
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return visitor_(cuda);
#else
PADDLE_THROW(platform::errors::Unavailable(
@@ -165,7 +165,7 @@ struct PlaceVisitorWrapper
typename Visitor::result_type operator()(
const CUDAPinnedPlace &cuda_pinned) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return visitor_(cuda_pinned);
#else
PADDLE_THROW(platform::errors::Unavailable(

@@ -206,7 +206,7 @@ void EnableProfiler(ProfilerState state) {
g_state = state;
should_send_profile_state = true;
GetDeviceTracer()->Enable();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
g_state == ProfilerState::kCPU) {
// Generate some dummy events first to reduce the startup overhead.

@@ -12,7 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
@@ -31,6 +38,21 @@ static void ForEachDevice(std::function<void(int)> func) {
}
void DummyKernelAndEvent() {
#ifdef PADDLE_WITH_HIP
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
platform::SetDeviceId(d);
hipStream_t stream;
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int)));
hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr);
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr));
});
}
#else
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
platform::SetDeviceId(d);
@@ -44,6 +66,7 @@ void DummyKernelAndEvent() {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
});
}
#endif
}
}  // namespace platform

@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/gpu_info.h"
#endif
namespace paddle {
@@ -220,7 +220,7 @@ std::string OpName(const framework::VariableNameMap& name_map,
const std::string& type_name);
void SetTracerOption(TracerOption option);
platform::TracerOption GetTracerOption();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void DummyKernelAndEvent();
#endif

@@ -31,6 +31,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
namespace paddle {
namespace platform {
@@ -122,6 +125,13 @@ void SynchronizeAllDevice() {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
}
#endif
#ifdef PADDLE_WITH_HIP
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize());
}
#endif
}
// Print results
@@ -300,7 +310,7 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
if (rit != pushed_events->rend()) {
double event_time = 0;
double gpu_time = 0.0f;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpu_time = rit->CudaElapsedMs(analyze_event);
#endif
double cpu_time = rit->CpuElapsedMs(analyze_event);

@@ -122,7 +122,7 @@ TEST(RecordEvent, RecordEvent) {
if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop");
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
#else
EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
@@ -146,3 +146,13 @@ TEST(TMP, stream_wait) {
cudaStreamSynchronize(stream);
}
#endif
#ifdef PADDLE_WITH_HIP
TEST(TMP, stream_wait) {
hipStream_t stream;
hipStreamCreate(&stream);
hipStreamSynchronize(stream);
hipStreamSynchronize(stream);
hipStreamSynchronize(stream);
}
#endif

@@ -18,7 +18,10 @@
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_HIP
static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status,
void *user_data)
#elif CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data)
#else
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
@@ -30,7 +33,7 @@ static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
(*func)();
}
StreamCallbackManager::StreamCallbackManager(const gpuStream_t stream)
: stream_(stream), thread_pool_(1) {}
void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
@@ -42,7 +45,10 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
(*callback_func)();
});
});
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#elif CUDA_VERSION >= 10000
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
#else
@@ -52,7 +58,11 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
}
void StreamCallbackManager::Wait() const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
#endif
{
std::lock_guard<std::mutex> lock(mtx_);
if (last_future_.valid()) {

@@ -15,8 +15,16 @@
#pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <functional>
#include <future>  // NOLINT
#include <memory>
@@ -31,7 +39,7 @@ namespace platform {
// Make StreamCallbackManager thread-safe
class StreamCallbackManager {
public:
explicit StreamCallbackManager(const gpuStream_t stream);
~StreamCallbackManager() = default;
@@ -40,7 +48,7 @@ class StreamCallbackManager {
void Wait() const;
private:
const gpuStream_t stream_;
mutable ::ThreadPool thread_pool_;
mutable std::mutex mtx_;
mutable std::future<void> last_future_;
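With the constructor and the stream_ member switched to gpuStream_t, the same manager code now serves both backends. A brief, hypothetical usage sketch; the function RunWithCallback and its setup are not part of this commit.

#include "paddle/fluid/platform/stream_callback_manager.h"

namespace paddle {
namespace platform {

// Illustrative only; assumes a valid stream created by the surrounding code
// and a build with either PADDLE_WITH_CUDA or PADDLE_WITH_HIP enabled.
void RunWithCallback(gpuStream_t stream) {
  StreamCallbackManager manager(stream);
  // The callback is enqueued behind all work already submitted to the stream
  // and runs on the manager's host-side thread pool.
  manager.AddCallback([] { /* host-side post-processing */ });
  manager.Wait();  // synchronize the stream and drain pending callbacks
}

}  // namespace platform
}  // namespace paddle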

@@ -40,24 +40,36 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total,
DEVICE_ID);
ASSERT_EQ(total, limit);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
}
{
CUDADeviceGuard guard(DEVICE_ID);
GpuMemoryUsage(&avail, &total);
ASSERT_EQ(total, limit);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
}
gpuError_t err = gpuSuccess;
void *p1 = nullptr;
size_t size1 = limit / 4 * 3;
{
err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID);
ASSERT_EQ(err, gpuSuccess);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
ASSERT_NE(p1, nullptr);
ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@@ -67,8 +79,13 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
size_t size2 = limit / 2;
{
err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(err, hipErrorOutOfMemory);
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(err, cudaErrorMemoryAllocation);
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
ASSERT_EQ(p2, nullptr);
ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@@ -81,8 +98,12 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
{
err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
ASSERT_EQ(err, gpuSuccess);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), hipSuccess);
#else
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
#endif
ASSERT_NE(p2, nullptr);
ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2);
}

@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/place.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
@@ -76,7 +76,7 @@ struct Transform<platform::CPUDeviceContext> {
}
};
#if defined(__NVCC__) || defined(__HIPCC__)
template <>
struct Transform<platform::CUDADeviceContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
@@ -86,10 +86,17 @@ struct Transform<platform::CUDADeviceContext> {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
#endif
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
@@ -101,11 +108,19 @@ struct Transform<platform::CUDADeviceContext> {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
#endif
}
};
#endif

@@ -32,7 +32,7 @@ limitations under the License. */
// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
// function symbols. For details,
// https://github.com/PaddlePaddle/Paddle/issues/3386
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
#endif
