[ROCM] update fluid platform for rocm39 (part2), test=develop (#30774)

Qi Li 4 years ago committed by GitHub
parent 5ded39f226
commit 34f1628ce8

File diff suppressed because it is too large.

@ -22,30 +22,109 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#define ARITHMETIC_KERNEL(op_type, sign) \
__global__ void op_type(const half* in1, const half* in2, half* out) { \
__global__ void op_type(const half *in1, const half *in2, half *out) { \
out[0] = in1[0] sign in2[0]; \
}
#define COMPOUND_KERNEL(op_type, sign) \
__global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
__global__ void op_type(half *in1, const half *in2) { in1[0] sign in2[0]; }
#define COMPARISON_KERNEL(op_type, sign) \
__global__ void op_type(const half* in1, const half* in2, bool* out) { \
__global__ void op_type(const half *in1, const half *in2, bool *out) { \
out[0] = in1[0] sign in2[0]; \
}
#ifdef PADDLE_WITH_HIP
#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \
hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
hipMalloc(reinterpret_cast<void **>(&d_out), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
hipMemcpy(out, d_out, size, hipMemcpyDeviceToHost); \
EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \
free(in2); \
free(out); \
hipFree(d_in1); \
hipFree(d_in2); \
hipFree(d_out); \
}
#define COMPOUND_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2; \
half *d_in1, *d_in2; \
int size = sizeof(half); \
hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2); \
hipMemcpy(in1, d_in1, size, hipMemcpyDeviceToHost); \
EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \
free(in2); \
hipFree(d_in1); \
hipFree(d_in2); \
}
#define COMPARISON_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, bool v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2; \
half *d_in1, *d_in2; \
bool *out, *d_out; \
int size = sizeof(half); \
hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
hipMalloc(reinterpret_cast<void **>(&d_out), 1); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<bool *>(malloc(1)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
hipMemcpy(out, d_out, 1, hipMemcpyDeviceToHost); \
EXPECT_EQ(out[0], v_out); \
free(in1); \
free(in2); \
free(out); \
hipFree(d_in1); \
hipFree(d_in2); \
hipFree(d_out); \
}
#else
#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
void Test##op_type(float v_in1, float v_in2, float v_out) { \
LOG(INFO) << "Test " << #op_type << " on GPU!"; \
half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \
cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = reinterpret_cast<half*>(malloc(size)); \
out = reinterpret_cast<half*>(malloc(size)); \
cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
cudaMalloc(reinterpret_cast<void **>(&d_out), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@ -67,10 +146,10 @@ limitations under the License. */
half *in1, *in2; \
half *d_in1, *d_in2; \
int size = sizeof(half); \
cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = reinterpret_cast<half*>(malloc(size)); \
cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@ -91,12 +170,12 @@ limitations under the License. */
half *d_in1, *d_in2; \
bool *out, *d_out; \
int size = sizeof(half); \
cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = reinterpret_cast<half*>(malloc(size)); \
out = reinterpret_cast<bool*>(malloc(1)); \
cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
cudaMalloc(reinterpret_cast<void **>(&d_out), 1); \
in1 = reinterpret_cast<half *>(malloc(size)); \
in2 = reinterpret_cast<half *>(malloc(size)); \
out = reinterpret_cast<bool *>(malloc(1)); \
in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@ -111,12 +190,14 @@ limitations under the License. */
cudaFree(d_in2); \
cudaFree(d_out); \
}
#endif
#ifdef PADDLE_CUDA_FP16
namespace paddle {
namespace platform {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || \
(defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000)
ARITHMETIC_KERNEL(Add, +)
ARITHMETIC_KERNEL(Sub, -)
ARITHMETIC_KERNEL(Mul, *)
@ -128,21 +209,37 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
ARITHMETIC_KERNEL_LAUNCH(Div)
// Negative sign kernel
__global__ void Neg(half* in) { in[0] = -in[0]; }
__global__ void Neg(half *in) { in[0] = -in[0]; }
void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in;
int size = sizeof(half);
cudaMalloc(reinterpret_cast<void**>(&d_in), size);
in = reinterpret_cast<half*>(malloc(size));
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void **>(&d_in), size);
#else
cudaMalloc(reinterpret_cast<void **>(&d_in), size);
#endif
in = reinterpret_cast<half *>(malloc(size));
in[0] = half(float16(v_in));
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
#else
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
#endif
Neg<<<1, 1>>>(d_in);
#ifdef PADDLE_WITH_HIP
hipMemcpy(in, d_in, size, hipMemcpyDeviceToHost);
#else
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
#endif
EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in);
#ifdef PADDLE_WITH_HIP
hipFree(d_in);
#else
cudaFree(d_in);
#endif
}
COMPOUND_KERNEL(AddAssign, +=)
@ -221,7 +318,7 @@ TEST(float16, lod_tensor_on_gpu) {
framework::LoDTensor gpu_tensor;
framework::LoDTensor dst_tensor;
float16* src_ptr = src_tensor.mutable_data<float16>(
float16 *src_ptr = src_tensor.mutable_data<float16>(
framework::make_ddim({2, 2}), CPUPlace());
float16 arr[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
@ -238,7 +335,7 @@ TEST(float16, lod_tensor_on_gpu) {
// Sync before comparing LoDTensors
gpu_ctx.Wait();
const float16* dst_ptr = dst_tensor.data<float16>();
const float16 *dst_ptr = dst_tensor.data<float16>();
ASSERT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 4; ++i) {
EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
@ -247,7 +344,7 @@ TEST(float16, lod_tensor_on_gpu) {
template <typename T>
struct Functor {
bool operator()(const T& val) {
bool operator()(const T &val) {
return std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16));
}
@ -304,13 +401,13 @@ TEST(float16, cast) {
auto b = a;
{
// change semantic, keep the same value
float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
float16 c = reinterpret_cast<float16 &>(reinterpret_cast<unsigned &>(b));
EXPECT_EQ(b, c);
}
{
// use the low 16 bits of a uint32 to store the float16
uint32_t c = reinterpret_cast<uint32_t&>(b);
uint32_t c = reinterpret_cast<uint32_t &>(b);
float16 d;
d.x = c;
EXPECT_EQ(b, d);
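
A note on the launch-syntax change running through the float16 test hunks above: the CUDA triple-chevron launch is replaced by the hipLaunchKernelGGL macro under ROCm. Below is a minimal illustrative sketch of the two launch forms; the Scale kernel and LaunchScale wrapper are hypothetical and not part of this diff.

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Hypothetical one-element kernel, used only to show the two launch forms.
__global__ void Scale(float *x) { x[0] *= 2.0f; }

void LaunchScale(float *d_x) {
#ifdef PADDLE_WITH_HIP
  // HIP macro form: kernel, grid dim, block dim, shared-memory bytes, stream, args.
  hipLaunchKernelGGL(Scale, dim3(1), dim3(1), 0, 0, d_x);
#else
  // CUDA triple-chevron form: <<<grid, block, shared-memory bytes, stream>>>.
  Scale<<<1, 1, 0, 0>>>(d_x);
#endif
}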

@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h>
@ -336,7 +337,7 @@ void RecvBroadCastCommID(int server_fd, std::string endpoint,
template void RecvBroadCastCommID<Type>(std::string endpoint, \
std::vector<Type> * nccl_ids);
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
INSTANT_TEMPLATE(ncclUniqueId)
#endif
#ifdef PADDLE_WITH_XPU_BKCL

@ -14,7 +14,8 @@ limitations under the License. */
#pragma once
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
#include <functional>
#include <string>
#include <vector>

File diff suppressed because it is too large.

@ -15,11 +15,19 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Note: this header is used to simplify HIP and CUDA type strings
#include <stddef.h>
#include <string>
#include <vector>
#include "paddle/fluid/platform/type_defs.h"
namespace paddle {
namespace platform {
@ -86,28 +94,36 @@ size_t GpuMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
#ifdef PADDLE_WITH_HIP
enum hipMemcpyKind kind, hipStream_t stream);
#else
enum cudaMemcpyKind kind, cudaStream_t stream);
#endif
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
#ifdef PADDLE_WITH_HIP
enum hipMemcpyKind kind);
#else
enum cudaMemcpyKind kind);
#endif
//! Copy memory from one device to another device asynchronously.
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, cudaStream_t stream);
int src_device, size_t count, gpuStream_t stream);
//! Copy memory from one device to another device synchronously.
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count);
//! Asynchronously set count bytes of memory at dst to value.
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);
//! Blocks until stream has completed all operations.
void GpuStreamSync(cudaStream_t stream);
void GpuStreamSync(gpuStream_t stream);
//! CudaMalloc with recorded info
cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
//! CudaFree with recorded info
void RecordedCudaFree(void *p, size_t size, int dev_id);
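
The gpuStream_t, gpuError_t and gpuSuccess names introduced in this header come from paddle/fluid/platform/type_defs.h, which is included above but not shown in this diff. As a hedged sketch, the aliases are assumed to look roughly like the following; the exact contents of that header may differ.

// Illustrative sketch only: backend-neutral aliases assumed to be provided by
// paddle/fluid/platform/type_defs.h, selected by the active GPU backend.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
namespace paddle {
using gpuStream_t = hipStream_t;  // stream handle
using gpuError_t = hipError_t;    // runtime error code
}  // namespace paddle
#define gpuSuccess hipSuccess
#else
#include <cuda_runtime.h>
namespace paddle {
using gpuStream_t = cudaStream_t;  // stream handle
using gpuError_t = cudaError_t;    // runtime error code
}  // namespace paddle
#define gpuSuccess cudaSuccess
#endif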

@ -16,9 +16,13 @@
#pragma once
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif
#include <stddef.h>
#include <algorithm>
#include <string>

@ -14,7 +14,7 @@
#pragma once
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include <stdio.h>
#include <memory>
#include <string>
@ -25,7 +25,12 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/collective_helper.h"
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
@ -81,7 +86,7 @@ struct NCCLContext {
explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
cudaStream_t stream() const { return ctx_->stream(); }
gpuStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
int device_id() const {

@ -154,7 +154,7 @@ struct PlaceVisitorWrapper
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return visitor_(cuda);
#else
PADDLE_THROW(platform::errors::Unavailable(
@ -165,7 +165,7 @@ struct PlaceVisitorWrapper
typename Visitor::result_type operator()(
const CUDAPinnedPlace &cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return visitor_(cuda_pinned);
#else
PADDLE_THROW(platform::errors::Unavailable(

@ -206,7 +206,7 @@ void EnableProfiler(ProfilerState state) {
g_state = state;
should_send_profile_state = true;
GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
g_state == ProfilerState::kCPU) {
// Generate some dummy events first to reduce the startup overhead.

@ -12,7 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
@ -31,6 +38,21 @@ static void ForEachDevice(std::function<void(int)> func) {
}
void DummyKernelAndEvent() {
#ifdef PADDLE_WITH_HIP
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
platform::SetDeviceId(d);
hipStream_t stream;
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int)));
hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr);
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr));
});
}
#else
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
platform::SetDeviceId(d);
@ -44,6 +66,7 @@ void DummyKernelAndEvent() {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
});
}
#endif
}
} // namespace platform

@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/gpu_info.h"
#endif
namespace paddle {
@ -220,7 +220,7 @@ std::string OpName(const framework::VariableNameMap& name_map,
const std::string& type_name);
void SetTracerOption(TracerOption option);
platform::TracerOption GetTracerOption();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void DummyKernelAndEvent();
#endif

@ -31,6 +31,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
namespace paddle {
namespace platform {
@ -122,6 +125,13 @@ void SynchronizeAllDevice() {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
}
#endif
#ifdef PADDLE_WITH_HIP
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize());
}
#endif
}
// Print results
@ -300,7 +310,7 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
if (rit != pushed_events->rend()) {
double event_time = 0;
double gpu_time = 0.0f;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpu_time = rit->CudaElapsedMs(analyze_event);
#endif
double cpu_time = rit->CpuElapsedMs(analyze_event);

@ -122,7 +122,7 @@ TEST(RecordEvent, RecordEvent) {
if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
#else
EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
@ -146,3 +146,13 @@ TEST(TMP, stream_wait) {
cudaStreamSynchronize(stream);
}
#endif
#ifdef PADDLE_WITH_HIP
TEST(TMP, stream_wait) {
hipStream_t stream;
hipStreamCreate(&stream);
hipStreamSynchronize(stream);
hipStreamSynchronize(stream);
hipStreamSynchronize(stream);
}
#endif

@ -18,7 +18,10 @@
namespace paddle {
namespace platform {
#if CUDA_VERSION >= 10000
#ifdef PADDLE_WITH_HIP
static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status,
void *user_data)
#elif CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data)
#else
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
@ -30,7 +33,7 @@ static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
(*func)();
}
StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
StreamCallbackManager::StreamCallbackManager(const gpuStream_t stream)
: stream_(stream), thread_pool_(1) {}
void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
@ -42,7 +45,10 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
(*callback_func)();
});
});
#if CUDA_VERSION >= 10000
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#elif CUDA_VERSION >= 10000
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
#else
@ -52,7 +58,11 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
}
void StreamCallbackManager::Wait() const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
#endif
{
std::lock_guard<std::mutex> lock(mtx_);
if (last_future_.valid()) {

@ -15,8 +15,16 @@
#pragma once
#include <ThreadPool.h>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <functional>
#include <future> // NOLINT
#include <memory>
@ -31,7 +39,7 @@ namespace platform {
// Make StreamCallbackManager thread-safe
class StreamCallbackManager {
public:
explicit StreamCallbackManager(const cudaStream_t stream);
explicit StreamCallbackManager(const gpuStream_t stream);
~StreamCallbackManager() = default;
@ -40,7 +48,7 @@ class StreamCallbackManager {
void Wait() const;
private:
const cudaStream_t stream_;
const gpuStream_t stream_;
mutable ::ThreadPool thread_pool_;
mutable std::mutex mtx_;
mutable std::future<void> last_future_;
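
The StreamCallbackManager changes above dispatch between three host-callback mechanisms: hipStreamAddCallback on ROCm, cudaLaunchHostFunc on CUDA 10.0+, and cudaStreamAddCallback on older CUDA. A minimal, hedged sketch of those three registration paths outside of Paddle follows; Register and the printing callback are hypothetical.

#include <cstdio>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
// ROCm: the callback receives the stream and status along with user data.
static void HostCallback(hipStream_t /*stream*/, hipError_t /*status*/,
                         void *msg) {
  std::printf("%s\n", static_cast<const char *>(msg));
}
void Register(hipStream_t stream, void *msg) {
  hipStreamAddCallback(stream, HostCallback, msg, 0);
}
#else
#include <cuda.h>
#include <cuda_runtime.h>
#if CUDA_VERSION >= 10000
// CUDA 10.0+: cudaLaunchHostFunc takes a plain void(void*) host function.
static void CUDART_CB HostCallback(void *msg) {
  std::printf("%s\n", static_cast<const char *>(msg));
}
void Register(cudaStream_t stream, void *msg) {
  cudaLaunchHostFunc(stream, HostCallback, msg);
}
#else
// Older CUDA: cudaStreamAddCallback passes the stream and status through.
static void CUDART_CB HostCallback(cudaStream_t /*stream*/,
                                   cudaError_t /*status*/, void *msg) {
  std::printf("%s\n", static_cast<const char *>(msg));
}
void Register(cudaStream_t stream, void *msg) {
  cudaStreamAddCallback(stream, HostCallback, msg, 0);
}
#endif
#endif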

@ -40,24 +40,36 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total,
DEVICE_ID);
ASSERT_EQ(total, limit);
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
}
{
CUDADeviceGuard guard(DEVICE_ID);
GpuMemoryUsage(&avail, &total);
ASSERT_EQ(total, limit);
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
}
cudaError_t err = cudaSuccess;
gpuError_t err = gpuSuccess;
void *p1 = nullptr;
size_t size1 = limit / 4 * 3;
{
err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID);
ASSERT_EQ(err, cudaSuccess);
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
ASSERT_EQ(err, gpuSuccess);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
ASSERT_NE(p1, nullptr);
ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@ -67,8 +79,13 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
size_t size2 = limit / 2;
{
err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(err, hipErrorOutOfMemory);
ASSERT_EQ(hipGetLastError(), gpuSuccess);
#else
ASSERT_EQ(err, cudaErrorMemoryAllocation);
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
ASSERT_EQ(cudaGetLastError(), gpuSuccess);
#endif
ASSERT_EQ(p2, nullptr);
ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@ -81,8 +98,12 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
{
err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
ASSERT_EQ(err, cudaSuccess);
ASSERT_EQ(err, gpuSuccess);
#ifdef PADDLE_WITH_HIP
ASSERT_EQ(hipGetLastError(), hipSuccess);
#else
ASSERT_EQ(cudaGetLastError(), cudaSuccess);
#endif
ASSERT_NE(p2, nullptr);
ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2);
}
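
The test updates above reflect that the two runtimes report allocation failure with different codes (hipErrorOutOfMemory vs. cudaErrorMemoryAllocation) and that the test expects no sticky error to remain afterwards. A hedged sketch of that pattern in isolation; TryAlloc is a hypothetical helper, not Paddle's RecordedCudaMalloc.

#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
// Hypothetical helper: returns true on success; on an out-of-memory failure
// it clears the sticky error so a later hipGetLastError() sees hipSuccess.
bool TryAlloc(void **ptr, size_t size) {
  hipError_t err = hipMalloc(ptr, size);
  if (err == hipErrorOutOfMemory) {
    (void)hipGetLastError();  // reset the error state
    *ptr = nullptr;
    return false;
  }
  return err == hipSuccess;
}
#else
#include <cuda_runtime.h>
// Hypothetical helper: same contract for the CUDA runtime, which reports
// allocation failure as cudaErrorMemoryAllocation.
bool TryAlloc(void **ptr, size_t size) {
  cudaError_t err = cudaMalloc(ptr, size);
  if (err == cudaErrorMemoryAllocation) {
    (void)cudaGetLastError();  // reset the error state
    *ptr = nullptr;
    return false;
  }
  return err == cudaSuccess;
}
#endif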

@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/place.h"
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
@ -76,7 +76,7 @@ struct Transform<platform::CPUDeviceContext> {
}
};
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
template <>
struct Transform<platform::CUDADeviceContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
@ -86,10 +86,17 @@ struct Transform<platform::CUDADeviceContext> {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first),
details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
#endif
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
@ -101,11 +108,19 @@ struct Transform<platform::CUDADeviceContext> {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"The CUDA Transform must be used in GPU place."));
#ifdef __HIPCC__
thrust::transform(thrust::hip::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
#else
thrust::transform(thrust::cuda::par.on(context.stream()),
details::CastToCUDATransformIterator(first1),
details::CastToCUDATransformIterator(last1),
details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
#endif
}
};
#endif
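
The Transform specialization above picks the Thrust execution policy matching the compiler in use: thrust::hip::par from rocThrust under __HIPCC__, thrust::cuda::par otherwise. A minimal standalone sketch of the same dispatch, with a hypothetical TransformOnStream helper:

#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>

#ifdef __HIPCC__
#include <hip/hip_runtime.h>
using stream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using stream_t = cudaStream_t;
#endif

// Hypothetical helper: applies op element-wise to n device elements on the
// given stream, using the backend-appropriate Thrust execution policy.
template <typename T, typename Op>
void TransformOnStream(stream_t stream, const T *in, T *out, size_t n, Op op) {
  thrust::device_ptr<const T> first(in), last(in + n);
  thrust::device_ptr<T> result(out);
#ifdef __HIPCC__
  thrust::transform(thrust::hip::par.on(stream), first, last, result, op);
#else
  thrust::transform(thrust::cuda::par.on(stream), first, last, result, op);
#endif
}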

@ -32,7 +32,7 @@ limitations under the License. */
// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
// function symbols. For details,
// https://github.com/PaddlePaddle/Paddle/issues/3386
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
#endif
