// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda.h>
#include <cuda_runtime.h>

#include <memory>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace memory {

const int NUM_STREAMS = 8;
const int N = 2;
const float DELTA = 1e-1;

using CudaDevCtxVec = std::vector<std::unique_ptr<platform::CUDADeviceContext>>;

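// Writes 3.14159 * i to x[i] for every i in [0, n) using a grid-stride loop.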
__global__ void kernel(float *x, int n) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
    x[i] = 3.14159 * i;
  }
}

void CheckKernelOutput(float *x, int n) {
  auto host_x = std::unique_ptr<float[]>(new float[n]);
  // Copy the device buffer back once, then check every element against the
  // value written by the kernel, within DELTA.
  EXPECT_TRUE(cudaSuccess == cudaMemcpy(host_x.get(), x, n * sizeof(float),
                                        cudaMemcpyDeviceToHost));
  for (int i = 0; i < n; ++i) {
    EXPECT_GE(host_x[i] + DELTA, 3.14159f * i);
    EXPECT_LE(host_x[i] - DELTA, 3.14159f * i);
  }
}

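// Allocates two buffers from ctx and launches a kernel on each, returning the
// device pointers through data and second_data so the caller can verify them.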
void MultiStreamCompute(float **data, float **second_data,
                        const platform::CUDADeviceContext &ctx) {
  // allocate and compute on this context's stream
  AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
  *data = reinterpret_cast<float *>(allocation_ptr->ptr());
  kernel<<<1, 64, 0, ctx.stream()>>>(*data, N);

  // allocate and compute on the same stream again; reassigning allocation_ptr
  // releases the first allocation while its kernel may still be in flight
  allocation_ptr = Alloc(ctx, N * sizeof(float));
  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
  *second_data = reinterpret_cast<float *>(allocation_ptr->ptr());
  kernel<<<1, 64, 0, ctx.stream()>>>(*second_data, N);
}

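// Runs a kernel on the default stream, then drives MultiStreamCompute on
// NUM_STREAMS independent device contexts and verifies every buffer after a
// full device synchronize.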
TEST(Malloc, CUDADeviceContextMultiStream) {
  auto place = platform::CUDAPlace(0);
  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));

  AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
  EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
  float *main_stream_data =
      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());

  float *data[NUM_STREAMS];
  float *second_data[NUM_STREAMS];
  CudaDevCtxVec dev_ctx;

  // launch on the default stream, then release the allocation
  kernel<<<1, 64>>>(main_stream_data, N);
  main_stream_alloc_ptr.reset();

  for (int i = 0; i < NUM_STREAMS; ++i) {
    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
        new platform::CUDADeviceContext(place)));
    MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
  }

  EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
  for (int i = 0; i < NUM_STREAMS; ++i) {
    CheckKernelOutput(data[i], N);
    CheckKernelOutput(second_data[i], N);
  }
}

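// Same as CUDADeviceContextMultiStream, but each MultiStreamCompute call runs
// in its own std::thread; buffers are verified after joining all threads and
// synchronizing the device.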
TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
  auto place = platform::CUDAPlace(0);
  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));

  AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
  EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
  float *main_stream_data =
      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());

  float *data[NUM_STREAMS];
  float *second_data[NUM_STREAMS];
  CudaDevCtxVec dev_ctx;
  std::vector<std::thread> threads;

  // launch on the default stream, then release the allocation
  kernel<<<1, 64>>>(main_stream_data, N);
  main_stream_alloc_ptr.reset();

  for (int i = 0; i < NUM_STREAMS; ++i) {
    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
        new platform::CUDADeviceContext(place)));
    threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
                                  std::cref(*dev_ctx[i])));
  }

  for (int i = 0; i < NUM_STREAMS; ++i) {
    threads[i].join();
  }

  EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
  for (int i = 0; i < NUM_STREAMS; ++i) {
    CheckKernelOutput(data[i], N);
    CheckKernelOutput(second_data[i], N);
  }
}

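// A zero-byte request should still yield a usable AllocationPtr.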
TEST(Malloc, AllocZero) {
  auto place = platform::CUDAPlace(0);
  AllocationPtr allocation_ptr = Alloc(place, 0);
  EXPECT_GE(allocation_ptr->size(), 0);
}

}  // namespace memory
}  // namespace paddle