GPUPlace to CUDAPlace (#6960)

del_some_in_makelist
dzhwinter 7 years ago committed by QI JUN
parent 87aae57cf0
commit 0d2235aadf

@ -54,7 +54,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
#ifdef PADDLE_WITH_CUDA
auto pos = string::RFind(p, ':', string::Piece::npos);
auto number = device.substr(pos + 1);
places.emplace_back(platform::GPUPlace(std::stoi(number)));
places.emplace_back(platform::CUDAPlace(std::stoi(number)));
#else
LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option";

@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();

@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) {
TEST(LoDTensor, LoDInGPU) {
paddle::framework::LoDTensor lod_tensor;
paddle::platform::GPUPlace place(0);
paddle::platform::CUDAPlace place(0);
paddle::framework::LoD src_lod;
src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});

@ -37,13 +37,13 @@ TEST(OpKernelType, Hash) {
using OpKernelType = paddle::framework::OpKernelType;
using DataType = paddle::framework::proto::DataType;
using CPUPlace = paddle::platform::CPUPlace;
using GPUPlace = paddle::platform::GPUPlace;
using CUDAPlace = paddle::platform::CUDAPlace;
using DataLayout = paddle::framework::DataLayout;
using LibraryType = paddle::framework::LibraryType;
OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW,
OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType::Hash hasher;

@ -188,7 +188,7 @@ class OpKernelRegistrar : public Registrar {
}
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)

@ -71,7 +71,7 @@ private:
```
```c++
typedef boost::variant<GpuPlace, CpuPlace> Place;
typedef boost::variant<CUDAPlace, CpuPlace> Place;
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
typedef boost::variant<

@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size, type));
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
}
#endif
offset_ = 0;

@ -80,20 +80,20 @@ TEST(Tensor, MutableData) {
float* p1 = nullptr;
float* p2 = nullptr;
// initialization
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size
// momery is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2);
// set src_tensor a new dim with same size
// momery block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size
// momery block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
EXPECT_EQ(p1, p2);
}
#endif
@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) {
{
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
@ -166,7 +166,7 @@ TEST(Tensor, Slice) {
#ifdef PADDLE_WITH_CUDA
{
Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
@ -176,11 +176,11 @@ TEST(Tensor, Slice) {
uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);

@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@ -108,7 +108,7 @@ inline void CopyFromVector(const std::vector<T>& src,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
@ -141,7 +141,7 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}

@ -58,7 +58,7 @@ TEST(CopyFrom, Tensor) {
memcpy(src_ptr, arr, 9 * sizeof(int));
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::GPUPlace(0);
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) {
// Copy to GPUTensor
gpu_tensor.Resize(make_ddim({3, 3}));
auto gpu_place = new paddle::platform::GPUPlace();
auto gpu_place = new paddle::platform::CUDAPlace();
CUDADeviceContext gpu_ctx(*gpu_place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
@ -210,7 +210,7 @@ TEST(CopyToVector, Tensor) {
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
Tensor gpu_tensor;
GPUPlace place;
CUDAPlace place;
CUDADeviceContext gpu_ctx(place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);

@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
To allocate 4KB memory on the 3rd GPU:
```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
```
To free memory and check the so-far used amount of memory on a place:
```cpp
auto pl = platform::GPUPlace(0);
auto pl = platform::CUDAPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place);
} // namespace memory
```
These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
```cpp
template<>
@ -49,7 +49,7 @@ and
```cpp
template<>
void Alloc<GPUPlace>(GPUPlace p, size_t size) {
void Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```
@ -122,7 +122,7 @@ There are two implementations of `Context`:
1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
### Majel

@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);

@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
}
template <>
size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
return ptr;
}
template <>
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}

@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
#ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::GPUPlace place) {
size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment;
@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) {
EXPECT_EQ(p, nullptr);
paddle::platform::GPUPlace gpu(0);
paddle::platform::CUDAPlace gpu(0);
p = paddle::memory::Alloc(gpu, 4096);
EXPECT_NE(p, nullptr);
@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) {
}
TEST(BuddyAllocator, GPUMultAlloc) {
paddle::platform::GPUPlace gpu;
paddle::platform::CUDAPlace gpu;
std::unordered_map<void *, size_t> ps;

@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto* inference = ctx.Input<Tensor>("Out");
auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label");

@ -53,7 +53,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
@ -179,7 +179,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =

@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward ---------------------
T alpha = 1.0f, beta = 0.0f;
@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
T alpha = 1.0f, beta = 0.0f;

@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward ---------------------
@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.

@ -35,7 +35,7 @@ struct StridedMemcpyFunctor<T, 1> {
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
} else {
#ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::GPUPlace>(place);
auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,

@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
// operators runs on GPU device.
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
Tensor* dst) {
dst->mutable_data<T>(platform::GPUPlace());
framework::CopyFrom(src, platform::GPUPlace(), ctx, dst);
dst->mutable_data<T>(platform::CUDAPlace());
framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst);
};
copyTensor(ctx, emission_exps_src, emission_exps_dst);
copyTensor(ctx, transition_exps_src, transition_exps_dst);
@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
Tensor* dst) {
if (src && dst) {
dst->mutable_data<T>(platform::GPUPlace());
framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst);
dst->mutable_data<T>(platform::CUDAPlace());
framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst);
}
};
copyTensor(ctx, emission_grad_src, emission_grad_dst);

@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows;
new_rows.resize(ids_dim[0]);
auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace());
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
ids_dim[0] * sizeof(int64_t), stream);

@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto* x_tensor = ctx.Input<framework::Tensor>("X");
auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
"It must use CUDAPlace.");
auto x_tensor = ctx.Input<Tensor>("X");
auto c_prev_tensor = ctx.Input<Tensor>("C_prev");

@ -159,6 +159,7 @@ void testIm2col() {
TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
testIm2col<paddle::platform::CUDADeviceContext, paddle::platform::GPUPlace>();
testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
#endif
}

@ -105,7 +105,7 @@ void matmul<platform::CUDADeviceContext, float>(
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in GPUPlace");
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
@ -134,7 +134,7 @@ void matmul<platform::CUDADeviceContext, double>(
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in GPUPlace");
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
@ -266,7 +266,7 @@ struct TensorSetConstantGPU {
};
template <>
void set_constant_with_place<platform::GPUPlace>(
void set_constant_with_place<platform::CUDAPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()),
@ -277,7 +277,7 @@ template <>
void set_constant_with_place<platform::CUDNNPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
set_constant_with_place<platform::GPUPlace>(context, tensor, value);
set_constant_with_place<platform::CUDAPlace>(context, tensor, value);
}
template struct RowwiseAdd<platform::CUDADeviceContext, float>;

@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) {
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) {
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) {
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
auto* gpu_place = new paddle::platform::GPUPlace(0);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::framework::Tensor g_mat_a;
paddle::framework::Tensor g_vec_b;
paddle::framework::Tensor g_vec_c;

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save