You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/paddle/operators/conv_transpose_cudnn_op.cu.cc

246 lines
11 KiB

7 years ago
/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
7 years ago
#include "paddle/operators/conv_transpose_op.h"
7 years ago
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
7 years ago
static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;
7 years ago
template <typename T>
class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
7 years ago
// cudnn v5 does not support dilations
7 years ago
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
7 years ago
DataLayout layout;
if (strides.size() == 2U) {
layout = DataLayout::kNCHW;
} else {
layout = DataLayout::kNCDHW;
}
7 years ago
7 years ago
// (N, M, H, W) or (N, M, D, H, W)
7 years ago
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
7 years ago
// (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
7 years ago
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
7 years ago
// (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
7 years ago
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
7 years ago
size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
7 years ago
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
// ------------------- cudnn conv algorithm ---------------------
7 years ago
cudnnConvolutionBwdDataAlgo_t algo;
7 years ago
auto handle = ctx.cuda_device_context().cudnn_handle();
// Get the algorithm
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
// get workspace size able to allocate
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
7 years ago
cudnn_output_desc, algo, &workspace_size_in_bytes));
7 years ago
// Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward ---------------------
T alpha = 1.0f, beta = 0.0f;
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
input_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
template <typename T>
class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const T* input_data = input->data<T>();
const T* output_grad_data = output_grad->data<T>();
const T* filter_data = filter->data<T>();
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
7 years ago
// cudnn v5 does not support dilations
7 years ago
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
7 years ago
ScopedTensorDescriptor output_desc;
7 years ago
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
7 years ago
// Input: (N, M, H, W) or (N, M, D, H, W)
7 years ago
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
7 years ago
layout, framework::vectorize2int(input->dims()));
7 years ago
// Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
7 years ago
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output_grad->dims()));
7 years ago
// Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
7 years ago
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
7 years ago
layout, framework::vectorize2int(filter->dims()));
7 years ago
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn backward algorithm ---------------------
7 years ago
cudnnConvolutionFwdAlgo_t data_algo;
7 years ago
cudnnConvolutionBwdFilterAlgo_t filter_algo;
7 years ago
size_t bwd_filter_ws_size, fwd_ws_size;
size_t workspace_size_in_bytes = 0;
7 years ago
size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
7 years ago
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
auto handle = ctx.cuda_device_context().cudnn_handle();
if (input_grad) {
7 years ago
// choose backward algorithm for data
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_input_desc, data_algo, &fwd_ws_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
7 years ago
}
if (filter_grad) {
7 years ago
// choose backward algorithm for filter
7 years ago
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
7 years ago
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
7 years ago
cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
7 years ago
// get workspace for backwards filter algorithm
7 years ago
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
7 years ago
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
workspace_size_in_bytes =
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
7 years ago
}
7 years ago
7 years ago
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
7 years ago
// Because beta is zero, it is unnecessary to reset input_grad.
7 years ago
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, output_grad_data,
cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data));
7 years ago
}
7 years ago
7 years ago
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
7 years ago
// Because beta is zero, it is unnecessary to reset filter_grad.
7 years ago
// Gradient with respect to the filter
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
7 years ago
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
7 years ago
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
7 years ago
ops::CudnnConvTransposeOpKernel<float>);
7 years ago
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
7 years ago
ops::CudnnConvTransposeGradOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>);