@@ -46,7 +46,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* bias = ctx.Input<Tensor>("Bias");
    PADDLE_ENFORCE_NOT_NULL(bias, "The bias should not be null.");
    auto* residual = ctx.Input<Tensor>("ResidualData");
    auto* output = ctx.Output<Tensor>("Output");
    output->mutable_data<T>(ctx.GetPlace());
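    // The output buffer is allocated up front; cuDNN later writes results
    // directly into it through output_data.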
@@ -61,28 +60,25 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    bool exhaustive_search =
        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
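    // Exhaustive algorithm search is enabled either globally through
    // FLAGS_cudnn_exhaustive_search or per operator via the attribute.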
    // const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
    const T* bias_data = bias->data<T>();
    // T* output_data = output->mutable_data<T>(ctx.GetPlace());

    const std::string padding_algorithm =
        ctx.Attr<std::string>("padding_algorithm");
    const std::string data_format = ctx.Attr<std::string>("data_format");

    Tensor transformed_input_channel(input->type());
    Tensor transformed_output(output->type());
    T* output_data = nullptr;

    transformed_input_channel = *input;
    transformed_output = *output;
    output_data = transformed_output.data<T>();
    T* output_data = transformed_output.data<T>();

    const T* residual_data = residual ? residual->data<T>() : output_data;
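    // When no ResidualData is provided, the output buffer itself stands in
    // as the residual source.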
    // update padding and dilation
    auto in_dims = transformed_input_channel.dims();
    auto filter_dims = filter->dims();
    framework::DDim in_data_dims;
    in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
    framework::DDim in_data_dims =
        framework::slice_ddim(in_dims, 2, in_dims.size());

    framework::DDim filter_data_dims =
        framework::slice_ddim(filter_dims, 2, filter_dims.size());
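    // slice_ddim drops the batch and channel dimensions, keeping only the
    // spatial extents that the padding/dilation update works on.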
@@ -134,7 +130,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               &transformed_input);
        } break;
        default:
          PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions.");
          PADDLE_THROW(platform::errors::PermissionDenied(
              "Operator Conv2DFusion expects Input to be a 4-D or 5-D Tensor. "
              "But received the actual dimension = %d, shape = [%s].",
              rank, transformed_input_channel.dims()));
      }

    } else {
@@ -168,7 +167,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
        conv_desc.descriptor<T>(padding_common, strides, dilations);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
                                                         groups));
                                                         groups),
        platform::errors::External(
            "Call of cudnnSetConvolutionGroupCount(cudnn_conv_desc, groups) "
            "failed, where cudnn_conv_desc is configured: padding = [%s], "
            "strides = [%s], dilations = [%s]; groups = %d",
            framework::make_ddim(padding_common), framework::make_ddim(strides),
            framework::make_ddim(dilations), groups));
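    // With groups > 1, cuDNN runs the convolution as independent
    // per-group convolutions over channel slices.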
    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
        layout, framework::vectorize<int>(transformed_input.dims()));
@@ -199,8 +204,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    auto handle = dev_ctx.cudnn_handle();
    auto workspace_handle = dev_ctx.cudnn_workspace_handle();

    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc,
                                                       CUDNN_DEFAULT_MATH),
        platform::errors::External(
            "Call of cudnnSetConvolutionMathType(cudnn_conv_desc, "
            "CUDNN_DEFAULT_MATH) failed, where cudnn_conv_desc is configured: "
            "padding = [%s], strides = [%s], dilations = [%s].",
            framework::make_ddim(padding_common), framework::make_ddim(strides),
            framework::make_ddim(dilations)));
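    // CUDNN_DEFAULT_MATH keeps this convolution on the default math path
    // (Tensor Core math is not explicitly requested).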
    auto x_dims = framework::vectorize(transformed_input.dims());
    auto f_dims = framework::vectorize(filter->dims());
@@ -209,7 +221,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit, &algo));
              workspace_size_limit, &algo),
          platform::errors::External(
              "Call of cudnnGetConvolutionForwardAlgorithm failed."));
      VLOG(3) << "cuDNN forward algo " << algo;
    } else {
      std::function<cudnnConvolutionFwdAlgo_t()> search_func =
@@ -223,7 +237,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
                kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
                fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
                fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit),
            platform::errors::External(
                "Call of cudnnFindConvolutionForwardAlgorithmEx failed."));
      };
      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
      VLOG(3) << "Perf result: (algo: stat, time, memory)";
@@ -257,9 +273,16 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
            cudnn_output_desc, algo, &workspace_size_in_bytes));
    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                      "workspace_size to be allocated exceeds the limit");
            cudnn_output_desc, algo, &workspace_size_in_bytes),
        platform::errors::External(
            "Call of cudnnGetConvolutionForwardWorkspaceSize failed."));
    PADDLE_ENFORCE_LE(
        workspace_size_in_bytes, workspace_size_limit,
        platform::errors::InvalidArgument(
            "The actual workspace size to be allocated for cuDNN is expected "
            "to be less than the limit. But received: the actual workspace "
            "size = %d, limit = %d.",
            workspace_size_in_bytes, workspace_size_limit));

    if ((activation == "identity") && (!residual)) {
      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
@@ -269,15 +292,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
      // ------------- cudnn conv forward and bias add ---------------------
      ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
      auto cudnn_func = [&](void* cudnn_workspace) {
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnConvolutionForward(
                handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
                filter_data, cudnn_conv_desc, algo, cudnn_workspace,
                workspace_size_in_bytes, &beta, cudnn_output_desc, output_data),
            platform::errors::External(
                "Call of cudnnConvolutionForward failed."));
      };
      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
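      // The bias is added in a separate cudnnAddTensor call, since the plain
      // cudnnConvolutionForward path has no fused bias term.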
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
          handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
          output_data));
      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnAddTensor(handle, &alpha, cudnn_bias_desc,
                                            bias_data, &alpha,
                                            cudnn_output_desc, output_data),
          platform::errors::External("Call of cudnnAddTensor failed."));
    } else {
      if (activation == "identity") {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
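        // cudnnConvolutionBiasActivationForward only accepts this algo when
        // the activation is CUDNN_ACTIVATION_IDENTITY.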
@@ -292,7 +320,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
                cudnn_filter_desc, filter_data, cudnn_conv_desc, algo,
                cudnn_workspace, workspace_size_in_bytes, &alpha2,
                cudnn_output_desc, residual_data, cudnn_bias_desc, bias_data,
                cudnn_act_desc, cudnn_output_desc, output_data));
                cudnn_act_desc, cudnn_output_desc, output_data),
            platform::errors::External(
                "Call of cudnnConvolutionBiasActivationForward failed."));
      };
      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
    }
@@ -314,7 +344,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
        }
      } else {
        // TODO(qingiqng): do copy when batch size large than 1
        PADDLE_THROW("Batch size greater than 1 is Unsupported");
        PADDLE_THROW(platform::errors::Unimplemented(
            "Input with batch size greater than 1 is unsupported. The received "
            "batch size is %d, Input's shape is [%s].",
            x_dims[0], framework::make_ddim(x_dims)));
      }
    }
  }