|
|
@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
|
|
|
|
// cudnn 7 can support groups, no need to do it mannually
|
|
|
|
// cudnn 7 can support groups, no need to do it mannually
|
|
|
|
// FIXME(typhoonzero): find a better way to disable groups
|
|
|
|
// FIXME(typhoonzero): find a better way to disable groups
|
|
|
|
// rather than setting it to 1.
|
|
|
|
// rather than setting it to 1.
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
|
|
|
|
cudnn_conv_desc, groups));
|
|
|
|
cudnn_conv_desc, groups));
|
|
|
|
groups = 1;
|
|
|
|
groups = 1;
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
|
|
|
|
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
|
|
|
|
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
|
|
|
|
auto handle = dev_ctx.cudnn_handle();
|
|
|
|
auto handle = dev_ctx.cudnn_handle();
|
|
|
|
|
|
|
|
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
|
|
|
|
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
|
|
|
|
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
|
|
|
|
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
|
|
|
|
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
|
|
|
|
workspace_size_limit, &algo));
|
|
|
|
workspace_size_limit, &algo));
|
|
|
@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
|
|
|
|
if (dev_ctx.GetComputeCapability() >= 70 &&
|
|
|
|
if (dev_ctx.GetComputeCapability() >= 70 &&
|
|
|
|
std::type_index(typeid(T)) ==
|
|
|
|
std::type_index(typeid(T)) ==
|
|
|
|
std::type_index(typeid(platform::float16))) {
|
|
|
|
std::type_index(typeid(platform::float16))) {
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
|
|
|
|
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
|
|
|
|
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
|
|
|
|
// Currently tensor core is only enabled using this algo
|
|
|
|
// Currently tensor core is only enabled using this algo
|
|
|
|
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
|
|
|
|
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
|
|
|
|
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
|
|
|
|
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// get workspace size able to allocate
|
|
|
|
// get workspace size able to allocate
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
|
|
|
|
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
|
|
|
|
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
|
|
|
|
cudnn_output_desc, algo, &workspace_size_in_bytes));
|
|
|
|
cudnn_output_desc, algo, &workspace_size_in_bytes));
|
|
|
|
// It is possible for float16 on Volta GPU to allocate more memory than
|
|
|
|
// It is possible for float16 on Volta GPU to allocate more memory than
|
|
|
@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
|
|
|
|
// ------------------- cudnn conv forward ---------------------
|
|
|
|
// ------------------- cudnn conv forward ---------------------
|
|
|
|
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
|
|
|
|
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
|
|
|
|
for (int i = 0; i < groups; i++) {
|
|
|
|
for (int i = 0; i < groups; i++) {
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
|
|
|
|
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
|
|
|
|
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
|
|
|
|
cudnn_filter_desc, filter_data + i * group_offset_filter,
|
|
|
|
cudnn_filter_desc, filter_data + i * group_offset_filter,
|
|
|
|
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
|
|
|
|
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
|
|
|
@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
// cudnn 7 can support groups, no need to do it mannually
|
|
|
|
// cudnn 7 can support groups, no need to do it mannually
|
|
|
|
// FIXME(typhoonzero): find a better way to disable groups
|
|
|
|
// FIXME(typhoonzero): find a better way to disable groups
|
|
|
|
// rather than setting it to 1.
|
|
|
|
// rather than setting it to 1.
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
|
|
|
|
cudnn_conv_desc, groups));
|
|
|
|
cudnn_conv_desc, groups));
|
|
|
|
groups = 1;
|
|
|
|
groups = 1;
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
@ -273,7 +273,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
auto handle = dev_ctx.cudnn_handle();
|
|
|
|
auto handle = dev_ctx.cudnn_handle();
|
|
|
|
if (input_grad) {
|
|
|
|
if (input_grad) {
|
|
|
|
if (FLAGS_cudnn_deterministic) {
|
|
|
|
if (FLAGS_cudnn_deterministic) {
|
|
|
|
PADDLE_ENFORCE(
|
|
|
|
CUDNN_ENFORCE(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
|
|
|
|
handle, cudnn_filter_desc,
|
|
|
|
handle, cudnn_filter_desc,
|
|
|
|
// dyDesc: Handle to the previously initialized input
|
|
|
|
// dyDesc: Handle to the previously initialized input
|
|
|
@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
|
|
|
|
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
PADDLE_ENFORCE(
|
|
|
|
CUDNN_ENFORCE(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
|
|
|
|
handle, cudnn_filter_desc, cudnn_output_grad_desc,
|
|
|
|
handle, cudnn_filter_desc, cudnn_output_grad_desc,
|
|
|
|
cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
|
|
|
|
cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
|
|
|
@ -298,7 +298,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
|
|
|
|
|
|
|
|
if (filter_grad) {
|
|
|
|
if (filter_grad) {
|
|
|
|
if (FLAGS_cudnn_deterministic) {
|
|
|
|
if (FLAGS_cudnn_deterministic) {
|
|
|
|
PADDLE_ENFORCE(
|
|
|
|
CUDNN_ENFORCE(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
|
|
|
|
handle, cudnn_input_desc, cudnn_output_grad_desc,
|
|
|
|
handle, cudnn_input_desc, cudnn_output_grad_desc,
|
|
|
|
cudnn_conv_desc, cudnn_filter_desc,
|
|
|
|
cudnn_conv_desc, cudnn_filter_desc,
|
|
|
@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
|
|
|
|
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
PADDLE_ENFORCE(
|
|
|
|
CUDNN_ENFORCE(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
|
|
|
|
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
|
|
|
|
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
|
|
|
|
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
|
|
|
|
cudnn_filter_desc, filter_algo, &tmp_size));
|
|
|
|
cudnn_filter_desc, filter_algo, &tmp_size));
|
|
|
@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
// Because beta is zero, it is unnecessary to reset input_grad.
|
|
|
|
// Because beta is zero, it is unnecessary to reset input_grad.
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < groups; i++) {
|
|
|
|
for (int i = 0; i < groups; i++) {
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
|
|
|
|
handle, &alpha, cudnn_filter_desc,
|
|
|
|
handle, &alpha, cudnn_filter_desc,
|
|
|
|
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
|
|
|
|
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
|
|
|
|
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
|
|
|
|
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
|
|
|
@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
|
|
|
|
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
|
|
|
|
// Because beta is zero, it is unnecessary to reset filter_grad.
|
|
|
|
// Because beta is zero, it is unnecessary to reset filter_grad.
|
|
|
|
for (int i = 0; i < groups; i++) {
|
|
|
|
for (int i = 0; i < groups; i++) {
|
|
|
|
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
|
|
|
|
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
|
|
|
|
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
|
|
|
|
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
|
|
|
|
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
|
|
|
|
cudnn_conv_desc, filter_algo, cudnn_workspace,
|
|
|
|
cudnn_conv_desc, filter_algo, cudnn_workspace,
|
|
|
|