|
|
@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
|
|
|
|
conv_desc.descriptor<T>(paddings, strides, dilations);
|
|
|
|
conv_desc.descriptor<T>(paddings, strides, dilations);
|
|
|
|
|
|
|
|
|
|
|
|
// ------------------- cudnn conv workspace ---------------------
|
|
|
|
// ------------------- cudnn conv workspace ---------------------
|
|
|
|
void* cudnn_workspace = nullptr;
|
|
|
|
|
|
|
|
size_t workspace_size_in_bytes; // final workspace to allocate.
|
|
|
|
size_t workspace_size_in_bytes; // final workspace to allocate.
|
|
|
|
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
|
|
|
|
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
|
|
|
|
if (user_workspace_size > 0) {
|
|
|
|
if (user_workspace_size > 0) {
|
|
|
@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
|
|
|
|
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
|
|
|
|
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
|
|
|
|
cudnn_output_desc, algo, &workspace_size_in_bytes));
|
|
|
|
cudnn_output_desc, algo, &workspace_size_in_bytes));
|
|
|
|
|
|
|
|
|
|
|
|
// Allocate on GPU memory
|
|
|
|
|
|
|
|
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
|
|
|
|
|
|
|
|
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ------------------- cudnn conv transpose forward ---------------------
|
|
|
|
// ------------------- cudnn conv transpose forward ---------------------
|
|
|
|
int input_offset = input->numel() / input->dims()[0] / groups;
|
|
|
|
int input_offset = input->numel() / input->dims()[0] / groups;
|
|
|
|
int output_offset = output->numel() / output->dims()[0] / groups;
|
|
|
|
int output_offset = output->numel() / output->dims()[0] / groups;
|
|
|
|
int filter_offset = filter->numel() / groups;
|
|
|
|
int filter_offset = filter->numel() / groups;
|
|
|
|
T alpha = 1.0f, beta = 0.0f;
|
|
|
|
T alpha = 1.0f, beta = 0.0f;
|
|
|
|
for (int g = 0; g < groups; g++) {
|
|
|
|
for (int g = 0; g < groups; g++) {
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
|
|
|
|
auto cudnn_func = [&](void* cudnn_workspace) {
|
|
|
|
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
|
|
|
|
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
|
|
|
|
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
|
|
|
|
algo, cudnn_workspace, workspace_size_in_bytes, &beta,
|
|
|
|
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
|
|
|
|
cudnn_output_desc, output_data + output_offset * g));
|
|
|
|
algo, cudnn_workspace, workspace_size_in_bytes, &beta,
|
|
|
|
|
|
|
|
cudnn_output_desc, output_data + output_offset * g));
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Release the cudnn workspace
|
|
|
|
|
|
|
|
paddle::memory::Free(gpu, cudnn_workspace);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
|
|
|
|
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ------------------- cudnn conv workspace ---------------------
|
|
|
|
|
|
|
|
// Already on GPU
|
|
|
|
|
|
|
|
void* cudnn_workspace = nullptr;
|
|
|
|
|
|
|
|
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
|
|
|
|
|
|
|
|
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
|
|
|
|
|
|
|
|
// ------------------- cudnn conv backward data ---------------------
|
|
|
|
// ------------------- cudnn conv backward data ---------------------
|
|
|
|
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
|
|
|
|
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
|
|
|
|
int input_offset = input->numel() / input->dims()[0] / groups;
|
|
|
|
int input_offset = input->numel() / input->dims()[0] / groups;
|
|
|
@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
|
|
|
|
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
|
|
|
|
// Because beta is zero, it is unnecessary to reset input_grad.
|
|
|
|
// Because beta is zero, it is unnecessary to reset input_grad.
|
|
|
|
for (int g = 0; g < groups; g++) {
|
|
|
|
for (int g = 0; g < groups; g++) {
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
|
|
|
|
auto cudnn_func = [&](void* cudnn_workspace) {
|
|
|
|
handle, &alpha, cudnn_output_desc,
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
|
|
|
|
output_grad_data + output_grad_offset * g, cudnn_filter_desc,
|
|
|
|
handle, &alpha, cudnn_output_desc,
|
|
|
|
filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
|
|
|
|
output_grad_data + output_grad_offset * g, cudnn_filter_desc,
|
|
|
|
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
|
|
|
|
filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
|
|
|
|
input_grad_data + input_offset * g));
|
|
|
|
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
|
|
|
|
|
|
|
|
input_grad_data + input_offset * g));
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
|
|
|
|
// Because beta is zero, it is unnecessary to reset filter_grad.
|
|
|
|
// Because beta is zero, it is unnecessary to reset filter_grad.
|
|
|
|
// Gradient with respect to the filter
|
|
|
|
// Gradient with respect to the filter
|
|
|
|
for (int g = 0; g < groups; g++) {
|
|
|
|
for (int g = 0; g < groups; g++) {
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
|
|
|
|
auto cudnn_func = [&](void* cudnn_workspace) {
|
|
|
|
handle, &alpha, cudnn_output_desc,
|
|
|
|
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
|
|
|
|
output_grad_data + output_grad_offset * g, cudnn_input_desc,
|
|
|
|
handle, &alpha, cudnn_output_desc,
|
|
|
|
input_data + input_offset * g, cudnn_conv_desc, filter_algo,
|
|
|
|
output_grad_data + output_grad_offset * g, cudnn_input_desc,
|
|
|
|
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
|
|
|
|
input_data + input_offset * g, cudnn_conv_desc, filter_algo,
|
|
|
|
filter_grad_data + filter_offset * g));
|
|
|
|
cudnn_workspace, workspace_size_in_bytes, &beta,
|
|
|
|
|
|
|
|
cudnn_filter_desc, filter_grad_data + filter_offset * g));
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Release the cudnn workspace
|
|
|
|
|
|
|
|
paddle::memory::Free(gpu, cudnn_workspace);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|