[ROCM] update fluid operators for rocm (part4), test=develop (#31225)

Qi Li authored 5 years ago, committed by GitHub
parent 91635de390
commit 72d99c5dcd

@@ -59,14 +59,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs,
paddle::operators::math::MaxPool<float>, float>
pool2d_forward;
pool2d_forward(idata, input_shape, output_shape, ksize_, strides_,
paddings_, pool_process, true, adaptive_, odatas[0], stream);
paddings_, true, adaptive_, odatas[0], stream, pool_process);
} else if (pool_type_ == PoolType::avg) {
paddle::operators::math::AvgPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor<
paddle::operators::math::AvgPool<float>, float>
pool2d_forward;
pool2d_forward(idata, input_shape, output_shape, ksize_, strides_,
paddings_, pool_process, true, adaptive_, odatas[0], stream);
paddings_, true, adaptive_, odatas[0], stream, pool_process);
}
return cudaGetLastError() != cudaSuccess;
@@ -224,14 +224,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
paddle::operators::math::MaxPool<float>, float>
pool2d_forward;
pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
pool_process, true, adaptive_, output, stream);
true, adaptive_, output, stream, pool_process);
} else if (pool_type_ == "avg") {
paddle::operators::math::AvgPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor<
paddle::operators::math::AvgPool<float>, float>
pool2d_forward;
pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
pool_process, true, adaptive_, output, stream);
true, adaptive_, output, stream, pool_process);
}
return cudaGetLastError() != cudaSuccess;
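
Both hunks make the same mechanical change: the PoolProcess functor moves from the middle of the argument list to the end, and the stream parameter becomes the portable gpuStream_t (cudaStream_t under CUDA, hipStream_t under ROCm). A minimal call sketch against the new signature, with illustrative names and pre-allocated device buffers:

// Call sketch for the reordered Pool2dDirectCUDAFunctor (names and shapes
// illustrative; idata/odata are assumed to be device pointers).
#include <vector>
#include "paddle/fluid/operators/math/pooling.h"

void MaxPool2dDirect(const float *idata, float *odata, gpuStream_t stream) {
  std::vector<int> input_shape = {1, 3, 8, 8};   // NCHW
  std::vector<int> output_shape = {1, 3, 4, 4};
  std::vector<int> ksize = {2, 2}, strides = {2, 2}, paddings = {0, 0};
  paddle::operators::math::MaxPool<float> pool_process;
  paddle::operators::math::Pool2dDirectCUDAFunctor<
      paddle::operators::math::MaxPool<float>, float>
      pool2d_forward;
  pool2d_forward(idata, input_shape, output_shape, ksize, strides, paddings,
                 /*exclusive=*/true, /*adaptive=*/false, odata, stream,
                 pool_process);  // PoolProcess functor now comes last
}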

File diff suppressed because it is too large.

@@ -18,7 +18,11 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#else
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
@@ -26,8 +30,11 @@ DECLARE_int64(cudnn_exhaustive_search_times);
namespace paddle {
namespace operators {
#if CUDNN_VERSION_MIN(6, 0, 5)
#ifdef PADDLE_WITH_HIP
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 1;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 1;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 1;
#elif CUDNN_VERSION_MIN(6, 0, 5)
static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
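
MIOpen's find API returns the single best algorithm, so the ROCm build collapses the algorithm counts to 1; on CUDA the counts still size the perf-result arrays for exhaustive search. A sketch of that typical use, CUDA path only (helper and variable names illustrative, not Paddle's actual search code):

// Sketch (CUDA path) of how kNUM_CUDNN_FWD_ALGS typically bounds the
// exhaustive-search result array; handles/descriptors assumed initialized.
#include <array>
#include <cudnn.h>

int FindBestFwdAlgo(cudnnHandle_t handle, cudnnTensorDescriptor_t x_desc,
                    const void *x, cudnnFilterDescriptor_t w_desc,
                    const void *w, cudnnConvolutionDescriptor_t conv_desc,
                    cudnnTensorDescriptor_t y_desc, void *y, void *workspace,
                    size_t workspace_size) {
  std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS> perf;
  int returned_algo_count = 0;
  cudnnFindConvolutionForwardAlgorithmEx(
      handle, x_desc, x, w_desc, w, conv_desc, y_desc, y,
      kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf.data(), workspace,
      workspace_size);
  return perf[0].algo;  // results are sorted by time, fastest first
}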

File diff suppressed because it is too large.

@@ -21,9 +21,13 @@ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
@@ -149,7 +153,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"AnyLayout"; // todo enable data layout when it's ready
framework::DataLayout layout = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::CanCUDNNBeUsed(ctx)) {
library = framework::LibraryType::kCUDNN;
}
@@ -559,7 +563,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input");
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
@@ -744,7 +748,7 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
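
The same one-line substitution recurs in every GetExpectedKernelType below (conv grad, double grad, conv transpose, pool): the previously CUDA-only branch now also compiles under ROCm, where the cuDNN-flavored helpers resolve to MIOpen via miopen_helper.h. The pattern in isolation:

// Guard pattern adopted throughout this commit; on the ROCm build the
// kCUDNN library type is ultimately serviced by MIOpen.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::CanCUDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kCUDNN;
  }
#endif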

File diff suppressed because it is too large.

@@ -183,7 +183,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input");
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
@@ -481,7 +481,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
@@ -581,7 +581,7 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;

@@ -28,15 +28,12 @@ function(math_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_srcs ${TARGET}.hip.cu)
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_ROCM_PLATFORM AND (${hip_srcs} MATCHES ".*\\.hip.cu$"))
hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif()
@@ -89,6 +86,10 @@ if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
endif()
if(WITH_ROCM)
hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
hip_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
if(WITH_TESTING AND TEST im2col_test)
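
With the separate *.hip.cu source lists removed, math_library feeds one set of .cc/.cu sources to nv_library under WITH_GPU and to hip_library under WITH_ROCM. Usage stays a one-liner per target (the DEPS shown here are illustrative):

# Usage sketch: a single source list now serves both GPU backends; the
# WITH_GPU / WITH_ROCM branch above selects nv_library or hip_library.
math_library(pooling)                      # compiles pooling.cc and pooling.cu
math_library(softmax DEPS math_function)   # DEPS list illustrative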

@@ -442,7 +442,7 @@ void TestConcatMain() {
TEST(math, concat) {
TestConcatMain<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TestConcatMain<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
#endif

@@ -30,8 +30,9 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* output,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
@@ -104,8 +105,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* output, PoolProcess pool_process) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
@@ -249,8 +250,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_grad_process,
bool exclusive, bool adaptive, framework::Tensor* input_grad) {
const std::vector<int>& paddings, bool exclusive, bool adaptive,
framework::Tensor* input_grad, PoolProcess pool_grad_process) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
@@ -328,8 +329,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::string data_format,
PoolProcess pool_grad_process, bool exclusive, bool adaptive,
framework::Tensor* input_grad) {
bool exclusive, bool adaptive, framework::Tensor* input_grad,
PoolProcess pool_grad_process) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
@@ -678,8 +679,9 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* output,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
@@ -773,8 +775,8 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* output, PoolProcess pool_process) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
@@ -970,8 +972,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_grad_process,
bool exclusive, bool adaptive, framework::Tensor* input_grad) {
const std::vector<int>& paddings, bool exclusive, bool adaptive,
framework::Tensor* input_grad, PoolProcess pool_grad_process) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
@@ -1071,8 +1073,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::string data_format,
PoolProcess pool_grad_process, bool exclusive, bool adaptive,
framework::Tensor* input_grad) {
bool exclusive, bool adaptive, framework::Tensor* input_grad,
PoolProcess pool_grad_process) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];

@@ -237,8 +237,8 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
const T* input, const std::vector<int>& input_shape,
const std::vector<int>& output_shape, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
PoolProcess pool_compute, bool exclusive, bool adaptive, T* output,
cudaStream_t stream) {
bool exclusive, bool adaptive, T* output, gpuStream_t stream,
PoolProcess pool_compute) {
const int batch_size = input_shape[0];
const int input_channels = input_shape[1];
const int input_height = input_shape[2];
@@ -277,8 +277,9 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* output,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
@@ -311,8 +312,8 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* output, PoolProcess pool_process) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
@@ -367,9 +368,9 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive,
framework::Tensor* input_grad) {
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* input_grad,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
@@ -399,13 +400,15 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
ksize_width, stride_height, stride_width, padding_height, padding_width,
pool_process, exclusive, adaptive, input_grad_data);
}
void operator()(
const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& output,
const framework::Tensor& output_grad, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process, bool exclusive,
bool adaptive, framework::Tensor* input_grad) {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* input_grad, PoolProcess pool_process) {
bool channel_last = (data_format == "NHWC");
const int batch_size = input.dims()[0];
@@ -881,8 +884,9 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* output,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
@@ -922,8 +926,8 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process,
bool exclusive, bool adaptive, framework::Tensor* output) {
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* output, PoolProcess pool_process) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];
@@ -988,9 +992,9 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
bool exclusive, bool adaptive,
framework::Tensor* input_grad) {
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* input_grad,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
@@ -1028,13 +1032,15 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
stride_height, stride_width, padding_depth, padding_height,
padding_width, pool_process, exclusive, adaptive, input_grad_data);
}
void operator()(
const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& output,
const framework::Tensor& output_grad, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_process, bool exclusive,
bool adaptive, framework::Tensor* input_grad) {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* input_grad, PoolProcess pool_process) {
bool channel_last = (data_format == "NDHWC");
const int batch_size = input.dims()[0];

@@ -97,7 +97,7 @@ HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
* This is different from average pooling. So we rewrite the max_pool_grad:
* MaxPool2dGradFunctor, MaxPool3dGradFunctor.
*/
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename PoolProcess, typename T>
class Pool2dDirectCUDAFunctor {
public:
@@ -105,9 +105,9 @@ class Pool2dDirectCUDAFunctor {
const std::vector<int>& output_shape,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, T* output,
cudaStream_t stream);
const std::vector<int>& paddings, bool exclusive,
bool adaptive, T* output, gpuStream_t stream,
PoolProcess pool_compute);
};
#endif
@@ -117,16 +117,17 @@ class Pool2dFunctor {
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* output,
PoolProcess pool_compute);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* output, PoolProcess pool_compute);
};
template <typename DeviceContext, typename PoolProcess, typename T>
@@ -137,8 +138,9 @@ class Pool2dGradFunctor {
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* input_grad,
PoolProcess pool_compute);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
@@ -146,8 +148,8 @@ class Pool2dGradFunctor {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* input_grad, PoolProcess pool_compute);
};
template <typename DeviceContext, class T>
@@ -176,15 +178,16 @@ class Pool3dFunctor {
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* output,
PoolProcess pool_compute);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* output);
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* output, PoolProcess pool_compute);
};
template <typename DeviceContext, typename PoolProcess, typename T>
@@ -195,8 +198,9 @@ class Pool3dGradFunctor {
const framework::Tensor& output_grad,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
const std::vector<int>& paddings, bool exclusive,
bool adaptive, framework::Tensor* input_grad,
PoolProcess pool_compute);
// overload operator() to support argument data_format
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
@@ -204,8 +208,8 @@ class Pool3dGradFunctor {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string data_format, PoolProcess pool_compute,
bool exclusive, bool adaptive, framework::Tensor* input_grad);
const std::string data_format, bool exclusive, bool adaptive,
framework::Tensor* input_grad, PoolProcess pool_compute);
};
template <typename DeviceContext, class T>
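
Every functor declaration in this header follows the same reordering: exclusive and adaptive come directly after the geometry arguments, the output pointer follows, and the PoolProcess functor closes the list. A call sketch against the 2-D forward overload (wrapper name and shapes illustrative):

// Call sketch for the reordered Pool2dFunctor; the device context and the
// tensors are assumed to be set up by the caller.
#include <vector>
#include "paddle/fluid/operators/math/pooling.h"

void AvgPool2dForward(const paddle::platform::CUDADeviceContext &ctx,
                      const paddle::framework::Tensor &in,
                      paddle::framework::Tensor *out) {
  std::vector<int> ksize = {2, 2}, strides = {2, 2}, paddings = {0, 0};
  paddle::operators::math::Pool2dFunctor<
      paddle::platform::CUDADeviceContext,
      paddle::operators::math::AvgPool<float>, float>
      pool2d_forward;
  paddle::operators::math::AvgPool<float> pool_process;
  pool2d_forward(ctx, in, ksize, strides, paddings, /*exclusive=*/true,
                 /*adaptive=*/false, out, pool_process);
}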

File diff suppressed because it is too large.

@@ -18,6 +18,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
@@ -180,7 +183,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
@@ -235,7 +238,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}

@@ -205,7 +205,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, true, false, out);
true, false, out, pool_process);
} else if (pooling_type == "avg") {
std::vector<int> reduce_dim;
@@ -213,7 +213,12 @@ class PoolKernel : public framework::OpKernel<T> {
if (reduce_num > 0 &&
adaptive) { // for adaptive_avg_pool2d && output_size == 1
#ifdef __NVCC__
#ifdef __HIPCC__
auto stream = dev_ctx.stream();
TensorReduce<T, T, hipcub::Sum, DivideFunctor<T>>(
*in_x, out, reduce_dim, static_cast<T>(0), hipcub::Sum(),
DivideFunctor<T>(reduce_num), stream);
#elif defined(__NVCC__)
auto stream = dev_ctx.stream();
TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
*in_x, out, reduce_dim, static_cast<T>(0), cub::Sum(),
@@ -224,7 +229,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings,
data_format, pool_process, exclusive, adaptive, out);
data_format, exclusive, adaptive, out, pool_process);
#endif
} else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1
paddle::operators::math::Pool2dFunctor<
@@ -232,7 +237,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings,
data_format, pool_process, exclusive, adaptive, out);
data_format, exclusive, adaptive, out, pool_process);
}
}
} break;
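
The reduction branch above is the one spot where the CUDA and ROCm paths still diverge textually; hipcub deliberately mirrors cub's interface, so only the namespace differs. A generic sketch of that dispatch idiom (the gpucub alias is hypothetical, not part of this commit):

// Sketch of the cub/hipcub dispatch idiom; hipcub mirrors cub's API, so a
// namespace alias is enough to share one call site (alias name hypothetical).
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace gpucub = hipcub;
#elif defined(__NVCC__)
#include <cub/cub.cuh>
namespace gpucub = cub;
#endif
// A reduction such as TensorReduce<T, T, gpucub::Sum, DivideFunctor<T>>(...)
// would then compile unchanged on either backend.
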
@@ -243,7 +248,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool3d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, true, false, out);
true, false, out, pool_process);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dFunctor<
@@ -251,7 +256,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool3d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
pool_process, exclusive, adaptive, out);
exclusive, adaptive, out, pool_process);
}
} break;
default: {
@@ -324,8 +329,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool2d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, data_format, pool_process, exclusive,
adaptive, in_x_grad);
paddings, data_format, exclusive, adaptive,
in_x_grad, pool_process);
}
} break;
case 3: {
@@ -340,8 +345,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool3d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, data_format, pool_process, exclusive,
adaptive, in_x_grad);
paddings, data_format, exclusive, adaptive,
in_x_grad, pool_process);
}
} break;
default: {

@@ -56,14 +56,14 @@ class SppKernel : public framework::OpKernel<T> {
math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
math::MaxPool<T> max_process;
pool_forward(context.template device_context<DeviceContext>(), *in_x,
kernel_size, strides, paddings, max_process, true, false,
&out_level);
kernel_size, strides, paddings, true, false, &out_level,
max_process);
} else if (pooling_type == "avg") {
math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
math::AvgPool<T> avg_process;
pool_forward(context.template device_context<DeviceContext>(), *in_x,
kernel_size, strides, paddings, avg_process, true, false,
&out_level);
kernel_size, strides, paddings, true, false, &out_level,
avg_process);
}
// flatten pooling output shape
int output_flatten_w = in_x->dims()[1] * bins * bins;
@@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
math::AvgPoolGrad<T> avg_process;
pool_backward(context.template device_context<DeviceContext>(), *in_x,
*&out_level, *&outgrad_level, kernel_size, strides,
paddings, avg_process, true, false, in_x_grad);
paddings, true, false, in_x_grad, avg_process);
}
}
}
