From 2fd999d9799c141864a308db359f948700fac4c0 Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Mon, 1 Mar 2021 10:41:57 +0800
Subject: [PATCH] Optimized the adaptive_avg_pool2d op when output_size == 1
 (#31197)

* Optimized the adaptive_avg_pool2d op when output_size == 1
---
 .../operators/{pool_op.cu.cc => pool_op.cu}  |  0
 paddle/fluid/operators/pool_op.h             | 65 +++++++++++++++++--
 2 files changed, 58 insertions(+), 7 deletions(-)
 rename paddle/fluid/operators/{pool_op.cu.cc => pool_op.cu} (100%)

diff --git a/paddle/fluid/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu
similarity index 100%
rename from paddle/fluid/operators/pool_op.cu.cc
rename to paddle/fluid/operators/pool_op.cu
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index 71bef11b67..6b0dbd2d83 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -22,8 +22,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
+#ifdef __NVCC__
+#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+#endif
+
 namespace paddle {
 namespace operators {
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
 
 using Tensor = framework::Tensor;
 
@@ -124,6 +136,26 @@ inline void UpdateKsize(std::vector<int>* ksize,
   }
 }
 
+inline int getReduceNum(const framework::Tensor& input,
+                        const framework::Tensor* output,
+                        const std::string data_format,
+                        std::vector<int>* reduce_dim) {
+  // data_format only can be NCHW
+  bool channel_last = (data_format == "NHWC");
+  if (channel_last) {
+    return 0;
+  }
+  int reduce_num = 0;
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  if ((output_height == 1) && (output_width == 1)) {
+    reduce_dim->push_back(2);
+    reduce_dim->push_back(3);
+    reduce_num = input.dims()[2] * input.dims()[3];
+  }
+  return reduce_num;
+}
+
 template <typename DeviceContext, typename T>
 class PoolKernel : public framework::OpKernel<T> {
  public:
@@ -164,7 +196,6 @@ class PoolKernel : public framework::OpKernel<T> {
     if (global_pooling) {
       UpdateKsize(&ksize, data_dims);
     }
-
     auto& dev_ctx = context.template device_context<DeviceContext>();
     switch (ksize.size()) {
       case 2: {
@@ -177,12 +208,32 @@ class PoolKernel : public framework::OpKernel<T> {
               pool_process, true, false, out);
 
         } else if (pooling_type == "avg") {
-          paddle::operators::math::Pool2dFunctor<
-              DeviceContext, paddle::operators::math::AvgPool<T>, T>
-              pool2d_forward;
-          paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format,
-                         pool_process, exclusive, adaptive, out);
+          std::vector<int> reduce_dim;
+          int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim);
+
+          if (reduce_num > 0 &&
+              adaptive) {  // for adaptive_avg_pool2d && output_size == 1
+#ifdef __NVCC__
+            auto stream = dev_ctx.stream();
+            TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
+                *in_x, out, reduce_dim, static_cast<T>(0), cub::Sum(),
+                DivideFunctor<T>(reduce_num), stream);
+#else  // for cpu
+            paddle::operators::math::Pool2dFunctor<
+                DeviceContext, paddle::operators::math::AvgPool<T>, T>
+                pool2d_forward;
+            paddle::operators::math::AvgPool<T> pool_process;
+            pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings,
+                           data_format, pool_process, exclusive, adaptive, out);
+#endif
+          } else {  // avgpool_2d or adaptive_avg_pool2d && output_size != 1
+            paddle::operators::math::Pool2dFunctor<
+                DeviceContext, paddle::operators::math::AvgPool<T>, T>
+                pool2d_forward;
+            paddle::operators::math::AvgPool<T> pool_process;
+            pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings,
+                           data_format, pool_process, exclusive, adaptive, out);
+          }
         }
       } break;
       case 3: {
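
Background note (not part of the patch): when output_size == 1, adaptive average pooling degenerates to the mean over the H*W spatial plane, which is why the GPU branch above can replace the generic pooling kernel with a single reduction whose inputs are pre-scaled by 1/(H*W) via DivideFunctor. The following is a minimal standalone CUDA sketch of that reduce-then-scale idea using CUB directly; it does not use Paddle's TensorReduce, and the shapes, names, and the choice of cub::TransformInputIterator are illustrative assumptions.

// mean_pool_sketch.cu -- build with: nvcc mean_pool_sketch.cu -o mean_pool_sketch
#include <cub/cub.cuh>
#include <cstdio>
#include <vector>

// Same role as the patch's DivideFunctor: scale each element by 1/n so that
// a plain sum reduction yields the mean.
template <typename T>
struct DivideFunctor {
  explicit DivideFunctor(int n) : n_inv(static_cast<T>(1.0 / n)) {}
  __host__ __device__ T operator()(const T& x) const { return x * n_inv; }
  T n_inv;
};

int main() {
  const int H = 32, W = 32, n = H * W;
  std::vector<float> h_in(n, 2.0f);  // constant plane, so the mean is 2.0
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // Elements are multiplied by 1/n on the fly, so the sum equals the mean.
  cub::TransformInputIterator<float, DivideFunctor<float>, const float*> it(
      d_in, DivideFunctor<float>(n));

  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Sum(d_temp, temp_bytes, it, d_out, n);  // size query
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Sum(d_temp, temp_bytes, it, d_out, n);  // actual reduce

  float mean = 0.0f;
  cudaMemcpy(&mean, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("adaptive avg pool (output_size == 1) result: %f\n", mean);

  cudaFree(d_temp);
  cudaFree(d_out);
  cudaFree(d_in);
  return 0;
}

The patch's TensorReduce call plays the same role as the two DeviceReduce::Sum calls here, with the division fused into the reduction through the transform functor instead of a separate scaling pass.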