From 03ccb9a461db7650fd1dc749f2f61a4df253bf31 Mon Sep 17 00:00:00 2001
From: Yihua Xu
Date: Thu, 15 Nov 2018 16:07:16 +0800
Subject: [PATCH 01/16] Optimize the stack operator

---
 paddle/fluid/operators/stack_op.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h
index d236c5b943..f1692ae956 100644
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
@@ -147,16 +147,23 @@ class StackKernel : public framework::OpKernel<T> {
     auto &dim = x[0]->dims();
     for (auto i = 0; i < axis; ++i) pre *= dim[i];
     for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-    int total_num = pre * n * post;
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
 #ifdef __NVCC__
     thrust::device_vector<const T *> device_x_vec(x_datas);
     auto x_data_arr = device_x_vec.data().get();
 #else
     auto x_data_arr = x_datas.data();
 #endif
-    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
+    size_t x_offset = 0;
+    size_t y_offset = 0;
+    for (int i = 0; i < pre; i++) {
+      for (int j = 0; j < n; j++) {
+        std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
+                    post * sizeof(T));
+        y_offset += post;
+      }
+      x_offset += post;
+    }
 #ifdef __NVCC__
     // Wait() must be called because device_x_vec may be destructed before
     // kernel ends

From 513bb6c1513dde0e3b9e2b9da5acccd9649cda0d Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Thu, 8 Nov 2018 17:16:16 +0100
Subject: [PATCH 02/16] Squashing MKL based softmax for inference

test=develop

- Added profiling to softmax functors
- MKL based softmax inference op
- Fix to softmax computation via MKL
- cleaning
- Cosmetic fixes to softmax MKL
- Fix to ON_INFER lack of propagation
---
 CMakeLists.txt                             | 15 +++---
 paddle/fluid/operators/math/softmax_impl.h | 59 ++++++++++++----------
 paddle/fluid/operators/softmax_op.h        |  2 +-
 3 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cfec8e70b..c62cc9bfd7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
+if (ON_INFER)
+  message(STATUS "On inference mode, will take place some specific optimization.")
+  add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
@@ -312,10 +320,3 @@ if(WITH_DOC)
   find_python_module(recommonmark REQUIRED)
   add_subdirectory(doc)
 endif()
-
-if (ON_INFER)
-  message(STATUS "On inference mode, will take place some specific optimization.")
-else()
-  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 7cf98f2725..e09a243347 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -16,6 +16,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { namespace math { @@ -65,36 +66,42 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)); } -template -class SoftmaxFunctor { +template +class SoftmaxFunctor { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - + auto in_dims = X->dims(); + auto out_dims = Y->dims(); + const float* in_data = X->data(); + float* out_data = Y->data(); const int kBatchDim = 0; const int kClassDim = 1; - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - - softmax.device(*context.eigen_device()) = shifted_logits.exp(); - softmax.device(*context.eigen_device()) = (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + // 2D data. Batch x C + const int batch_size = in_dims[kBatchDim]; + const int num_classes = in_dims[kClassDim]; + std::vector entities(batch_size); + auto blas = math::GetBlas(context); + for (int n = 0; n < batch_size; ++n) { + entities[n] = in_data[n * num_classes]; + for (int c = 1; c < num_classes; ++c) { + entities[n] = in_data[n * num_classes + c] > entities[n] + ? in_data[n * num_classes + c] + : entities[n]; + } + for (int c = 0; c < num_classes; ++c) { + out_data[n * num_classes + c] = + in_data[n * num_classes + c] - entities[n]; + } + } + + blas.VEXP(num_classes * batch_size, out_data, out_data); + for (int n = 0; n < batch_size; ++n) { + entities[n] = out_data[n * num_classes]; + for (int c = 1; c < num_classes; ++c) { + entities[n] += out_data[n * num_classes + c]; + } + blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]); + } } }; diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 2fea8a65bc..91829d5761 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -35,7 +35,7 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); -#ifdef ON_INFER +#ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); #else From ba3eaed7a7426a10f4a394071852c6f5d6ab8e1e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 09:13:34 +0000 Subject: [PATCH 03/16] exp support all size --- paddle/fluid/operators/math/jit_code.cc | 114 ++++++++++++++++-- paddle/fluid/operators/math/jit_code.h | 8 +- .../fluid/operators/math/jit_kernel_test.cc | 5 +- 3 files changed, 113 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e3b600d442..9efd4e8174 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -81,10 +81,10 @@ void VXXJitCode::generate() { } if (rest >= 2) { if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); + vmovq(xmm_src1, ptr[param1 + offset]); } if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + 
offset]); + vmovq(xmm_src2, ptr[param2 + offset]); } if (type_ == operand_type::mul) { vmulps(xmm_dst, xmm_src1, xmm_src2); @@ -100,10 +100,10 @@ void VXXJitCode::generate() { } if (rest > 0) { if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); + vmovss(xmm_src1, ptr[param1 + offset]); } if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); + vmovss(xmm_src2, ptr[param2 + offset]); } if (type_ == operand_type::mul) { vmulss(xmm_dst, xmm_src1, xmm_src2); @@ -179,7 +179,7 @@ bool VActJitCode::init(int d, operand_type type) { return ok; } else if (type == operand_type::exp) { // exp is slower than mkl when d >= 256 - return ok && d % 8 == 0 && d < 256; + return ok; //&& d % 4 == 0 && d < 256; } else { // TODO(TJ): support more return ok && d % 8 == 0; @@ -190,6 +190,10 @@ void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { vmaxps(ymm_dst, ymm_zero, ymm_src); } +void VActJitCode::relu_xmm(xmm_t& xmm_dst, xmm_t& xmm_src, xmm_t& xmm_zero) { + vmaxps(xmm_dst, xmm_zero, xmm_src); +} + void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore @@ -271,6 +275,65 @@ void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, pop(reg_ptr_global); } +void VActJitCode::exp_xmm(xmm_t& ymm_dst, xmm_t& ymm_src, int fx_idx, + int fy_idx, int mask_idx, int tmp_idx) { + assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore + // check all idx can not equal + xmm_t ymm_fx = xmm_t(fx_idx); + xmm_t ymm_fy = xmm_t(fy_idx); + xmm_t ymm_mask = xmm_t(mask_idx); + xmm_t ymm_tmp = xmm_t(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(ymm_src, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(ymm_src, ymm_src, ymm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(ymm_fx, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(ymm_fx, ymm_fx, ymm_tmp); + vroundps(ymm_fy, ymm_fx, 0x01); + // if greater, substract 1 + vcmpgtps(ymm_mask, ymm_fy, ymm_fx); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vandps(ymm_mask, ymm_mask, ymm_tmp); + vsubps(ymm_fx, ymm_fy, ymm_mask); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(ymm_fy, ymm_fx, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + xmm_t ymm_z = xmm_t(ymm_mask.getIdx()); + vmulps(ymm_z, ymm_fx, ymm_tmp); + vsubps(ymm_src, ymm_src, ymm_fy); + vsubps(ymm_src, ymm_src, ymm_z); + vmulps(ymm_z, ymm_src, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(ymm_dst, ymm_src, ymm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_src); + } + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_z); + vaddps(ymm_dst, ymm_dst, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + // build 2^n + xmm_t ymm_int = ymm_fx; + vcvttps2dq(ymm_int, ymm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(ymm_tmp, ptr[reg_ptr_global]); + vpaddd(ymm_int, ymm_int, ymm_tmp); 
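+  // At this point ymm_int holds n + 0x7f: n came from vcvttps2dq above and
+  // 0x7f (from exp_int_0x7f) is the IEEE-754 single-precision exponent
+  // bias. The shift below moves it into the exponent field, so each lane
+  // reinterpreted as a float equals 2^n, which then scales the Cephes
+  // polynomial result.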
+ vpslld(ymm_int, ymm_int, 23); + vmulps(ymm_dst, ymm_dst, ymm_int); + pop(reg_ptr_global); +} + void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { // y = 1 / (1 + e^-x) @@ -343,7 +406,7 @@ void VActJitCode::generate() { vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } - if (type_ != operand_type::relu) { + if (type_ != operand_type::relu && type_ != operand_type::exp) { // TODO(TJ): remove me ret(); return; @@ -351,21 +414,50 @@ void VActJitCode::generate() { int rest = num_ % YMM_FLOAT_BLOCK; if (rest >= 4) { vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovups(ptr[param2 + offset], xmm_dst); offset += sizeof(float) * 4; rest -= 4; } if (rest >= 2) { - vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovq(xmm_src, ptr[param1 + offset]); + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovq(ptr[param2 + offset], xmm_dst); offset += sizeof(float) * 2; rest -= 2; } if (rest > 0) { - vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + // vmovups(); + vmovss(xmm_src, ptr[param1 + offset]); + + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovss(ptr[param2 + offset], xmm_dst); } ret(); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 71205b211b..1467978f26 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -127,13 +127,17 @@ class VActJitCode : public JitCode { void generate() override; protected: - // compute relu with ymm + // compute relu with ymm, xmm void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, const Xbyak::Ymm& zero); + void relu_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, + const Xbyak::Xmm& zero); - // compute exp with ymm + // compute exp with ymm, xmm void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + void exp_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, int fx_idx = 2, + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); // compute sigmoid with ymm void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 5a6f87fe1f..178298bf56 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -33,6 +33,9 @@ limitations under the License. 
*/ constexpr int repeat = 20000; +// TODO(TJ): benchmark and test should be seperated, +// benchmark should verify more sizes + inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 128, 256}) { + for (int d : {7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); From 4e67fe6a122636bc84b2f8df6d5f94feb5ed1a78 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 10:09:40 +0000 Subject: [PATCH 04/16] refine act and vxx with all size --- paddle/fluid/operators/math/jit_code.cc | 147 ++++++++++-------------- 1 file changed, 60 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 9efd4e8174..a5eef019c8 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -60,60 +60,53 @@ void VXXJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); - } - if (type_ == operand_type::mul) { - vmulps(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddps(xmm_dst, xmm_src1, xmm_src2); - } - if (with_relu_) { - vmaxps(xmm_dst, xmm_zero, xmm_dst); - } - vmovups(ptr[param3 + offset], xmm_dst); - offset += sizeof(float) * 4; - rest -= 4; - } - if (rest >= 2) { - if (scalar_index_ != 1) { - vmovq(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovq(xmm_src2, ptr[param2 + offset]); + int block = XMM_FLOAT_BLOCK; + while (rest > 0) { + if (rest >= 4) { + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } + } else if (rest >= 2) { + if (scalar_index_ != 1) { + vmovq(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovq(xmm_src2, ptr[param2 + offset]); + } + } else { + if (scalar_index_ != 1) { + vmovss(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovss(xmm_src2, ptr[param2 + offset]); + } } - if (type_ == operand_type::mul) { - vmulps(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddps(xmm_dst, xmm_src1, xmm_src2); + switch (type_) { + case operand_type::mul: + vmulps(xmm_dst, xmm_src1, xmm_src2); + break; + case operand_type::add: + vaddps(xmm_dst, xmm_src1, xmm_src2); + break; + default: + break; } if (with_relu_) { vmaxps(xmm_dst, xmm_zero, xmm_dst); } - vmovq(ptr[param3 + offset], xmm_dst); - offset += sizeof(float) * 2; - rest -= 2; - } - if (rest > 0) { - if (scalar_index_ != 1) { - vmovss(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovss(xmm_src2, ptr[param2 + offset]); - } - if (type_ == operand_type::mul) { - vmulss(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddss(xmm_dst, xmm_src1, xmm_src2); + if (rest >= 4) { + vmovups(ptr[param3 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param3 + offset], xmm_dst); + } else { + vmovss(ptr[param3 + offset], xmm_dst); } - if (with_relu_) { - vmaxps(xmm_dst, xmm_zero, xmm_dst); - } - vmovss(ptr[param3 + offset], xmm_dst); + offset += sizeof(float) * block; + rest -= block; + 
block /= 2; } ret(); } @@ -175,11 +168,9 @@ static int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { bool ok = MayIUse(avx); - if (type == operand_type::relu) { + if (type == operand_type::relu || type == operand_type::exp) { + // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256 return ok; - } else if (type == operand_type::exp) { - // exp is slower than mkl when d >= 256 - return ok; //&& d % 4 == 0 && d < 256; } else { // TODO(TJ): support more return ok && d % 8 == 0; @@ -412,24 +403,15 @@ void VActJitCode::generate() { return; } int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - vmovups(xmm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; + int block = XMM_FLOAT_BLOCK; + while (rest > 0) { + if (rest >= 4) { + vmovups(xmm_src, ptr[param1 + offset]); + } else if (rest >= 2) { + vmovq(xmm_src, ptr[param1 + offset]); + } else { + vmovss(xmm_src, ptr[param1 + offset]); } - vmovups(ptr[param2 + offset], xmm_dst); - offset += sizeof(float) * 4; - rest -= 4; - } - if (rest >= 2) { - vmovq(xmm_src, ptr[param1 + offset]); switch (type_) { case operand_type::relu: relu_xmm(xmm_dst, xmm_src, xmm_zero); @@ -440,25 +422,16 @@ void VActJitCode::generate() { default: break; } - vmovq(ptr[param2 + offset], xmm_dst); - offset += sizeof(float) * 2; - rest -= 2; - } - if (rest > 0) { - // vmovups(); - vmovss(xmm_src, ptr[param1 + offset]); - - switch (type_) { - case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; + if (rest >= 4) { + vmovups(ptr[param2 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param2 + offset], xmm_dst); + } else { + vmovss(ptr[param2 + offset], xmm_dst); } - vmovss(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * block; + rest -= block; + block /= 2; } ret(); } From d3eae8f61b26c4fa053a74ce35aeb241db2c3b3b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 14:58:43 +0000 Subject: [PATCH 05/16] refine relu and fix addrelu test --- paddle/fluid/operators/math/jit_code.cc | 12 ++---------- paddle/fluid/operators/math/jit_code.h | 8 ++++---- paddle/fluid/operators/math/jit_kernel_test.cc | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a5eef019c8..2a10cd7821 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -177,14 +177,6 @@ bool VActJitCode::init(int d, operand_type type) { } } -void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { - vmaxps(ymm_dst, ymm_zero, ymm_src); -} - -void VActJitCode::relu_xmm(xmm_t& xmm_dst, xmm_t& xmm_src, xmm_t& xmm_zero) { - vmaxps(xmm_dst, xmm_zero, xmm_src); -} - void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore @@ -378,7 +370,7 @@ void VActJitCode::generate() { vmovups(ymm_src, ptr[param1 + offset]); switch (type_) { case operand_type::relu: - relu_ymm(ymm_dst, ymm_src, ymm_zero); + relu_jmm(ymm_dst, ymm_src, ymm_zero); break; case operand_type::exp: exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); @@ -414,7 +406,7 @@ void VActJitCode::generate() { } switch (type_) { case 
operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); + relu_jmm(xmm_dst, xmm_src, xmm_zero); break; case operand_type::exp: exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 1467978f26..6adeebca7c 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -128,10 +128,10 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm - void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, - const Xbyak::Ymm& zero); - void relu_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, - const Xbyak::Xmm& zero); + template + void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + vmaxps(dst, src, zero); + } // compute exp with ymm, xmm void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 178298bf56..932fa4c000 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -762,7 +762,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + vaddrelu_ref(d, x_data, y_data, zref_data); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); From ccb8963705205eef1f7447be7964dce008c7b997 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 16:54:48 +0000 Subject: [PATCH 06/16] refine exp jitcode with all size test=develop --- paddle/fluid/operators/math/jit_code.cc | 223 +++-------------------- paddle/fluid/operators/math/jit_code.h | 132 +++++++++++++- paddle/fluid/operators/math/jit_kernel.h | 1 + 3 files changed, 153 insertions(+), 203 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 2a10cd7821..fd18256b0c 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/jit_code.h" -#include "paddle/fluid/operators/math/jit_kernel.h" -#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { namespace operators { @@ -111,60 +110,26 @@ void VXXJitCode::generate() { ret(); } -#define ALIGN32 __attribute__((aligned(32))) -#define EXP_HIG 88.3762626647949f -#define EXP_LOW -88.3762626647949f -#define CEPHES_LOG2EF 1.44269504088896341 -#define CEPHES_EXP_C1 0.693359375 -#define CEPHES_EXP_C2 -2.12194440e-4 -#define CEPHES_EXP_P0 1.9875691500E-4 -#define CEPHES_EXP_P1 1.3981999507E-3 -#define CEPHES_EXP_P2 8.3334519073E-3 -#define CEPHES_EXP_P3 4.1665795894E-2 -#define CEPHES_EXP_P4 1.6666665459E-1 -#define CEPHES_EXP_P5 5.0000001201E-1 +const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; -#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val - -#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) - -static const float exp_float_consts[] ALIGN32 = { - REPEAT_8TIMES(1.f), - REPEAT_8TIMES(2.f), - REPEAT_8TIMES(0.5f), - REPEAT_8TIMES(EXP_HIG), - REPEAT_8TIMES(EXP_LOW), - REPEAT_8TIMES(CEPHES_LOG2EF), - REPEAT_8TIMES(CEPHES_EXP_C1), - REPEAT_8TIMES(CEPHES_EXP_C2), - REPEAT_8TIMES(CEPHES_EXP_P0), - REPEAT_8TIMES(CEPHES_EXP_P1), - REPEAT_8TIMES(CEPHES_EXP_P2), - REPEAT_8TIMES(CEPHES_EXP_P3), - REPEAT_8TIMES(CEPHES_EXP_P4), - REPEAT_8TIMES(CEPHES_EXP_P5), - REPEAT_8TIMES(EXP_MAX_INPUT), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; - -static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; -static int g_tmp_mem[16] ALIGN32 = {0}; +const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; +int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { bool ok = MayIUse(avx); @@ -177,146 +142,6 @@ bool VActJitCode::init(int d, operand_type type) { } } -void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - assert(ymm_src.getIdx() != ymm_dst.getIdx()); // 
TODO(TJ): use enfore - // check all idx can not equal - ymm_t ymm_fx = ymm_t(fx_idx); - ymm_t ymm_fy = ymm_t(fy_idx); - ymm_t ymm_mask = ymm_t(mask_idx); - ymm_t ymm_tmp = ymm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(ymm_fx, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(ymm_fx, ymm_fx, ymm_tmp); - vroundps(ymm_fy, ymm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(ymm_mask, ymm_fy, ymm_fx); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vandps(ymm_mask, ymm_mask, ymm_tmp); - vsubps(ymm_fx, ymm_fy, ymm_mask); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); - vmulps(ymm_fy, ymm_fx, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - ymm_t ymm_z = ymm_t(ymm_mask.getIdx()); - vmulps(ymm_z, ymm_fx, ymm_tmp); - vsubps(ymm_src, ymm_src, ymm_fy); - vsubps(ymm_src, ymm_src, ymm_z); - vmulps(ymm_z, ymm_src, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(ymm_dst, ymm_src, ymm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_src); - } - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_z); - vaddps(ymm_dst, ymm_dst, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - // build 2^n - ymm_t ymm_int = ymm_fx; - vcvttps2dq(ymm_int, ymm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(ymm_tmp, ptr[reg_ptr_global]); - if (MayIUse(avx2)) { - vpaddd(ymm_int, ymm_int, ymm_tmp); - vpslld(ymm_int, ymm_int, 23); - } else if (MayIUse(avx)) { - xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); - xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx()); - reg64_t reg_ptr_tmp = reg_ptr_global; - mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); - vmovdqa(ptr[reg_ptr_tmp], ymm_int); - vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp], xtmp1); - // next 128bits - vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]); - vmovdqa(xtmp2, - ptr[reg_ptr_tmp + - (YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1); - // load out - vmovdqa(ymm_int, ptr[reg_ptr_tmp]); - } - vmulps(ymm_dst, ymm_dst, ymm_int); - pop(reg_ptr_global); -} - -void VActJitCode::exp_xmm(xmm_t& ymm_dst, xmm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore - // check all idx can not equal - xmm_t ymm_fx = xmm_t(fx_idx); - xmm_t ymm_fy = xmm_t(fy_idx); - xmm_t ymm_mask = xmm_t(mask_idx); - xmm_t ymm_tmp = xmm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global 
+ OFFSET_EXP_LOW]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(ymm_fx, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(ymm_fx, ymm_fx, ymm_tmp); - vroundps(ymm_fy, ymm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(ymm_mask, ymm_fy, ymm_fx); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vandps(ymm_mask, ymm_mask, ymm_tmp); - vsubps(ymm_fx, ymm_fy, ymm_mask); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); - vmulps(ymm_fy, ymm_fx, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - xmm_t ymm_z = xmm_t(ymm_mask.getIdx()); - vmulps(ymm_z, ymm_fx, ymm_tmp); - vsubps(ymm_src, ymm_src, ymm_fy); - vsubps(ymm_src, ymm_src, ymm_z); - vmulps(ymm_z, ymm_src, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(ymm_dst, ymm_src, ymm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_src); - } - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_z); - vaddps(ymm_dst, ymm_dst, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - // build 2^n - xmm_t ymm_int = ymm_fx; - vcvttps2dq(ymm_int, ymm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(ymm_tmp, ptr[reg_ptr_global]); - vpaddd(ymm_int, ymm_int, ymm_tmp); - vpslld(ymm_int, ymm_int, 23); - vmulps(ymm_dst, ymm_dst, ymm_int); - pop(reg_ptr_global); -} - void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { // y = 1 / (1 + e^-x) @@ -330,7 +155,7 @@ void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, vmaxps(ymm_src, ymm_src, ymm_tmp); vxorps(ymm_tmp, ymm_tmp, ymm_tmp); vsubps(ymm_src, ymm_tmp, ymm_src); - exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(ymm_dst, ymm_dst, ymm_tmp); vdivps(ymm_dst, ymm_tmp, ymm_dst); @@ -349,7 +174,7 @@ void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, vxorps(ymm_zero, ymm_zero, ymm_zero); vsubps(ymm_tmp, ymm_zero, ymm_tmp); vmulps(ymm_src, ymm_src, ymm_tmp); - exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(ymm_dst, ymm_dst, ymm_tmp); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -373,7 +198,7 @@ void VActJitCode::generate() { relu_jmm(ymm_dst, ymm_src, ymm_zero); break; case operand_type::exp: - exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::sigmoid: sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); @@ -409,7 +234,7 @@ void VActJitCode::generate() { relu_jmm(xmm_dst, xmm_src, xmm_zero); break; case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); break; default: break; diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 6adeebca7c..534398f4a4 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/platform/cpu_info.h" + namespace paddle { namespace operators { namespace math { @@ -40,6 +42,51 @@ typedef enum { identity } operand_type; +extern const float exp_float_consts[]; +extern const int exp_int_0x7f[]; +extern int g_tmp_mem[]; + +// TODO(TJ): move these to some proper place +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +#define ALIGN32 __attribute__((aligned(32))) +#define EXP_HIG 88.3762626647949f +#define EXP_LOW -88.3762626647949f +#define CEPHES_LOG2EF 1.44269504088896341 +#define CEPHES_EXP_C1 0.693359375 +#define CEPHES_EXP_C2 -2.12194440e-4 +#define CEPHES_EXP_P0 1.9875691500E-4 +#define CEPHES_EXP_P1 1.3981999507E-3 +#define CEPHES_EXP_P2 8.3334519073E-3 +#define CEPHES_EXP_P3 4.1665795894E-2 +#define CEPHES_EXP_P4 1.6666665459E-1 +#define CEPHES_EXP_P5 5.0000001201E-1 + +#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val + +#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) + // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { public: @@ -134,10 +181,87 @@ class VActJitCode : public JitCode { } // compute exp with ymm, xmm - void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); - void exp_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + template + void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT + int mask_idx = 4, int tmp_idx = 5) { + using namespace platform::jit; // NOLINT + assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enfore + // check all idx can not equal + JMM jmm_fx = JMM(fx_idx); + JMM jmm_fy = JMM(fy_idx); + JMM jmm_mask = JMM(mask_idx); + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(src, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(src, src, jmm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(jmm_fx, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(jmm_fx, jmm_fx, jmm_tmp); + vroundps(jmm_fy, jmm_fx, 0x01); + // if 
greater, substract 1 + vcmpgtps(jmm_mask, jmm_fy, jmm_fx); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vandps(jmm_mask, jmm_mask, jmm_tmp); + vsubps(jmm_fx, jmm_fy, jmm_mask); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(jmm_fy, jmm_fx, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + JMM ymm_z = JMM(jmm_mask.getIdx()); + vmulps(ymm_z, jmm_fx, jmm_tmp); + vsubps(src, src, jmm_fy); + vsubps(src, src, ymm_z); + vmulps(ymm_z, src, src); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(dst, src, jmm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, src); + } + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, ymm_z); + vaddps(dst, dst, src); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vaddps(dst, dst, jmm_tmp); + // build 2^n + JMM ymm_int = jmm_fx; + vcvttps2dq(ymm_int, jmm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(jmm_tmp, ptr[reg_ptr_global]); + if (MayIUse(avx2) || std::is_same::value) { + vpaddd(ymm_int, ymm_int, jmm_tmp); + vpslld(ymm_int, ymm_int, 23); + } else if (MayIUse(avx)) { + xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); + xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); + reg64_t reg_ptr_tmp = reg_ptr_global; + mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); + vmovdqa(ptr[reg_ptr_tmp], ymm_int); + vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp], xtmp1); + // next 128bits + vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); + vmovdqa(xtmp2, ptr[reg_ptr_tmp + + (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); + // load out + vmovdqa(ymm_int, ptr[reg_ptr_tmp]); + } + vmulps(dst, dst, ymm_int); + pop(reg_ptr_global); + } // compute sigmoid with ymm void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 4d8d3cd79a..117baaee2b 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -26,6 +26,7 @@ namespace operators { namespace math { namespace jitkernel { +// TODO(TJ): move these to some proper place #define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 From 4dbdfa60ef6d13568880fb2de5ee31a469080ab7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 17:29:36 +0000 Subject: [PATCH 07/16] sigmoid and tanh support all size test=develop --- paddle/fluid/operators/math/jit_code.cc | 67 ++++--------------------- paddle/fluid/operators/math/jit_code.h | 50 +++++++++++++++--- 2 files changed, 54 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index fd18256b0c..a080079a2d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -132,56 +132,8 @@ const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { - bool ok = MayIUse(avx); - if (type == operand_type::relu || type == operand_type::exp) { - // TODO(TJ): implement avx512, avx_exp is 
slower than mkl when d >= 256 - return ok; - } else { - // TODO(TJ): support more - return ok && d % 8 == 0; - } -} - -void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - // y = 1 / (1 + e^-x) - ymm_t ymm_tmp = ymm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - vxorps(ymm_tmp, ymm_tmp, ymm_tmp); - vsubps(ymm_src, ymm_tmp, ymm_src); - exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vdivps(ymm_dst, ymm_tmp, ymm_dst); - pop(reg_ptr_global); -} - -void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - // y = 2 / (1 + e^(-2x)) - 1 - ymm_t ymm_tmp = ymm_t(tmp_idx); - ymm_t ymm_zero = ymm_t(mask_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vxorps(ymm_zero, ymm_zero, ymm_zero); - vsubps(ymm_tmp, ymm_zero, ymm_tmp); - vmulps(ymm_src, ymm_src, ymm_tmp); - exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vdivps(ymm_dst, ymm_tmp, ymm_dst); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vsubps(ymm_dst, ymm_dst, ymm_tmp); - pop(reg_ptr_global); + // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256 + return MayIUse(avx); } void VActJitCode::generate() { @@ -201,10 +153,10 @@ void VActJitCode::generate() { exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::sigmoid: - sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::tanh: - tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::identity: break; @@ -214,11 +166,6 @@ void VActJitCode::generate() { vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } - if (type_ != operand_type::relu && type_ != operand_type::exp) { - // TODO(TJ): remove me - ret(); - return; - } int rest = num_ % YMM_FLOAT_BLOCK; int block = XMM_FLOAT_BLOCK; while (rest > 0) { @@ -236,6 +183,12 @@ void VActJitCode::generate() { case operand_type::exp: exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); break; + case operand_type::sigmoid: + sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; default: break; } diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 534398f4a4..65f83ff484 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -263,13 +263,51 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } - // compute sigmoid with ymm - void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + // compute sigmoid with ymm, xmm + template + void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) 
{ + // y = 1 / (1 + e^-x) + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); + vminps(src, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); + vmaxps(src, src, jmm_tmp); + vxorps(jmm_tmp, jmm_tmp, jmm_tmp); + vsubps(src, jmm_tmp, src); + exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vdivps(dst, jmm_tmp, dst); + pop(reg_ptr_global); + } - // compute tanh with ymm - void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + // compute tanh with ymm, xmm + template + void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT + int mask_idx = 4, int tmp_idx = 5) { + // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_zero = JMM(mask_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vxorps(jmm_zero, jmm_zero, jmm_zero); + vsubps(jmm_tmp, jmm_zero, jmm_tmp); + vmulps(src, src, jmm_tmp); + exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vdivps(dst, jmm_tmp, dst); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vsubps(dst, dst, jmm_tmp); + pop(reg_ptr_global); + } protected: int num_; From be80bb4f28f4a50cfbc96edd790227f59273d20e Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 16 Nov 2018 20:01:56 +0100 Subject: [PATCH 08/16] - Fix to GPU test=develop --- paddle/fluid/operators/softmax_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 91829d5761..8eb5c7691e 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -36,7 +36,9 @@ class SoftmaxKernel : public framework::OpKernel { Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); #ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor()( + math::SoftmaxFunctor< + DeviceContext, T, + std::is_same::value>()( context.template device_context(), &X_2d, &Out_2d); #else math::SoftmaxFunctor()( From a19b3225a1da8c31fc996bace3ac09e6f5f177ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 17 Nov 2018 14:56:43 +0000 Subject: [PATCH 09/16] fix jitcode small size test=develop --- paddle/fluid/operators/math/jit_code.cc | 12 ++++++++---- paddle/fluid/operators/math/jit_kernel_test.cc | 10 +++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a080079a2d..e484e9a3c7 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -59,9 +59,10 @@ void VXXJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - int block = XMM_FLOAT_BLOCK; while (rest > 0) { + int block = XMM_FLOAT_BLOCK; if (rest >= 4) { + block = 4; if (scalar_index_ != 1) { vmovups(xmm_src1, ptr[param1 + offset]); } @@ -69,6 +70,7 @@ void VXXJitCode::generate() { vmovups(xmm_src2, ptr[param2 + offset]); } } else if (rest >= 2) { + block = 2; if (scalar_index_ != 1) { vmovq(xmm_src1, 
ptr[param1 + offset]); } @@ -76,6 +78,7 @@ void VXXJitCode::generate() { vmovq(xmm_src2, ptr[param2 + offset]); } } else { + block = 1; if (scalar_index_ != 1) { vmovss(xmm_src1, ptr[param1 + offset]); } @@ -105,7 +108,6 @@ void VXXJitCode::generate() { } offset += sizeof(float) * block; rest -= block; - block /= 2; } ret(); } @@ -167,13 +169,16 @@ void VActJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - int block = XMM_FLOAT_BLOCK; while (rest > 0) { + int block = XMM_FLOAT_BLOCK; if (rest >= 4) { + block = 4; vmovups(xmm_src, ptr[param1 + offset]); } else if (rest >= 2) { + block = 2; vmovq(xmm_src, ptr[param1 + offset]); } else { + block = 1; vmovss(xmm_src, ptr[param1 + offset]); } switch (type_) { @@ -201,7 +206,6 @@ void VActJitCode::generate() { } offset += sizeof(float) * block; rest -= block; - block /= 2; } ret(); } diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 932fa4c000..b6c62a2634 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -69,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { TEST(JitKernel, vrelu) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 256, 512}) { + for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -10.f, 1.f); @@ -159,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 12, 15, 16, 20, 30, 128, 256}) { + for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); @@ -234,7 +234,7 @@ void vsigmoid_better( TEST(JitKernel, vsigmoid) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); @@ -298,7 +298,7 @@ void vtanh_better( TEST(JitKernel, vtanh) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); @@ -389,7 +389,7 @@ void lstm_ctht_better( TEST(JitKernel, lstm) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { + for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) { int d4 = d * 4; int d3 = d * 3; std::vector x(d4), xref(d4); From 9b0eae3023e3faf6a40a69f5ff79bcc2303c674b Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Sun, 18 Nov 2018 13:27:17 +0100 Subject: [PATCH 10/16] - Removing partial specialization of sotmax for inference for GPU test=develop --- paddle/fluid/operators/math/softmax.h | 3 ++- paddle/fluid/operators/math/softmax_impl.h | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index bf698dc2f7..089458e957 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -19,7 +19,8 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: void 
operator()(const DeviceContext& context, const framework::Tensor* X, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index e09a243347..0f3e5b2008 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -33,8 +33,8 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( +template +void SoftmaxFunctor::operator()( const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); @@ -66,8 +66,12 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)); } +template +using enable_if_CPU = typename std::enable_if< + std::is_same::value>::type; + template -class SoftmaxFunctor { +class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); From be50670348a23b35172e2420baeb058321ab3e13 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:24:00 +0800 Subject: [PATCH 11/16] Remove the remnant code (test=develop) --- paddle/fluid/operators/stack_op.h | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index f1692ae956..56a12852a9 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -72,25 +72,6 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -struct StackFunctor { - HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) - : x_(x), y_(y), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - y_[idx] = x_[which_x][x_index]; - } - - private: - VecXType x_; - T *y_; - int n_; - int post_; -}; - template struct StackGradFunctor { HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) @@ -110,14 +91,6 @@ struct StackGradFunctor { int post_; }; -template -static inline void StackFunctorForRange(const DeviceContext &ctx, - const VecXType &x, T *y, int total_num, - int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackFunctor(x, y, n, post)); -} - template static inline void StackGradFunctorForRange(const DeviceContext &ctx, const VecDxType &dx, const T *dy, From d91740acb1e49e4baaad02aeda379f27f6ec0f69 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:25:48 +0800 Subject: [PATCH 12/16] Revert "Remove the remnant code (test=develop)" This reverts commit be50670348a23b35172e2420baeb058321ab3e13. 
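The revert restores StackFunctor and StackFunctorForRange: the std::memcpy loop from PATCH 01 is only valid for host pointers, so the CUDA (__NVCC__) path still needs the functor-based copy, which PATCH 13 below reinstates behind the NVCC macro. Both paths realize the same index mapping; a minimal CPU sketch of that mapping, assuming n inputs of pre * post elements each (the function name and layout here are illustrative, not Paddle's actual code):

#include <vector>

// Reference semantics of the stack copy: slice i of input j lands at
// output offset (i * n + j) * post. Illustrative sketch only.
template <typename T>
void StackCopyRef(const std::vector<const T*>& x, T* y, int pre, int n,
                  int post) {
  for (int i = 0; i < pre; ++i) {        // slices before the stack axis
    for (int j = 0; j < n; ++j) {        // which input tensor
      for (int k = 0; k < post; ++k) {   // contiguous tail elements
        y[(i * n + j) * post + k] = x[j][i * post + k];
      }
    }
  }
}

This matches the memcpy loop above (each inner memcpy copies one post-sized row) and the functor's idx -> (which_x, x_index) decomposition.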
--- paddle/fluid/operators/stack_op.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index 56a12852a9..f1692ae956 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -72,6 +72,25 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { } }; +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + template struct StackGradFunctor { HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) @@ -91,6 +110,14 @@ struct StackGradFunctor { int post_; }; +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, T *y, int total_num, + int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + template static inline void StackGradFunctorForRange(const DeviceContext &ctx, const VecDxType &dx, const T *dy, From a906a361be831b9b425a9f197036fef506020857 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:30:27 +0800 Subject: [PATCH 13/16] Add the macro for NVCC (test=develop) --- paddle/fluid/operators/stack_op.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index f1692ae956..3d132e4397 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -149,11 +149,20 @@ class StackKernel : public framework::OpKernel { for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; #ifdef __NVCC__ + int total_num = pre * n * post; + auto &dev_ctx = ctx.template device_context(); + thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); + + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); #else auto x_data_arr = x_datas.data(); -#endif + size_t x_offset = 0; size_t y_offset = 0; for (int i = 0; i < pre; i++) { @@ -164,10 +173,6 @@ class StackKernel : public framework::OpKernel { } x_offset += post; } -#ifdef __NVCC__ - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); #endif } }; From a94a7355f0014337006ea8bb04bb2c30c955f7ea Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 20 Nov 2018 10:01:51 +0800 Subject: [PATCH 14/16] Refine the GraphNum check (#14144) * refine GraphCheck test=develop * fix ci fail test=develop --- paddle/fluid/framework/ir/graph_helper.cc | 28 +++++++++++++++------ paddle/fluid/framework/parallel_executor.cc | 13 ++++++++-- python/paddle/fluid/__init__.py | 3 ++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 98112c1ed3..963179192f 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -15,8 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_helper.h" #include #include +#include +#include +#include #include +DEFINE_string(print_sub_graph_dir, "", + "FLAGS_print_sub_graph_dir is used " + "to print the nodes of sub_graphs."); + namespace paddle { namespace framework { namespace ir { @@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) { graph_nodes.emplace_back(g_nodes); } - if (VLOG_IS_ON(100)) { - VLOG(100) << "graph_num: " << graph_nodes.size(); - for (auto &g_n : graph_nodes) { - VLOG(100) << "graph_nodes: " << g_n.size(); - if (g_n.size() < 10) { - std::stringstream out; + if (FLAGS_print_sub_graph_dir.size()) { + if (graph_nodes.size() > 1) { + std::stringstream out; + for (auto &g_n : graph_nodes) { + out << "graph_nodes: " << g_n.size() << "\n"; + } + out << "\n\n"; + for (auto &g_n : graph_nodes) { + out << "graph_nodes: " << g_n.size(); for (auto &node : g_n) { out << "\nNode: " << node->Name() << " in ["; for (auto &n : node->inputs) { @@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) { } out << "]"; } - VLOG(100) << out.str(); + out << "\n\n\n"; } + std::unique_ptr fout( + new std::ofstream(FLAGS_print_sub_graph_dir)); + PADDLE_ENFORCE(fout->good()); + *fout << out.str(); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 39b47415ff..2c6e337568 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor( } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, - "The number of graph should be only one"); + size_t graph_num = ir::GraphNum(*graph); + if (graph_num > 1) { + LOG(WARNING) + << "The number of graph should be only one, " + "but the current graph has " + << ir::GraphNum(*graph) + << " sub_graphs. If you want to see the nodes of the " + "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " + "to specify the output dir. NOTES: if you not do training, " + "please don't pass loss_var_name."; + } } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b991974928..f2f49f813a 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,7 +116,8 @@ def __bootstrap__(): 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', - 'allocator_strategy', 'reader_queue_speed_test_mode' + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') From bb2b35c85ebe726fa6baa94f466f65a71b21394e Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 19 Nov 2018 21:11:12 +0800 Subject: [PATCH 15/16] Add python example for resize_nearest. test=develop --- python/paddle/fluid/layers/nn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index af96f5de4f..91599b156d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5788,7 +5788,7 @@ def image_resize(input, Examples: .. 
code-block:: python - out = fluid.layers.image_resize(input, out_shape=[12, 12]) + out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST") """ resample_methods = { 'BILINEAR': 'bilinear', @@ -5891,6 +5891,11 @@ def resize_bilinear(input, Returns: ${out_comment}. + + Examples: + .. code-block:: python + + out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) @@ -5937,6 +5942,11 @@ def resize_nearest(input, Returns: ${out_comment}. + + Examples: + .. code-block:: python + + out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) """ return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) From 8bc1c5d2abb260ab4c20e009ceacb8508b8ae59d Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 20 Nov 2018 11:10:38 +0800 Subject: [PATCH 16/16] Implement the Tensorrt plugin for elementwise op (#14487) * Initialize the elementwise plugin. * Implement the basic CUDA kernel of elementwise plugin. test=develop --- .../ir_passes/tensorrt_subgraph_pass.cc | 2 +- .../passes/ir_analysis_compose_pass.cc | 3 +- .../inference/tensorrt/convert/CMakeLists.txt | 13 +- .../tensorrt/convert/elementwise_op.cc | 70 ++++++--- .../inference/tensorrt/convert/op_converter.h | 2 +- .../inference/tensorrt/convert/prelu_op.cc | 2 +- .../inference/tensorrt/convert/split_op.cc | 2 +- .../tensorrt/convert/test_elementwise_op.cc | 78 +++++++--- .../inference/tensorrt/convert/test_mul_op.cc | 18 +-- .../inference/tensorrt/convert/ut_helper.h | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 5 +- paddle/fluid/inference/tensorrt/engine.h | 4 +- .../inference/tensorrt/plugin/CMakeLists.txt | 4 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 138 ++++++++++++++++++ .../tensorrt/plugin/elementwise_op_plugin.h | 87 +++++++++++ .../tensorrt/plugin/prelu_op_plugin.cu | 2 + .../tensorrt/plugin/prelu_op_plugin.h | 2 + .../inference/tensorrt/plugin/serialize.h | 32 +++- .../tensorrt/plugin/split_op_plugin.cu | 25 ++-- .../tensorrt/plugin/split_op_plugin.h | 73 +++++---- .../inference/tensorrt/plugin/trt_plugin.cc | 28 ++-- .../inference/tensorrt/plugin/trt_plugin.h | 72 ++++++--- .../fluid/inference/tests/api/tester_helper.h | 2 +- 23 files changed, 500 insertions(+), 166 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 21fd8d2df4..c6b7c05f78 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // it is either an OP's input or an OP's output. 
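  // Note (added for clarity, not in the original patch): block_desc's ops
  // and the subgraph node list are built in the same order; the loop below
  // assumes the two are index-aligned and asserts it via the name check.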
auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); index++) { + for (size_t index = 0; index < block_desc.OpSize(); ++index) { framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); auto correspond_node = subgraph_nodes[index]; PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 38e9b1c5e7..267737e95c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { std::unordered_set teller_set( {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"}); + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose"}); if (!node->IsOp()) return false; if (teller_set.count(node->Op()->Type())) { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 85ad5ffe78..8dd6e8453f 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,9 +1,9 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc -pad_op.cc split_op.cc prelu_op.cc - DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc + batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc + pad_op.cc split_op.cc prelu_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) @@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL) nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine elementwise_add_op SERIAL) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin + elementwise_add_op elementwise_mul_op SERIAL) nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL) nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc @@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL) nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin - split_op concat_op SERIAL) + split_op concat_op SERIAL) nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin prelu_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc 
index 1af091fabd..6975086193 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,11 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +static bool CheckDims(const nvinfer1::Dims& dims_x, + const nvinfer1::Dims& dims_y) { + if (dims_x.nbDims != dims_y.nbDims) { + return false; + } + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) { + return false; + } + } + return true; +} + class ElementwiseWeightOpConverter : public OpConverter { public: ElementwiseWeightOpConverter() {} @@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter { ElementwiseTensorOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { + auto op_pair = ops.find(op_type_); + PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!"); + // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
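    // Summary (added for clarity, not in the original patch): when X and Y
    // have identical dimensions the converter emits TensorRT's native
    // IElementWiseLayer; otherwise it falls back to ElementWisePlugin,
    // which broadcasts Y across X starting at `axis`.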
framework::OpDesc op_desc(op, nullptr); - VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter { nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_y = Y->getDimensions(); - // The two input tensor should have the same dims - PADDLE_ENFORCE(dims_x.nbDims >= 3); - if (dims_x.nbDims == dims_y.nbDims) { - for (int i = 0; i < dims_x.nbDims; i++) { - if (dims_x.d[i] != dims_y.d[i]) - PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); - } - } else { - PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); - } + int axis = boost::get(op_desc.GetAttr("axis")); + auto output_name = op_desc.Output("Out")[0]; + if (CheckDims(dims_x, dims_y)) { + // The two input tensor should have the same dims + VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - PADDLE_THROW("Wrong elementwise op type!"); - } - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( - engine_, ElementWise, *const_cast(X), - *const_cast(Y), op_pair->second); + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); - auto output_name = op_desc.Output("Out")[0]; - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + } else { + VLOG(3) << "Convert a fluid elementwise op to TensorRT " + "ElementWisePluginLayer"; + + plugin::ElementWisePlugin* plugin = + new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); + plugin->AddInput(X); + plugin->AddInput(Y); + nvinfer1::IPluginLayer* layer = engine_->AddPlugin( + const_cast(plugin->GetInputs().data()), 2, + reinterpret_cast(plugin)); + + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + } if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d309d94c56..d61d635ed7 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -61,7 +61,7 @@ class OpConverter { // TODO(xingzhaolong): all mul, sub, div // static std::unordered_set add_weight_op_set {"add", "mul", // "sub", "div"}; - static std::unordered_set add_weight_op_set{"add"}; + static std::unordered_set add_weight_op_set{"add", "mul"}; PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); int op_type_len = op_desc.Type().size(); std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 337885e6ba..dbdff85dde 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter { TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, static_cast(alpha_data), alpha_tensor_device->numel()); - PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode); + plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 159854ab59..6620c76318 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter { PADDLE_ENFORCE(output_lengths.size() == output_num); // - SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); + plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 7537d02a35..cc967464a5 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -20,13 +20,12 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(elementwise_op, add_weight_test) { +TEST(elementwise_op, add_weight) { std::unordered_set parameters({"elementwise_add-Y"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1 << 15); validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); // Prepare Op description @@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) { validator.Execute(8); } -TEST(elementwise_op, add_tensor_test) { - std::unordered_set parameters; - framework::Scope scope; - TRTConvertValidation validator(8, parameters, scope, 1 << 15); - validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); - validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); - validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); - - // 
Prepare Op description - framework::OpDesc desc; - desc.SetType("elementwise_add"); - desc.SetInput("X", {"elementwise_add-X"}); - desc.SetInput("Y", {"elementwise_add-Y"}); - desc.SetOutput("Out", {"elementwise_add-Out"}); - - // the defalut axis of elementwise op is -1 - - validator.SetOp(*desc.Proto()); +TEST(elementwise_op, native) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 3, 3)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } +} - validator.Execute(8); +TEST(elementwise_op, plugin) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } } } // namespace tensorrt } // namespace inference } // namespace paddle + USE_OP(elementwise_add); +USE_OP(elementwise_mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 3d34cd7d5d..282f53559a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 0a6f171fc4..f313beb73b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 208bd12b83..f739752cbc 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() { } nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( - nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRT *plugin) { owned_plugin_.emplace_back(plugin); - return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); + return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 99420f19ba..f5b2c28ba9 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase { int GetRuntimeBatch(); int GetDevice() { return device_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, - int nbInputs, PluginTensorRT*); + int num_inputs, plugin::PluginTensorRT*); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase { // The specific GPU id that the TensorRTEngine bounded to. int device_; - std::vector> owned_plugin_; + std::vector> owned_plugin_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b6811f9183..4090269499 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1,3 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) +nv_library(tensorrt_plugin + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu + DEPS enforce device_context) diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu new file mode 100644 index 0000000000..9cd9026b73 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +namespace details { + +template +struct Add { + __device__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct Mul { + __device__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out, + int batch_size, int num_rows, int num_cols) { + for (int batch_id = 0; batch_id < batch_size; ++batch_id) { + int row = blockIdx.x; + for (; row < num_rows; row += gridDim.x) { + T value_y = y[batch_id * num_rows + row]; + int col = threadIdx.x; + int offset = (batch_id * num_rows + row) * num_cols; + for (; col < num_cols; col += blockDim.x) { + T value_x = x[offset + col]; + out[offset + col] = op(value_x, value_y); + } + } + } +} + +template +static void ElementWise(Operator op, const T* x, const T* y, T* out, + int batch_size, int prev, int midd, int post, + cudaStream_t stream) { + const int kThreadsPerBlock = 1024; + const int kMaximumBlocks = 65535; + if (prev == 1) { + int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock + : (((post + 31) >> 5) << 5); + int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks; + ColumnWiseKernel<<>>( + op, x, y, out, batch_size, midd, post); + } else if (post == 1) { + PADDLE_THROW("Not implemented."); + } else { + PADDLE_THROW("Not implemented."); + } +} + +} // namespace details + +nvinfer1::Dims ElementWisePlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(index, 0); + PADDLE_ENFORCE_EQ(num_inputs, 2); + PADDLE_ENFORCE_NOT_NULL(input_dims); + return input_dims[0]; +} + +int ElementWisePlugin::initialize() { + PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); + + axis_ = (axis_ == -1) ? 
dims_x_.nbDims - dims_y_.nbDims : axis_; + int trimed_nb_dims = dims_y_.nbDims; + for (; trimed_nb_dims > 0; --trimed_nb_dims) { + if (dims_y_.d[trimed_nb_dims - 1] != 1) { + break; + } + } + dims_y_.nbDims = trimed_nb_dims; + + PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); + PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); + + prev_size_ = 1; + midd_size_ = 1; + post_size_ = 1; + for (int i = 0; i < axis_; ++i) { + prev_size_ *= dims_x_.d[i]; + } + + for (int i = 0; i < dims_y_.nbDims; ++i) { + PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], + "Broadcast dimension mismatch."); + midd_size_ *= dims_y_.d[i]; + } + + for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) { + post_size_ *= dims_x_.d[i]; + } + return 0; +} + +int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const float* x = reinterpret_cast(inputs[0]); + const float* y = reinterpret_cast(inputs[1]); + float* out = reinterpret_cast(outputs[0]); + + if (type_ == nvinfer1::ElementWiseOperation::kSUM) { + details::ElementWise(details::Add(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { + details::ElementWise(details::Mul(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else { + PADDLE_THROW("Not implemented."); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h new file mode 100644 index 0000000000..9c461f7a5c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class ElementWisePlugin : public PluginTensorRT { + public: + ElementWisePlugin(nvinfer1::ElementWiseOperation type, + nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, + int axis) + : type_(type), + dims_x_(dims_x), + dims_y_(dims_y), + axis_(axis), + prev_size_(1), + midd_size_(1), + post_size_(1) {} + + ElementWisePlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &dims_x_); + DeserializeValue(&serial_data, &serial_length, &dims_y_); + } + + ElementWisePlugin *clone() const override { + // return new ElementWisePlugin(dims_x_, dims_y_, axis_); + return nullptr; + } + + const char *getPluginType() const override { return "elementwise"; } + + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + + // execute the layer + int enqueue(int batch_size, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream); + + protected: + size_t getSerializationSize() override { + return SerializedSize(axis_) + SerializedSize(dims_x_) + + SerializedSize(dims_y_) + getBaseSerializationSize(); + } + + void serialize(void *buffer) override { + serializeBase(buffer); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, dims_x_); + SerializeValue(&buffer, dims_y_); + } + + nvinfer1::ElementWiseOperation type_; + nvinfer1::Dims dims_x_; + nvinfer1::Dims dims_y_; + int axis_; + int prev_size_; + int midd_size_; + int post_size_; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 0f1ca11295..e8f4254402 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -20,6 +20,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { static const int CUDA_NUM_THREADS = 1024; static const int CUDA_MAX_NUM_BLOCKS = 65535; @@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index aa0f865c89..0db56a310b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -21,6 +21,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class PReluPlugin : public PluginTensorRT { TensorRTEngine::Weight alpha_; @@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT { void *workspace, cudaStream_t stream) override; }; +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h index 50c0b17d78..ce859f16fc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/serialize.h +++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h 
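
The serialize.h hunk below supplies the helpers that ElementWisePlugin and SplitPlugin use in their serialize()/deserialize paths. A short round-trip sketch of the buffer protocol, assuming serialize.h is on the include path and that (after this patch) the helpers live in paddle::inference::tensorrt::plugin; the values are made up:

    #include <cstddef>
    #include <vector>
    #include "paddle/fluid/inference/tensorrt/plugin/serialize.h"

    using namespace paddle::inference::tensorrt::plugin;  // NOLINT

    int main() {
      int axis = 1;
      std::vector<int> lengths = {2, 3};

      // Values are packed back-to-back into one buffer.
      std::vector<char> buffer(SerializedSize(axis) + SerializedSize(lengths));
      void* wptr = buffer.data();
      SerializeValue(&wptr, axis);     // copies sizeof(int), advances wptr
      SerializeValue(&wptr, lengths);  // copies size(), then the payload

      // Deserialization walks the same buffer, shrinking the length counter.
      void const* rptr = buffer.data();
      size_t remaining = buffer.size();
      int axis2 = 0;
      std::vector<int> lengths2;
      DeserializeValue(&rptr, &remaining, &axis2);     // axis2 == 1
      DeserializeValue(&rptr, &remaining, &lengths2);  // lengths2 == {2, 3}
      return remaining == 0 ? 0 : 1;
    }
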
@@ -14,10 +14,15 @@ #pragma once -#include #include #include #include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { template inline void SerializeValue(void** buffer, T const& value); @@ -26,7 +31,7 @@ template inline void DeserializeValue(void const** buffer, size_t* buffer_size, T* value); -namespace { +namespace details { template struct Serializer {}; @@ -36,10 +41,12 @@ struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> { static size_t SerializedSize(T const& value) { return sizeof(T); } + static void Serialize(void** buffer, T const& value) { std::memcpy(*buffer, &value, sizeof(T)); reinterpret_cast(*buffer) += sizeof(T); } + static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { assert(*buffer_size >= sizeof(T)); std::memcpy(value, *buffer, sizeof(T)); @@ -51,10 +58,12 @@ struct Serializer::value || template <> struct Serializer { static size_t SerializedSize(const char* value) { return strlen(value) + 1; } + static void Serialize(void** buffer, const char* value) { - std::strcpy(static_cast(*buffer), value); + std::strcpy(static_cast(*buffer), value); // NOLINT reinterpret_cast(*buffer) += strlen(value) + 1; } + static void Deserialize(void const** buffer, size_t* buffer_size, const char** value) { *value = static_cast(*buffer); @@ -73,39 +82,46 @@ struct Serializer, static size_t SerializedSize(std::vector const& value) { return sizeof(value.size()) + value.size() * sizeof(T); } + static void Serialize(void** buffer, std::vector const& value) { SerializeValue(buffer, value.size()); size_t nbyte = value.size() * sizeof(T); std::memcpy(*buffer, value.data(), nbyte); reinterpret_cast(*buffer) += nbyte; } + static void Deserialize(void const** buffer, size_t* buffer_size, std::vector* value) { size_t size; DeserializeValue(buffer, buffer_size, &size); value->resize(size); size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); + PADDLE_ENFORCE_GE(*buffer_size, nbyte); std::memcpy(value->data(), *buffer, nbyte); reinterpret_cast(*buffer) += nbyte; *buffer_size -= nbyte; } }; -} // namespace +} // namespace details template inline size_t SerializedSize(T const& value) { - return Serializer::SerializedSize(value); + return details::Serializer::SerializedSize(value); } template inline void SerializeValue(void** buffer, T const& value) { - return Serializer::Serialize(buffer, value); + return details::Serializer::Serialize(buffer, value); } template inline void DeserializeValue(void const** buffer, size_t* buffer_size, T* value) { - return Serializer::Deserialize(buffer, buffer_size, value); + return details::Serializer::Deserialize(buffer, buffer_size, value); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index bd6a44dcc1..4adea2db1e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,26 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
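
In the split plugin hunks below, initialize() turns the per-output lengths into prefix-sum segment offsets, so output i owns the slice [offsets[i], offsets[i+1]) along axis_. A toy illustration of that bookkeeping with made-up lengths, standalone rather than plugin code:

    #include <cassert>
    #include <vector>

    int main() {
      // Hypothetical split of a size-10 axis into three outputs.
      std::vector<int> output_length = {2, 3, 5};
      std::vector<int> segment_offsets(1, 0);
      for (size_t i = 0; i < output_length.size(); ++i) {
        segment_offsets.push_back(segment_offsets.back() + output_length[i]);
      }
      // segment_offsets == {0, 2, 5, 10}; output 1 covers [2, 5).
      assert(segment_offsets.back() == 10);
      return 0;
    }
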
-#include -#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { -nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, - const nvinfer1::Dims* inputDims, - int nbInputs) { - assert(nbInputs == 1); - assert(index < this->getNbOutputs()); - nvinfer1::Dims const& input_dims = inputDims[0]; - nvinfer1::Dims output_dims = input_dims; +nvinfer1::Dims SplitPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(num_inputs, 1); + PADDLE_ENFORCE_LT(index, this->getNbOutputs()); + + nvinfer1::Dims output_dims = input_dims[0]; output_dims.d[axis_] = output_length_.at(index); return output_dims; } int SplitPlugin::initialize() { + PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); + std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { segment_offsets.push_back(segment_offsets.back() + output_length_[i]); @@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, return cudaGetLastError() != cudaSuccess; } -} // tensorrt -} // inference -} // paddle +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 7281e40c33..b5b6e69992 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,61 +14,58 @@ #pragma once +#include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class SplitPlugin : public PluginTensorRT { - int axis_; - std::vector output_length_; - int nx_, ny_, nz_; - std::vector segment_offsets_; + public: + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) {} + + SplitPlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &output_length_); + } + + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); + } + + const char *getPluginType() const override { return "split"; } + int getNbOutputs() const override { return output_length_.size(); } + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; protected: - virtual size_t getSerializationSize() override { + size_t getSerializationSize() override { return SerializedSize(axis_) + SerializedSize(output_length_) + getBaseSerializationSize(); } - // TRT will call this func when we need to serialize the configuration of - // tensorrt. - // It should not be called by users. - virtual void serialize(void *buffer) override { + void serialize(void *buffer) override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); } - public: - SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) { - assert(axis <= nvinfer1::Dims::MAX_DIMS); - } - - // It was used for tensorrt deserialization. - // It should not be called by users. 
- SplitPlugin(void const *serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); - DeserializeValue(&serialData, &serialLength, &axis_); - DeserializeValue(&serialData, &serialLength, &output_length_); - } - - SplitPlugin *clone() const override { - return new SplitPlugin(axis_, output_length_); - } - - virtual const char *getPluginType() const override { return "split"; } - virtual int getNbOutputs() const override { return output_length_.size(); } - virtual nvinfer1::Dims getOutputDimensions(int index, - const nvinfer1::Dims *inputs, - int nbInputDims) override; - virtual int initialize() override; - virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; + int axis_; + std::vector output_length_; + int nx_, ny_, nz_; + std::vector segment_offsets_; }; -} // tensorrt -} // inference -} // paddle +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 08016d84b1..b0f4cff3ac 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -17,6 +17,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, input_dims_); @@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, data_format_); } -void PluginTensorRT::deserializeBase(void const*& serialData, - size_t& serialLength) { - DeserializeValue(&serialData, &serialLength, &input_dims_); - DeserializeValue(&serialData, &serialLength, &max_batch_size_); - DeserializeValue(&serialData, &serialLength, &data_type_); - DeserializeValue(&serialData, &serialLength, &data_format_); +void PluginTensorRT::deserializeBase(void const*& serial_data, + size_t& serial_length) { + DeserializeValue(&serial_data, &serial_length, &input_dims_); + DeserializeValue(&serial_data, &serial_length, &max_batch_size_); + DeserializeValue(&serial_data, &serial_length, &data_type_); + DeserializeValue(&serial_data, &serial_length, &data_format_); } size_t PluginTensorRT::getBaseSerializationSize() { @@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, (format == nvinfer1::PluginFormat::kNCHW)); } -void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, - int nbInputs, - const nvinfer1::Dims* outputDims, - int nbOutputs, nvinfer1::DataType type, - nvinfer1::PluginFormat format, - int maxBatchSize) { +void PluginTensorRT::configureWithFormat( + const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, int max_batch_size) { data_type_ = type; data_format_ = format; - input_dims_.assign(inputDims, inputDims + nbInputs); - max_batch_size_ = maxBatchSize; + input_dims_.assign(input_dims, input_dims + num_inputs); + max_batch_size_ = max_batch_size; } +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 4d85e955a4..86084829e1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -14,23 +14,30 @@ #pragma once -#include +#include 
#include -#include #include #include -#include "NvInfer.h" #include "paddle/fluid/inference/tensorrt/plugin/serialize.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(profile); namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() {} + // It was used for TensorRT deserialization. + // It should not be called by users. PluginTensorRT(const void* serialized_data, size_t length) {} + virtual ~PluginTensorRT() {} + nvinfer1::Dims const& getInputDims(int index) const { return input_dims_.at(index); } @@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType getDataType() const { return data_type_; } nvinfer1::PluginFormat getDataFormat() const { return data_format_; } virtual const char* getPluginVersion() const { return "1"; } + + void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); } + std::vector& GetInputs() { return inputs_; } + + virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + + // Following functions are inherit from nvinfer1::IPluginExt + // Get the number of outputs from the layer + int getNbOutputs() const { return 1; } + // Get the dimension of an output tensor + virtual nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims* input_dims, + int num_inputs) = 0; + // Find the workspace size required by the layer size_t getWorkspaceSize(int) const override { return 0; } + + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + // Shutdown the layer. This is called when the engine is destroyed void terminate() override {} - virtual ~PluginTensorRT() {} + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() = 0; + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. + virtual void serialize(void* buffer) = 0; + // Check format support. The default is FLOAT32 and NCHW. bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; - void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, - const nvinfer1::Dims* outputDims, int nbOutputs, + // Configure the layer + void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, - int maxBatchSize) override; - - // *NOTE* The following functions need to be overrided in the subclass. - virtual nvinfer1::IPluginExt* clone() const = 0; - virtual const char* getPluginType() const = 0; - // Initialize the layer for execution. This is called when the engine is - // created. - int initialize() override { return 0; } - // Serialize the layer config to buffer. 
- virtual void serialize(void* buffer) = 0; - virtual size_t getSerializationSize() = 0; - virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, - void* workspace, cudaStream_t stream) = 0; + int max_batch_size) override; protected: // Deserialize input_dims, max_batch_size, data_type, data_format - void deserializeBase(void const*& serialData, size_t& serialLength); + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT size_t getBaseSerializationSize(); // Serialize input_dims, max_batch_size, data_type, data_format - void serializeBase(void*& buffer); + void serializeBase(void*& buffer); // NOLINT std::vector input_dims_; size_t max_batch_size_; nvinfer1::DataType data_type_; nvinfer1::PluginFormat data_format_; + + std::vector inputs_; }; +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index a404691413..e66ae28057 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { LOG(INFO) << *reinterpret_cast(config); return; } - LOG(INFO) << *config; + LOG(INFO) << *reinterpret_cast(config); } void CompareResult(const std::vector &outputs,
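
To make the elementwise plugin's shape bookkeeping concrete, here is the (prev, midd, post) decomposition from ElementWisePlugin::initialize() worked through on the shapes used by the plugin test above (per-sample X dims (10, 3, 3), Y dims (10, 1, 1), axis = -1). The snippet is a standalone sketch of the arithmetic, not engine code:

    #include <cassert>

    int main() {
      // axis == -1 resolves to nbDims(X) - nbDims(Y) = 3 - 3 = 0, then
      // Y's trailing 1s are trimmed: (10, 1, 1) -> (10), nbDims = 1.
      int x_dims[] = {10, 3, 3};
      int y_dims[] = {10};
      int axis = 0, x_nb = 3, y_nb = 1;

      int prev = 1, midd = 1, post = 1;
      for (int i = 0; i < axis; ++i) prev *= x_dims[i];
      for (int i = 0; i < y_nb; ++i) midd *= y_dims[i];  // X must match here
      for (int i = axis + y_nb; i < x_nb; ++i) post *= x_dims[i];

      // prev == 1, so enqueue() takes the ColumnWiseKernel path:
      // out[b][m][p] = op(x[b][m][p], y[b][m]) for m < midd, p < post.
      assert(prev == 1 && midd == 10 && post == 9);
      return 0;
    }
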