From 03ccb9a461db7650fd1dc749f2f61a4df253bf31 Mon Sep 17 00:00:00 2001
From: Yihua Xu
Date: Thu, 15 Nov 2018 16:07:16 +0800
Subject: [PATCH 01/16] Optimize the stack operator

---
 paddle/fluid/operators/stack_op.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h
index d236c5b943..f1692ae956 100644
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
@@ -147,16 +147,23 @@ class StackKernel : public framework::OpKernel<T> {
     auto &dim = x[0]->dims();
     for (auto i = 0; i < axis; ++i) pre *= dim[i];
     for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-    int total_num = pre * n * post;
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
 #ifdef __NVCC__
     thrust::device_vector<const T *> device_x_vec(x_datas);
     auto x_data_arr = device_x_vec.data().get();
 #else
     auto x_data_arr = x_datas.data();
 #endif
-    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
+    size_t x_offset = 0;
+    size_t y_offset = 0;
+    for (int i = 0; i < pre; i++) {
+      for (int j = 0; j < n; j++) {
+        std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
+                    post * sizeof(T));
+        y_offset += post;
+      }
+      x_offset += post;
+    }
 #ifdef __NVCC__
     // Wait() must be called because device_x_vec may be destructed before
     // kernel ends

From 513bb6c1513dde0e3b9e2b9da5acccd9649cda0d Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Thu, 8 Nov 2018 17:16:16 +0100
Subject: [PATCH 02/16] Squashing MKL based softmax for inference

test=develop

- Added profiling to softmax functors
- MKL based softmax inference op
- Fix to softmax computation via MKL
- cleaning
- Cosmetic fixes to softmax MKL
- Fix to ON_INFER lack of propagation
---
 CMakeLists.txt                             | 15 +++---
 paddle/fluid/operators/math/softmax_impl.h | 59 ++++++++++++----------
 paddle/fluid/operators/softmax_op.h        |  2 +-
 3 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cfec8e70b..c62cc9bfd7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
+if (ON_INFER)
+  message(STATUS "On inference mode, will take place some specific optimization.")
+  add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
@@ -312,10 +320,3 @@ if(WITH_DOC)
   find_python_module(recommonmark REQUIRED)
   add_subdirectory(doc)
 endif()
-
-if (ON_INFER)
-  message(STATUS "On inference mode, will take place some specific optimization.")
-else()
-  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 7cf98f2725..e09a243347 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -16,6 +16,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { namespace math { @@ -65,36 +66,42 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)); } -template -class SoftmaxFunctor { +template +class SoftmaxFunctor { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - + auto in_dims = X->dims(); + auto out_dims = Y->dims(); + const float* in_data = X->data(); + float* out_data = Y->data(); const int kBatchDim = 0; const int kClassDim = 1; - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); - - softmax.device(*context.eigen_device()) = shifted_logits.exp(); - softmax.device(*context.eigen_device()) = (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + // 2D data. Batch x C + const int batch_size = in_dims[kBatchDim]; + const int num_classes = in_dims[kClassDim]; + std::vector entities(batch_size); + auto blas = math::GetBlas(context); + for (int n = 0; n < batch_size; ++n) { + entities[n] = in_data[n * num_classes]; + for (int c = 1; c < num_classes; ++c) { + entities[n] = in_data[n * num_classes + c] > entities[n] + ? in_data[n * num_classes + c] + : entities[n]; + } + for (int c = 0; c < num_classes; ++c) { + out_data[n * num_classes + c] = + in_data[n * num_classes + c] - entities[n]; + } + } + + blas.VEXP(num_classes * batch_size, out_data, out_data); + for (int n = 0; n < batch_size; ++n) { + entities[n] = out_data[n * num_classes]; + for (int c = 1; c < num_classes; ++c) { + entities[n] += out_data[n * num_classes + c]; + } + blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]); + } } }; diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 2fea8a65bc..91829d5761 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -35,7 +35,7 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); -#ifdef ON_INFER +#ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); #else From ba3eaed7a7426a10f4a394071852c6f5d6ab8e1e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 09:13:34 +0000 Subject: [PATCH 03/16] exp support all size --- paddle/fluid/operators/math/jit_code.cc | 114 ++++++++++++++++-- paddle/fluid/operators/math/jit_code.h | 8 +- .../fluid/operators/math/jit_kernel_test.cc | 5 +- 3 files changed, 113 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e3b600d442..9efd4e8174 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -81,10 +81,10 @@ void VXXJitCode::generate() { } if (rest >= 2) { if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); + vmovq(xmm_src1, ptr[param1 + offset]); } if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + 
offset]); + vmovq(xmm_src2, ptr[param2 + offset]); } if (type_ == operand_type::mul) { vmulps(xmm_dst, xmm_src1, xmm_src2); @@ -100,10 +100,10 @@ void VXXJitCode::generate() { } if (rest > 0) { if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); + vmovss(xmm_src1, ptr[param1 + offset]); } if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); + vmovss(xmm_src2, ptr[param2 + offset]); } if (type_ == operand_type::mul) { vmulss(xmm_dst, xmm_src1, xmm_src2); @@ -179,7 +179,7 @@ bool VActJitCode::init(int d, operand_type type) { return ok; } else if (type == operand_type::exp) { // exp is slower than mkl when d >= 256 - return ok && d % 8 == 0 && d < 256; + return ok; //&& d % 4 == 0 && d < 256; } else { // TODO(TJ): support more return ok && d % 8 == 0; @@ -190,6 +190,10 @@ void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { vmaxps(ymm_dst, ymm_zero, ymm_src); } +void VActJitCode::relu_xmm(xmm_t& xmm_dst, xmm_t& xmm_src, xmm_t& xmm_zero) { + vmaxps(xmm_dst, xmm_zero, xmm_src); +} + void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore @@ -271,6 +275,65 @@ void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, pop(reg_ptr_global); } +void VActJitCode::exp_xmm(xmm_t& ymm_dst, xmm_t& ymm_src, int fx_idx, + int fy_idx, int mask_idx, int tmp_idx) { + assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore + // check all idx can not equal + xmm_t ymm_fx = xmm_t(fx_idx); + xmm_t ymm_fy = xmm_t(fy_idx); + xmm_t ymm_mask = xmm_t(mask_idx); + xmm_t ymm_tmp = xmm_t(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(ymm_src, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(ymm_src, ymm_src, ymm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(ymm_fx, ymm_src, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(ymm_fx, ymm_fx, ymm_tmp); + vroundps(ymm_fy, ymm_fx, 0x01); + // if greater, substract 1 + vcmpgtps(ymm_mask, ymm_fy, ymm_fx); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vandps(ymm_mask, ymm_mask, ymm_tmp); + vsubps(ymm_fx, ymm_fy, ymm_mask); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(ymm_fy, ymm_fx, ymm_tmp); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + xmm_t ymm_z = xmm_t(ymm_mask.getIdx()); + vmulps(ymm_z, ymm_fx, ymm_tmp); + vsubps(ymm_src, ymm_src, ymm_fy); + vsubps(ymm_src, ymm_src, ymm_z); + vmulps(ymm_z, ymm_src, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(ymm_dst, ymm_src, ymm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_src); + } + vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + vmulps(ymm_dst, ymm_dst, ymm_z); + vaddps(ymm_dst, ymm_dst, ymm_src); + vmovaps(ymm_tmp, ptr[reg_ptr_global]); + vaddps(ymm_dst, ymm_dst, ymm_tmp); + // build 2^n + xmm_t ymm_int = ymm_fx; + vcvttps2dq(ymm_int, ymm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(ymm_tmp, ptr[reg_ptr_global]); + vpaddd(ymm_int, ymm_int, ymm_tmp); 
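+  // At this point ymm_int holds n + 0x7f: n came from vcvttps2dq above and
+  // 0x7f (from exp_int_0x7f) is the IEEE-754 single-precision exponent
+  // bias. The shift below moves it into the exponent field, so each lane
+  // reinterpreted as a float equals 2^n, which then scales the Cephes
+  // polynomial result.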
+ vpslld(ymm_int, ymm_int, 23); + vmulps(ymm_dst, ymm_dst, ymm_int); + pop(reg_ptr_global); +} + void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { // y = 1 / (1 + e^-x) @@ -343,7 +406,7 @@ void VActJitCode::generate() { vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } - if (type_ != operand_type::relu) { + if (type_ != operand_type::relu && type_ != operand_type::exp) { // TODO(TJ): remove me ret(); return; @@ -351,21 +414,50 @@ void VActJitCode::generate() { int rest = num_ % YMM_FLOAT_BLOCK; if (rest >= 4) { vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovups(ptr[param2 + offset], xmm_dst); offset += sizeof(float) * 4; rest -= 4; } if (rest >= 2) { - vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovq(xmm_src, ptr[param1 + offset]); + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovq(ptr[param2 + offset], xmm_dst); offset += sizeof(float) * 2; rest -= 2; } if (rest > 0) { - vmovups(xmm_src, ptr[param1 + offset]); - vmaxps(xmm_dst, xmm_zero, xmm_src); + // vmovups(); + vmovss(xmm_src, ptr[param1 + offset]); + + switch (type_) { + case operand_type::relu: + relu_xmm(xmm_dst, xmm_src, xmm_zero); + break; + case operand_type::exp: + exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + default: + break; + } vmovss(ptr[param2 + offset], xmm_dst); } ret(); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 71205b211b..1467978f26 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -127,13 +127,17 @@ class VActJitCode : public JitCode { void generate() override; protected: - // compute relu with ymm + // compute relu with ymm, xmm void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, const Xbyak::Ymm& zero); + void relu_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, + const Xbyak::Xmm& zero); - // compute exp with ymm + // compute exp with ymm, xmm void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + void exp_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, int fx_idx = 2, + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); // compute sigmoid with ymm void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 5a6f87fe1f..178298bf56 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -33,6 +33,9 @@ limitations under the License. 
*/ constexpr int repeat = 20000; +// TODO(TJ): benchmark and test should be seperated, +// benchmark should verify more sizes + inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 128, 256}) { + for (int d : {7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); From 4e67fe6a122636bc84b2f8df6d5f94feb5ed1a78 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 10:09:40 +0000 Subject: [PATCH 04/16] refine act and vxx with all size --- paddle/fluid/operators/math/jit_code.cc | 147 ++++++++++-------------- 1 file changed, 60 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 9efd4e8174..a5eef019c8 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -60,60 +60,53 @@ void VXXJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - if (scalar_index_ != 1) { - vmovups(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovups(xmm_src2, ptr[param2 + offset]); - } - if (type_ == operand_type::mul) { - vmulps(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddps(xmm_dst, xmm_src1, xmm_src2); - } - if (with_relu_) { - vmaxps(xmm_dst, xmm_zero, xmm_dst); - } - vmovups(ptr[param3 + offset], xmm_dst); - offset += sizeof(float) * 4; - rest -= 4; - } - if (rest >= 2) { - if (scalar_index_ != 1) { - vmovq(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovq(xmm_src2, ptr[param2 + offset]); + int block = XMM_FLOAT_BLOCK; + while (rest > 0) { + if (rest >= 4) { + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } + } else if (rest >= 2) { + if (scalar_index_ != 1) { + vmovq(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovq(xmm_src2, ptr[param2 + offset]); + } + } else { + if (scalar_index_ != 1) { + vmovss(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovss(xmm_src2, ptr[param2 + offset]); + } } - if (type_ == operand_type::mul) { - vmulps(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddps(xmm_dst, xmm_src1, xmm_src2); + switch (type_) { + case operand_type::mul: + vmulps(xmm_dst, xmm_src1, xmm_src2); + break; + case operand_type::add: + vaddps(xmm_dst, xmm_src1, xmm_src2); + break; + default: + break; } if (with_relu_) { vmaxps(xmm_dst, xmm_zero, xmm_dst); } - vmovq(ptr[param3 + offset], xmm_dst); - offset += sizeof(float) * 2; - rest -= 2; - } - if (rest > 0) { - if (scalar_index_ != 1) { - vmovss(xmm_src1, ptr[param1 + offset]); - } - if (scalar_index_ != 2) { - vmovss(xmm_src2, ptr[param2 + offset]); - } - if (type_ == operand_type::mul) { - vmulss(xmm_dst, xmm_src1, xmm_src2); - } else if (type_ == operand_type::add) { - vaddss(xmm_dst, xmm_src1, xmm_src2); + if (rest >= 4) { + vmovups(ptr[param3 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param3 + offset], xmm_dst); + } else { + vmovss(ptr[param3 + offset], xmm_dst); } - if (with_relu_) { - vmaxps(xmm_dst, xmm_zero, xmm_dst); - } - vmovss(ptr[param3 + offset], xmm_dst); + offset += sizeof(float) * block; + rest -= block; + 
block /= 2; } ret(); } @@ -175,11 +168,9 @@ static int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { bool ok = MayIUse(avx); - if (type == operand_type::relu) { + if (type == operand_type::relu || type == operand_type::exp) { + // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256 return ok; - } else if (type == operand_type::exp) { - // exp is slower than mkl when d >= 256 - return ok; //&& d % 4 == 0 && d < 256; } else { // TODO(TJ): support more return ok && d % 8 == 0; @@ -412,24 +403,15 @@ void VActJitCode::generate() { return; } int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - vmovups(xmm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; + int block = XMM_FLOAT_BLOCK; + while (rest > 0) { + if (rest >= 4) { + vmovups(xmm_src, ptr[param1 + offset]); + } else if (rest >= 2) { + vmovq(xmm_src, ptr[param1 + offset]); + } else { + vmovss(xmm_src, ptr[param1 + offset]); } - vmovups(ptr[param2 + offset], xmm_dst); - offset += sizeof(float) * 4; - rest -= 4; - } - if (rest >= 2) { - vmovq(xmm_src, ptr[param1 + offset]); switch (type_) { case operand_type::relu: relu_xmm(xmm_dst, xmm_src, xmm_zero); @@ -440,25 +422,16 @@ void VActJitCode::generate() { default: break; } - vmovq(ptr[param2 + offset], xmm_dst); - offset += sizeof(float) * 2; - rest -= 2; - } - if (rest > 0) { - // vmovups(); - vmovss(xmm_src, ptr[param1 + offset]); - - switch (type_) { - case operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; + if (rest >= 4) { + vmovups(ptr[param2 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param2 + offset], xmm_dst); + } else { + vmovss(ptr[param2 + offset], xmm_dst); } - vmovss(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * block; + rest -= block; + block /= 2; } ret(); } From d3eae8f61b26c4fa053a74ce35aeb241db2c3b3b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 14:58:43 +0000 Subject: [PATCH 05/16] refine relu and fix addrelu test --- paddle/fluid/operators/math/jit_code.cc | 12 ++---------- paddle/fluid/operators/math/jit_code.h | 8 ++++---- paddle/fluid/operators/math/jit_kernel_test.cc | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a5eef019c8..2a10cd7821 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -177,14 +177,6 @@ bool VActJitCode::init(int d, operand_type type) { } } -void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) { - vmaxps(ymm_dst, ymm_zero, ymm_src); -} - -void VActJitCode::relu_xmm(xmm_t& xmm_dst, xmm_t& xmm_src, xmm_t& xmm_zero) { - vmaxps(xmm_dst, xmm_zero, xmm_src); -} - void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore @@ -378,7 +370,7 @@ void VActJitCode::generate() { vmovups(ymm_src, ptr[param1 + offset]); switch (type_) { case operand_type::relu: - relu_ymm(ymm_dst, ymm_src, ymm_zero); + relu_jmm(ymm_dst, ymm_src, ymm_zero); break; case operand_type::exp: exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); @@ -414,7 +406,7 @@ void VActJitCode::generate() { } switch (type_) { case 
operand_type::relu: - relu_xmm(xmm_dst, xmm_src, xmm_zero); + relu_jmm(xmm_dst, xmm_src, xmm_zero); break; case operand_type::exp: exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 1467978f26..6adeebca7c 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -128,10 +128,10 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm - void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, - const Xbyak::Ymm& zero); - void relu_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, - const Xbyak::Xmm& zero); + template + void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + vmaxps(dst, src, zero); + } // compute exp with ymm, xmm void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 178298bf56..932fa4c000 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -762,7 +762,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + vaddrelu_ref(d, x_data, y_data, zref_data); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); From ccb8963705205eef1f7447be7964dce008c7b997 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 16:54:48 +0000 Subject: [PATCH 06/16] refine exp jitcode with all size test=develop --- paddle/fluid/operators/math/jit_code.cc | 223 +++-------------------- paddle/fluid/operators/math/jit_code.h | 132 +++++++++++++- paddle/fluid/operators/math/jit_kernel.h | 1 + 3 files changed, 153 insertions(+), 203 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 2a10cd7821..fd18256b0c 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/jit_code.h" -#include "paddle/fluid/operators/math/jit_kernel.h" -#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { namespace operators { @@ -111,60 +110,26 @@ void VXXJitCode::generate() { ret(); } -#define ALIGN32 __attribute__((aligned(32))) -#define EXP_HIG 88.3762626647949f -#define EXP_LOW -88.3762626647949f -#define CEPHES_LOG2EF 1.44269504088896341 -#define CEPHES_EXP_C1 0.693359375 -#define CEPHES_EXP_C2 -2.12194440e-4 -#define CEPHES_EXP_P0 1.9875691500E-4 -#define CEPHES_EXP_P1 1.3981999507E-3 -#define CEPHES_EXP_P2 8.3334519073E-3 -#define CEPHES_EXP_P3 4.1665795894E-2 -#define CEPHES_EXP_P4 1.6666665459E-1 -#define CEPHES_EXP_P5 5.0000001201E-1 +const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f), + REPEAT_8TIMES(2.f), + REPEAT_8TIMES(0.5f), + REPEAT_8TIMES(EXP_HIG), + REPEAT_8TIMES(EXP_LOW), + REPEAT_8TIMES(CEPHES_LOG2EF), + REPEAT_8TIMES(CEPHES_EXP_C1), + REPEAT_8TIMES(CEPHES_EXP_C2), + REPEAT_8TIMES(CEPHES_EXP_P0), + REPEAT_8TIMES(CEPHES_EXP_P1), + REPEAT_8TIMES(CEPHES_EXP_P2), + REPEAT_8TIMES(CEPHES_EXP_P3), + REPEAT_8TIMES(CEPHES_EXP_P4), + REPEAT_8TIMES(CEPHES_EXP_P5), + REPEAT_8TIMES(EXP_MAX_INPUT), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), + REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; -#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val - -#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) - -static const float exp_float_consts[] ALIGN32 = { - REPEAT_8TIMES(1.f), - REPEAT_8TIMES(2.f), - REPEAT_8TIMES(0.5f), - REPEAT_8TIMES(EXP_HIG), - REPEAT_8TIMES(EXP_LOW), - REPEAT_8TIMES(CEPHES_LOG2EF), - REPEAT_8TIMES(CEPHES_EXP_C1), - REPEAT_8TIMES(CEPHES_EXP_C2), - REPEAT_8TIMES(CEPHES_EXP_P0), - REPEAT_8TIMES(CEPHES_EXP_P1), - REPEAT_8TIMES(CEPHES_EXP_P2), - REPEAT_8TIMES(CEPHES_EXP_P3), - REPEAT_8TIMES(CEPHES_EXP_P4), - REPEAT_8TIMES(CEPHES_EXP_P5), - REPEAT_8TIMES(EXP_MAX_INPUT), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; - -static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; -static int g_tmp_mem[16] ALIGN32 = {0}; +const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; +int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { bool ok = MayIUse(avx); @@ -177,146 +142,6 @@ bool VActJitCode::init(int d, operand_type type) { } } -void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - assert(ymm_src.getIdx() != ymm_dst.getIdx()); // 
TODO(TJ): use enfore - // check all idx can not equal - ymm_t ymm_fx = ymm_t(fx_idx); - ymm_t ymm_fy = ymm_t(fy_idx); - ymm_t ymm_mask = ymm_t(mask_idx); - ymm_t ymm_tmp = ymm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(ymm_fx, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(ymm_fx, ymm_fx, ymm_tmp); - vroundps(ymm_fy, ymm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(ymm_mask, ymm_fy, ymm_fx); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vandps(ymm_mask, ymm_mask, ymm_tmp); - vsubps(ymm_fx, ymm_fy, ymm_mask); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); - vmulps(ymm_fy, ymm_fx, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - ymm_t ymm_z = ymm_t(ymm_mask.getIdx()); - vmulps(ymm_z, ymm_fx, ymm_tmp); - vsubps(ymm_src, ymm_src, ymm_fy); - vsubps(ymm_src, ymm_src, ymm_z); - vmulps(ymm_z, ymm_src, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(ymm_dst, ymm_src, ymm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_src); - } - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_z); - vaddps(ymm_dst, ymm_dst, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - // build 2^n - ymm_t ymm_int = ymm_fx; - vcvttps2dq(ymm_int, ymm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(ymm_tmp, ptr[reg_ptr_global]); - if (MayIUse(avx2)) { - vpaddd(ymm_int, ymm_int, ymm_tmp); - vpslld(ymm_int, ymm_int, 23); - } else if (MayIUse(avx)) { - xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); - xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx()); - reg64_t reg_ptr_tmp = reg_ptr_global; - mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); - vmovdqa(ptr[reg_ptr_tmp], ymm_int); - vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp], xtmp1); - // next 128bits - vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]); - vmovdqa(xtmp2, - ptr[reg_ptr_tmp + - (YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1); - // load out - vmovdqa(ymm_int, ptr[reg_ptr_tmp]); - } - vmulps(ymm_dst, ymm_dst, ymm_int); - pop(reg_ptr_global); -} - -void VActJitCode::exp_xmm(xmm_t& ymm_dst, xmm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enfore - // check all idx can not equal - xmm_t ymm_fx = xmm_t(fx_idx); - xmm_t ymm_fy = xmm_t(fy_idx); - xmm_t ymm_mask = xmm_t(mask_idx); - xmm_t ymm_tmp = xmm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global 
+ OFFSET_EXP_LOW]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(ymm_fx, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(ymm_fx, ymm_fx, ymm_tmp); - vroundps(ymm_fy, ymm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(ymm_mask, ymm_fy, ymm_fx); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vandps(ymm_mask, ymm_mask, ymm_tmp); - vsubps(ymm_fx, ymm_fy, ymm_mask); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); - vmulps(ymm_fy, ymm_fx, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - xmm_t ymm_z = xmm_t(ymm_mask.getIdx()); - vmulps(ymm_z, ymm_fx, ymm_tmp); - vsubps(ymm_src, ymm_src, ymm_fy); - vsubps(ymm_src, ymm_src, ymm_z); - vmulps(ymm_z, ymm_src, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(ymm_dst, ymm_src, ymm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_src); - } - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmulps(ymm_dst, ymm_dst, ymm_z); - vaddps(ymm_dst, ymm_dst, ymm_src); - vmovaps(ymm_tmp, ptr[reg_ptr_global]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - // build 2^n - xmm_t ymm_int = ymm_fx; - vcvttps2dq(ymm_int, ymm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(ymm_tmp, ptr[reg_ptr_global]); - vpaddd(ymm_int, ymm_int, ymm_tmp); - vpslld(ymm_int, ymm_int, 23); - vmulps(ymm_dst, ymm_dst, ymm_int); - pop(reg_ptr_global); -} - void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, int fy_idx, int mask_idx, int tmp_idx) { // y = 1 / (1 + e^-x) @@ -330,7 +155,7 @@ void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, vmaxps(ymm_src, ymm_src, ymm_tmp); vxorps(ymm_tmp, ymm_tmp, ymm_tmp); vsubps(ymm_src, ymm_tmp, ymm_src); - exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(ymm_dst, ymm_dst, ymm_tmp); vdivps(ymm_dst, ymm_tmp, ymm_dst); @@ -349,7 +174,7 @@ void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, vxorps(ymm_zero, ymm_zero, ymm_zero); vsubps(ymm_tmp, ymm_zero, ymm_tmp); vmulps(ymm_src, ymm_src, ymm_tmp); - exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); + exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(ymm_dst, ymm_dst, ymm_tmp); vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -373,7 +198,7 @@ void VActJitCode::generate() { relu_jmm(ymm_dst, ymm_src, ymm_zero); break; case operand_type::exp: - exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::sigmoid: sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); @@ -409,7 +234,7 @@ void VActJitCode::generate() { relu_jmm(xmm_dst, xmm_src, xmm_zero); break; case operand_type::exp: - exp_xmm(xmm_dst, xmm_src, 2, 3, 4, 5); + exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); break; default: break; diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 6adeebca7c..534398f4a4 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/platform/cpu_info.h" + namespace paddle { namespace operators { namespace math { @@ -40,6 +42,51 @@ typedef enum { identity } operand_type; +extern const float exp_float_consts[]; +extern const int exp_int_0x7f[]; +extern int g_tmp_mem[]; + +// TODO(TJ): move these to some proper place +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +#define ALIGN32 __attribute__((aligned(32))) +#define EXP_HIG 88.3762626647949f +#define EXP_LOW -88.3762626647949f +#define CEPHES_LOG2EF 1.44269504088896341 +#define CEPHES_EXP_C1 0.693359375 +#define CEPHES_EXP_C2 -2.12194440e-4 +#define CEPHES_EXP_P0 1.9875691500E-4 +#define CEPHES_EXP_P1 1.3981999507E-3 +#define CEPHES_EXP_P2 8.3334519073E-3 +#define CEPHES_EXP_P3 4.1665795894E-2 +#define CEPHES_EXP_P4 1.6666665459E-1 +#define CEPHES_EXP_P5 5.0000001201E-1 + +#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val + +#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) +#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) + // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) class VXXJitCode : public JitCode { public: @@ -134,10 +181,87 @@ class VActJitCode : public JitCode { } // compute exp with ymm, xmm - void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); - void exp_xmm(const Xbyak::Xmm& dst, const Xbyak::Xmm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + template + void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT + int mask_idx = 4, int tmp_idx = 5) { + using namespace platform::jit; // NOLINT + assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enfore + // check all idx can not equal + JMM jmm_fx = JMM(fx_idx); + JMM jmm_fy = JMM(fy_idx); + JMM jmm_mask = JMM(mask_idx); + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); + vminps(src, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); + vmaxps(src, src, jmm_tmp); + // express exp(x) as exp(g + n*log(2)) + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); + vmulps(jmm_fx, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); + vaddps(jmm_fx, jmm_fx, jmm_tmp); + vroundps(jmm_fy, jmm_fx, 0x01); + // if 
greater, substract 1 + vcmpgtps(jmm_mask, jmm_fy, jmm_fx); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vandps(jmm_mask, jmm_mask, jmm_tmp); + vsubps(jmm_fx, jmm_fy, jmm_mask); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]); + vmulps(jmm_fy, jmm_fx, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); + JMM ymm_z = JMM(jmm_mask.getIdx()); + vmulps(ymm_z, jmm_fx, jmm_tmp); + vsubps(src, src, jmm_fy); + vsubps(src, src, ymm_z); + vmulps(ymm_z, src, src); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); + vmulps(dst, src, jmm_tmp); + for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; + i += (YMM_FLOAT_BLOCK * sizeof(float))) { + vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, src); + } + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); + vaddps(dst, dst, jmm_tmp); + vmulps(dst, dst, ymm_z); + vaddps(dst, dst, src); + vmovaps(jmm_tmp, ptr[reg_ptr_global]); + vaddps(dst, dst, jmm_tmp); + // build 2^n + JMM ymm_int = jmm_fx; + vcvttps2dq(ymm_int, jmm_fx); + mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); + vmovdqa(jmm_tmp, ptr[reg_ptr_global]); + if (MayIUse(avx2) || std::is_same::value) { + vpaddd(ymm_int, ymm_int, jmm_tmp); + vpslld(ymm_int, ymm_int, 23); + } else if (MayIUse(avx)) { + xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); + xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); + reg64_t reg_ptr_tmp = reg_ptr_global; + mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); + vmovdqa(ptr[reg_ptr_tmp], ymm_int); + vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp], xtmp1); + // next 128bits + vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); + vmovdqa(xtmp2, ptr[reg_ptr_tmp + + (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); + vpaddd(xtmp1, xtmp1, xtmp2); + vpslld(xtmp1, xtmp1, 23); + vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); + // load out + vmovdqa(ymm_int, ptr[reg_ptr_tmp]); + } + vmulps(dst, dst, ymm_int); + pop(reg_ptr_global); + } // compute sigmoid with ymm void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 4d8d3cd79a..117baaee2b 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -26,6 +26,7 @@ namespace operators { namespace math { namespace jitkernel { +// TODO(TJ): move these to some proper place #define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MAX 13.0 #define EXP_MAX_INPUT 40.0 From 4dbdfa60ef6d13568880fb2de5ee31a469080ab7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 16 Nov 2018 17:29:36 +0000 Subject: [PATCH 07/16] sigmoid and tanh support all size test=develop --- paddle/fluid/operators/math/jit_code.cc | 67 ++++--------------------- paddle/fluid/operators/math/jit_code.h | 50 +++++++++++++++--- 2 files changed, 54 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index fd18256b0c..a080079a2d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -132,56 +132,8 @@ const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)}; int g_tmp_mem[16] ALIGN32 = {0}; bool VActJitCode::init(int d, operand_type type) { - bool ok = MayIUse(avx); - if (type == operand_type::relu || type == operand_type::exp) { - // TODO(TJ): implement avx512, avx_exp is 
slower than mkl when d >= 256 - return ok; - } else { - // TODO(TJ): support more - return ok && d % 8 == 0; - } -} - -void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - // y = 1 / (1 + e^-x) - ymm_t ymm_tmp = ymm_t(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(ymm_src, ymm_src, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(ymm_src, ymm_src, ymm_tmp); - vxorps(ymm_tmp, ymm_tmp, ymm_tmp); - vsubps(ymm_src, ymm_tmp, ymm_src); - exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vdivps(ymm_dst, ymm_tmp, ymm_dst); - pop(reg_ptr_global); -} - -void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx, - int fy_idx, int mask_idx, int tmp_idx) { - // y = 2 / (1 + e^(-2x)) - 1 - ymm_t ymm_tmp = ymm_t(tmp_idx); - ymm_t ymm_zero = ymm_t(mask_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vxorps(ymm_zero, ymm_zero, ymm_zero); - vsubps(ymm_tmp, ymm_zero, ymm_tmp); - vmulps(ymm_src, ymm_src, ymm_tmp); - exp_jmm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(ymm_dst, ymm_dst, ymm_tmp); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vdivps(ymm_dst, ymm_tmp, ymm_dst); - vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vsubps(ymm_dst, ymm_dst, ymm_tmp); - pop(reg_ptr_global); + // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256 + return MayIUse(avx); } void VActJitCode::generate() { @@ -201,10 +153,10 @@ void VActJitCode::generate() { exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::sigmoid: - sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::tanh: - tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); + tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); break; case operand_type::identity: break; @@ -214,11 +166,6 @@ void VActJitCode::generate() { vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } - if (type_ != operand_type::relu && type_ != operand_type::exp) { - // TODO(TJ): remove me - ret(); - return; - } int rest = num_ % YMM_FLOAT_BLOCK; int block = XMM_FLOAT_BLOCK; while (rest > 0) { @@ -236,6 +183,12 @@ void VActJitCode::generate() { case operand_type::exp: exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); break; + case operand_type::sigmoid: + sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); + break; default: break; } diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 534398f4a4..65f83ff484 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -263,13 +263,51 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } - // compute sigmoid with ymm - void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + // compute sigmoid with ymm, xmm + template + void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT + int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) 
{ + // y = 1 / (1 + e^-x) + JMM jmm_tmp = JMM(tmp_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); + vminps(src, src, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); + vmaxps(src, src, jmm_tmp); + vxorps(jmm_tmp, jmm_tmp, jmm_tmp); + vsubps(src, jmm_tmp, src); + exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vdivps(dst, jmm_tmp, dst); + pop(reg_ptr_global); + } - // compute tanh with ymm - void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); + // compute tanh with ymm, xmm + template + void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT + int mask_idx = 4, int tmp_idx = 5) { + // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_zero = JMM(mask_idx); + reg64_t reg_ptr_global = rax; + push(reg_ptr_global); + mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vxorps(jmm_zero, jmm_zero, jmm_zero); + vsubps(jmm_tmp, jmm_zero, jmm_tmp); + vmulps(src, src, jmm_tmp); + exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vaddps(dst, dst, jmm_tmp); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); + vdivps(dst, jmm_tmp, dst); + vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); + vsubps(dst, dst, jmm_tmp); + pop(reg_ptr_global); + } protected: int num_; From be80bb4f28f4a50cfbc96edd790227f59273d20e Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 16 Nov 2018 20:01:56 +0100 Subject: [PATCH 08/16] - Fix to GPU test=develop --- paddle/fluid/operators/softmax_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 91829d5761..8eb5c7691e 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -36,7 +36,9 @@ class SoftmaxKernel : public framework::OpKernel { Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); #ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor()( + math::SoftmaxFunctor< + DeviceContext, T, + std::is_same::value>()( context.template device_context(), &X_2d, &Out_2d); #else math::SoftmaxFunctor()( From a19b3225a1da8c31fc996bace3ac09e6f5f177ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 17 Nov 2018 14:56:43 +0000 Subject: [PATCH 09/16] fix jitcode small size test=develop --- paddle/fluid/operators/math/jit_code.cc | 12 ++++++++---- paddle/fluid/operators/math/jit_kernel_test.cc | 10 +++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index a080079a2d..e484e9a3c7 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -59,9 +59,10 @@ void VXXJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - int block = XMM_FLOAT_BLOCK; while (rest > 0) { + int block = XMM_FLOAT_BLOCK; if (rest >= 4) { + block = 4; if (scalar_index_ != 1) { vmovups(xmm_src1, ptr[param1 + offset]); } @@ -69,6 +70,7 @@ void VXXJitCode::generate() { vmovups(xmm_src2, ptr[param2 + offset]); } } else if (rest >= 2) { + block = 2; if (scalar_index_ != 1) { vmovq(xmm_src1, 
ptr[param1 + offset]); } @@ -76,6 +78,7 @@ void VXXJitCode::generate() { vmovq(xmm_src2, ptr[param2 + offset]); } } else { + block = 1; if (scalar_index_ != 1) { vmovss(xmm_src1, ptr[param1 + offset]); } @@ -105,7 +108,6 @@ void VXXJitCode::generate() { } offset += sizeof(float) * block; rest -= block; - block /= 2; } ret(); } @@ -167,13 +169,16 @@ void VActJitCode::generate() { offset += sizeof(float) * YMM_FLOAT_BLOCK; } int rest = num_ % YMM_FLOAT_BLOCK; - int block = XMM_FLOAT_BLOCK; while (rest > 0) { + int block = XMM_FLOAT_BLOCK; if (rest >= 4) { + block = 4; vmovups(xmm_src, ptr[param1 + offset]); } else if (rest >= 2) { + block = 2; vmovq(xmm_src, ptr[param1 + offset]); } else { + block = 1; vmovss(xmm_src, ptr[param1 + offset]); } switch (type_) { @@ -201,7 +206,6 @@ void VActJitCode::generate() { } offset += sizeof(float) * block; rest -= block; - block /= 2; } ret(); } diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 932fa4c000..b6c62a2634 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -69,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { TEST(JitKernel, vrelu) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 256, 512}) { + for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -10.f, 1.f); @@ -159,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 12, 15, 16, 20, 30, 128, 256}) { + for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); @@ -234,7 +234,7 @@ void vsigmoid_better( TEST(JitKernel, vsigmoid) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); @@ -298,7 +298,7 @@ void vtanh_better( TEST(JitKernel, vtanh) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); RandomVec(d, x.data(), -2.f, 2.f); @@ -389,7 +389,7 @@ void lstm_ctht_better( TEST(JitKernel, lstm) { namespace jit = paddle::operators::math::jitkernel; - for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { + for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) { int d4 = d * 4; int d3 = d * 3; std::vector x(d4), xref(d4); From 9b0eae3023e3faf6a40a69f5ff79bcc2303c674b Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Sun, 18 Nov 2018 13:27:17 +0100 Subject: [PATCH 10/16] - Removing partial specialization of sotmax for inference for GPU test=develop --- paddle/fluid/operators/math/softmax.h | 3 ++- paddle/fluid/operators/math/softmax_impl.h | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index bf698dc2f7..089458e957 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -19,7 +19,8 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: void 
operator()(const DeviceContext& context, const framework::Tensor* X, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index e09a243347..0f3e5b2008 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -33,8 +33,8 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( +template +void SoftmaxFunctor::operator()( const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); @@ -66,8 +66,12 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)); } +template +using enable_if_CPU = typename std::enable_if< + std::is_same::value>::type; + template -class SoftmaxFunctor { +class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); From be50670348a23b35172e2420baeb058321ab3e13 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:24:00 +0800 Subject: [PATCH 11/16] Remove the remnant code (test=develop) --- paddle/fluid/operators/stack_op.h | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index f1692ae956..56a12852a9 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -72,25 +72,6 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -struct StackFunctor { - HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) - : x_(x), y_(y), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - y_[idx] = x_[which_x][x_index]; - } - - private: - VecXType x_; - T *y_; - int n_; - int post_; -}; - template struct StackGradFunctor { HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) @@ -110,14 +91,6 @@ struct StackGradFunctor { int post_; }; -template -static inline void StackFunctorForRange(const DeviceContext &ctx, - const VecXType &x, T *y, int total_num, - int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackFunctor(x, y, n, post)); -} - template static inline void StackGradFunctorForRange(const DeviceContext &ctx, const VecDxType &dx, const T *dy, From d91740acb1e49e4baaad02aeda379f27f6ec0f69 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:25:48 +0800 Subject: [PATCH 12/16] Revert "Remove the remnant code (test=develop)" This reverts commit be50670348a23b35172e2420baeb058321ab3e13. 
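The revert restores StackFunctor and StackFunctorForRange: the std::memcpy loop from PATCH 01 is only valid for host pointers, so the CUDA (__NVCC__) path still needs the functor-based copy, which PATCH 13 below reinstates behind the NVCC macro. Both paths realize the same index mapping; a minimal CPU sketch of that mapping, assuming n inputs of pre * post elements each (the function name and layout here are illustrative, not Paddle's actual code):

#include <vector>

// Reference semantics of the stack copy: slice i of input j lands at
// output offset (i * n + j) * post. Illustrative sketch only.
template <typename T>
void StackCopyRef(const std::vector<const T*>& x, T* y, int pre, int n,
                  int post) {
  for (int i = 0; i < pre; ++i) {        // slices before the stack axis
    for (int j = 0; j < n; ++j) {        // which input tensor
      for (int k = 0; k < post; ++k) {   // contiguous tail elements
        y[(i * n + j) * post + k] = x[j][i * post + k];
      }
    }
  }
}

This matches the memcpy loop above (each inner memcpy copies one post-sized row) and the functor's idx -> (which_x, x_index) decomposition.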
--- paddle/fluid/operators/stack_op.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index 56a12852a9..f1692ae956 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -72,6 +72,25 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { } }; +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + template struct StackGradFunctor { HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) @@ -91,6 +110,14 @@ struct StackGradFunctor { int post_; }; +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, T *y, int total_num, + int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + template static inline void StackGradFunctorForRange(const DeviceContext &ctx, const VecDxType &dx, const T *dy, From a906a361be831b9b425a9f197036fef506020857 Mon Sep 17 00:00:00 2001 From: Yihua Xu Date: Tue, 20 Nov 2018 08:30:27 +0800 Subject: [PATCH 13/16] Add the macro for NVCC (test=develop) --- paddle/fluid/operators/stack_op.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index f1692ae956..3d132e4397 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -149,11 +149,20 @@ class StackKernel : public framework::OpKernel { for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; #ifdef __NVCC__ + int total_num = pre * n * post; + auto &dev_ctx = ctx.template device_context(); + thrust::device_vector device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); + + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); #else auto x_data_arr = x_datas.data(); -#endif + size_t x_offset = 0; size_t y_offset = 0; for (int i = 0; i < pre; i++) { @@ -164,10 +173,6 @@ class StackKernel : public framework::OpKernel { } x_offset += post; } -#ifdef __NVCC__ - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); #endif } }; From a94a7355f0014337006ea8bb04bb2c30c955f7ea Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 20 Nov 2018 10:01:51 +0800 Subject: [PATCH 14/16] Refine the GraphNum check (#14144) * refine GraphCheck test=develop * fix ci fail test=develop --- paddle/fluid/framework/ir/graph_helper.cc | 28 +++++++++++++++------ paddle/fluid/framework/parallel_executor.cc | 13 ++++++++-- python/paddle/fluid/__init__.py | 3 ++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 98112c1ed3..963179192f 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -15,8 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_helper.h" #include #include +#include +#include +#include #include +DEFINE_string(print_sub_graph_dir, "", + "FLAGS_print_sub_graph_dir is used " + "to print the nodes of sub_graphs."); + namespace paddle { namespace framework { namespace ir { @@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) { graph_nodes.emplace_back(g_nodes); } - if (VLOG_IS_ON(100)) { - VLOG(100) << "graph_num: " << graph_nodes.size(); - for (auto &g_n : graph_nodes) { - VLOG(100) << "graph_nodes: " << g_n.size(); - if (g_n.size() < 10) { - std::stringstream out; + if (FLAGS_print_sub_graph_dir.size()) { + if (graph_nodes.size() > 1) { + std::stringstream out; + for (auto &g_n : graph_nodes) { + out << "graph_nodes: " << g_n.size() << "\n"; + } + out << "\n\n"; + for (auto &g_n : graph_nodes) { + out << "graph_nodes: " << g_n.size(); for (auto &node : g_n) { out << "\nNode: " << node->Name() << " in ["; for (auto &n : node->inputs) { @@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) { } out << "]"; } - VLOG(100) << out.str(); + out << "\n\n\n"; } + std::unique_ptr fout( + new std::ofstream(FLAGS_print_sub_graph_dir)); + PADDLE_ENFORCE(fout->good()); + *fout << out.str(); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 39b47415ff..2c6e337568 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor( } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, - "The number of graph should be only one"); + size_t graph_num = ir::GraphNum(*graph); + if (graph_num > 1) { + LOG(WARNING) + << "The number of graph should be only one, " + "but the current graph has " + << ir::GraphNum(*graph) + << " sub_graphs. If you want to see the nodes of the " + "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " + "to specify the output dir. NOTES: if you not do training, " + "please don't pass loss_var_name."; + } } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b991974928..f2f49f813a 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,7 +116,8 @@ def __bootstrap__(): 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', - 'allocator_strategy', 'reader_queue_speed_test_mode' + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') From bb2b35c85ebe726fa6baa94f466f65a71b21394e Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 19 Nov 2018 21:11:12 +0800 Subject: [PATCH 15/16] Add python example for resize_nearest. test=develop --- python/paddle/fluid/layers/nn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index af96f5de4f..91599b156d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5788,7 +5788,7 @@ def image_resize(input, Examples: .. 
code-block:: python - out = fluid.layers.image_resize(input, out_shape=[12, 12]) + out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST") """ resample_methods = { 'BILINEAR': 'bilinear', @@ -5891,6 +5891,11 @@ def resize_bilinear(input, Returns: ${out_comment}. + + Examples: + .. code-block:: python + + out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) @@ -5937,6 +5942,11 @@ def resize_nearest(input, Returns: ${out_comment}. + + Examples: + .. code-block:: python + + out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) """ return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) From 8bc1c5d2abb260ab4c20e009ceacb8508b8ae59d Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 20 Nov 2018 11:10:38 +0800 Subject: [PATCH 16/16] Implement the Tensorrt plugin for elementwise op (#14487) * Initialize the elementwise plugin. * Implement the basic CUDA kernel of elementwise plugin. test=develop --- .../ir_passes/tensorrt_subgraph_pass.cc | 2 +- .../passes/ir_analysis_compose_pass.cc | 3 +- .../inference/tensorrt/convert/CMakeLists.txt | 13 +- .../tensorrt/convert/elementwise_op.cc | 70 ++++++--- .../inference/tensorrt/convert/op_converter.h | 2 +- .../inference/tensorrt/convert/prelu_op.cc | 2 +- .../inference/tensorrt/convert/split_op.cc | 2 +- .../tensorrt/convert/test_elementwise_op.cc | 78 +++++++--- .../inference/tensorrt/convert/test_mul_op.cc | 18 +-- .../inference/tensorrt/convert/ut_helper.h | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 5 +- paddle/fluid/inference/tensorrt/engine.h | 4 +- .../inference/tensorrt/plugin/CMakeLists.txt | 4 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 138 ++++++++++++++++++ .../tensorrt/plugin/elementwise_op_plugin.h | 87 +++++++++++ .../tensorrt/plugin/prelu_op_plugin.cu | 2 + .../tensorrt/plugin/prelu_op_plugin.h | 2 + .../inference/tensorrt/plugin/serialize.h | 32 +++- .../tensorrt/plugin/split_op_plugin.cu | 25 ++-- .../tensorrt/plugin/split_op_plugin.h | 73 +++++---- .../inference/tensorrt/plugin/trt_plugin.cc | 28 ++-- .../inference/tensorrt/plugin/trt_plugin.h | 72 ++++++--- .../fluid/inference/tests/api/tester_helper.h | 2 +- 23 files changed, 500 insertions(+), 166 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 21fd8d2df4..c6b7c05f78 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // it is either an OP's input or an OP's output. 
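  // Note (added for clarity, not in the original patch): block_desc's ops
  // and the subgraph node list are built in the same order; the loop below
  // assumes the two are index-aligned and asserts it via the name check.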
auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); index++) { + for (size_t index = 0; index < block_desc.OpSize(); ++index) { framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); auto correspond_node = subgraph_nodes[index]; PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 38e9b1c5e7..267737e95c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { std::unordered_set teller_set( {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"}); + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose"}); if (!node->IsOp()) return false; if (teller_set.count(node->Op()->Type())) { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 85ad5ffe78..8dd6e8453f 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,9 +1,9 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc -pad_op.cc split_op.cc prelu_op.cc - DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc + batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc + pad_op.cc split_op.cc prelu_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) @@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL) nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine elementwise_add_op SERIAL) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin + elementwise_add_op elementwise_mul_op SERIAL) nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL) nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc @@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL) nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin - split_op concat_op SERIAL) + split_op concat_op SERIAL) nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin prelu_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc 
index 1af091fabd..6975086193 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -13,11 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +static bool CheckDims(const nvinfer1::Dims& dims_x, + const nvinfer1::Dims& dims_y) { + if (dims_x.nbDims != dims_y.nbDims) { + return false; + } + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) { + return false; + } + } + return true; +} + class ElementwiseWeightOpConverter : public OpConverter { public: ElementwiseWeightOpConverter() {} @@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter { ElementwiseTensorOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { + auto op_pair = ops.find(op_type_); + PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!"); + // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
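    // Summary (added for clarity, not in the original patch): when X and Y
    // have identical dimensions the converter emits TensorRT's native
    // IElementWiseLayer; otherwise it falls back to ElementWisePlugin,
    // which broadcasts Y across X starting at `axis`.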
framework::OpDesc op_desc(op, nullptr); - VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter { nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_y = Y->getDimensions(); - // The two input tensor should have the same dims - PADDLE_ENFORCE(dims_x.nbDims >= 3); - if (dims_x.nbDims == dims_y.nbDims) { - for (int i = 0; i < dims_x.nbDims; i++) { - if (dims_x.d[i] != dims_y.d[i]) - PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); - } - } else { - PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); - } + int axis = boost::get(op_desc.GetAttr("axis")); + auto output_name = op_desc.Output("Out")[0]; + if (CheckDims(dims_x, dims_y)) { + // The two input tensor should have the same dims + VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - PADDLE_THROW("Wrong elementwise op type!"); - } - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( - engine_, ElementWise, *const_cast(X), - *const_cast(Y), op_pair->second); + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); - auto output_name = op_desc.Output("Out")[0]; - layer->setName(("elementwise (Output: " + output_name + ")").c_str()); - layer->getOutput(0)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(0)); + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + } else { + VLOG(3) << "Convert a fluid elementwise op to TensorRT " + "ElementWisePluginLayer"; + + plugin::ElementWisePlugin* plugin = + new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); + plugin->AddInput(X); + plugin->AddInput(Y); + nvinfer1::IPluginLayer* layer = engine_->AddPlugin( + const_cast(plugin->GetInputs().data()), 2, + reinterpret_cast(plugin)); + + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + } if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d309d94c56..d61d635ed7 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -61,7 +61,7 @@ class OpConverter { // TODO(xingzhaolong): all mul, sub, div // static std::unordered_set add_weight_op_set {"add", "mul", // "sub", "div"}; - static std::unordered_set add_weight_op_set{"add"}; + static std::unordered_set add_weight_op_set{"add", "mul"}; PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); int op_type_len = op_desc.Type().size(); std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 337885e6ba..dbdff85dde 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter { TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, static_cast(alpha_data), alpha_tensor_device->numel()); - PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode); + plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 159854ab59..6620c76318 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter { PADDLE_ENFORCE(output_lengths.size() == output_num); // - SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); + plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 7537d02a35..cc967464a5 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -20,13 +20,12 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(elementwise_op, add_weight_test) { +TEST(elementwise_op, add_weight) { std::unordered_set parameters({"elementwise_add-Y"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1 << 15); validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); // Prepare Op description @@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) { validator.Execute(8); } -TEST(elementwise_op, add_tensor_test) { - std::unordered_set parameters; - framework::Scope scope; - TRTConvertValidation validator(8, parameters, scope, 1 << 15); - validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); - validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); - validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); - - // 
Prepare Op description - framework::OpDesc desc; - desc.SetType("elementwise_add"); - desc.SetInput("X", {"elementwise_add-X"}); - desc.SetInput("Y", {"elementwise_add-Y"}); - desc.SetOutput("Out", {"elementwise_add-Out"}); - - // the defalut axis of elementwise op is -1 - - validator.SetOp(*desc.Proto()); +TEST(elementwise_op, native) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 3, 3)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } +} - validator.Execute(8); +TEST(elementwise_op, plugin) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } } } // namespace tensorrt } // namespace inference } // namespace paddle + USE_OP(elementwise_add); +USE_OP(elementwise_mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 3d34cd7d5d..282f53559a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 0a6f171fc4..f313beb73b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 208bd12b83..f739752cbc 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() { } nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( - nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRT *plugin) { owned_plugin_.emplace_back(plugin); - return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); + return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 99420f19ba..f5b2c28ba9 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase { int GetRuntimeBatch(); int GetDevice() { return device_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, - int nbInputs, PluginTensorRT*); + int num_inputs, plugin::PluginTensorRT*); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase { // The specific GPU id that the TensorRTEngine bounded to. int device_; - std::vector> owned_plugin_; + std::vector> owned_plugin_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b6811f9183..4090269499 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1,3 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) +nv_library(tensorrt_plugin + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu + DEPS enforce device_context) diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu new file mode 100644 index 0000000000..9cd9026b73 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +namespace details { + +template +struct Add { + __device__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct Mul { + __device__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out, + int batch_size, int num_rows, int num_cols) { + for (int batch_id = 0; batch_id < batch_size; ++batch_id) { + int row = blockIdx.x; + for (; row < num_rows; row += gridDim.x) { + T value_y = y[batch_id * num_rows + row]; + int col = threadIdx.x; + int offset = (batch_id * num_rows + row) * num_cols; + for (; col < num_cols; col += blockDim.x) { + T value_x = x[offset + col]; + out[offset + col] = op(value_x, value_y); + } + } + } +} + +template +static void ElementWise(Operator op, const T* x, const T* y, T* out, + int batch_size, int prev, int midd, int post, + cudaStream_t stream) { + const int kThreadsPerBlock = 1024; + const int kMaximumBlocks = 65535; + if (prev == 1) { + int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock + : (((post + 31) >> 5) << 5); + int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks; + ColumnWiseKernel<<>>( + op, x, y, out, batch_size, midd, post); + } else if (post == 1) { + PADDLE_THROW("Not implemented."); + } else { + PADDLE_THROW("Not implemented."); + } +} + +} // namespace details + +nvinfer1::Dims ElementWisePlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(index, 0); + PADDLE_ENFORCE_EQ(num_inputs, 2); + PADDLE_ENFORCE_NOT_NULL(input_dims); + return input_dims[0]; +} + +int ElementWisePlugin::initialize() { + PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); + + axis_ = (axis_ == -1) ? 
dims_x_.nbDims - dims_y_.nbDims : axis_; + int trimed_nb_dims = dims_y_.nbDims; + for (; trimed_nb_dims > 0; --trimed_nb_dims) { + if (dims_y_.d[trimed_nb_dims - 1] != 1) { + break; + } + } + dims_y_.nbDims = trimed_nb_dims; + + PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); + PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); + + prev_size_ = 1; + midd_size_ = 1; + post_size_ = 1; + for (int i = 0; i < axis_; ++i) { + prev_size_ *= dims_x_.d[i]; + } + + for (int i = 0; i < dims_y_.nbDims; ++i) { + PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], + "Broadcast dimension mismatch."); + midd_size_ *= dims_y_.d[i]; + } + + for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) { + post_size_ *= dims_x_.d[i]; + } + return 0; +} + +int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const float* x = reinterpret_cast(inputs[0]); + const float* y = reinterpret_cast(inputs[1]); + float* out = reinterpret_cast(outputs[0]); + + if (type_ == nvinfer1::ElementWiseOperation::kSUM) { + details::ElementWise(details::Add(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { + details::ElementWise(details::Mul(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else { + PADDLE_THROW("Not implemented."); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h new file mode 100644 index 0000000000..9c461f7a5c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class ElementWisePlugin : public PluginTensorRT { + public: + ElementWisePlugin(nvinfer1::ElementWiseOperation type, + nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, + int axis) + : type_(type), + dims_x_(dims_x), + dims_y_(dims_y), + axis_(axis), + prev_size_(1), + midd_size_(1), + post_size_(1) {} + + ElementWisePlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &dims_x_); + DeserializeValue(&serial_data, &serial_length, &dims_y_); + } + + ElementWisePlugin *clone() const override { + // return new ElementWisePlugin(dims_x_, dims_y_, axis_); + return nullptr; + } + + const char *getPluginType() const override { return "elementwise"; } + + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + + // execute the layer + int enqueue(int batch_size, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream); + + protected: + size_t getSerializationSize() override { + return SerializedSize(axis_) + SerializedSize(dims_x_) + + SerializedSize(dims_y_) + getBaseSerializationSize(); + } + + void serialize(void *buffer) override { + serializeBase(buffer); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, dims_x_); + SerializeValue(&buffer, dims_y_); + } + + nvinfer1::ElementWiseOperation type_; + nvinfer1::Dims dims_x_; + nvinfer1::Dims dims_y_; + int axis_; + int prev_size_; + int midd_size_; + int post_size_; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 0f1ca11295..e8f4254402 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -20,6 +20,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { static const int CUDA_NUM_THREADS = 1024; static const int CUDA_MAX_NUM_BLOCKS = 65535; @@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index aa0f865c89..0db56a310b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -21,6 +21,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class PReluPlugin : public PluginTensorRT { TensorRTEngine::Weight alpha_; @@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT { void *workspace, cudaStream_t stream) override; }; +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h index 50c0b17d78..ce859f16fc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/serialize.h +++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h 
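
The serialize.h hunk below supplies the helpers that ElementWisePlugin and SplitPlugin use in their serialize()/deserialize paths. A short round-trip sketch of the buffer protocol, assuming serialize.h is on the include path and that (after this patch) the helpers live in paddle::inference::tensorrt::plugin; the values are made up:

    #include <cstddef>
    #include <vector>
    #include "paddle/fluid/inference/tensorrt/plugin/serialize.h"

    using namespace paddle::inference::tensorrt::plugin;  // NOLINT

    int main() {
      int axis = 1;
      std::vector<int> lengths = {2, 3};

      // Values are packed back-to-back into one buffer.
      std::vector<char> buffer(SerializedSize(axis) + SerializedSize(lengths));
      void* wptr = buffer.data();
      SerializeValue(&wptr, axis);     // copies sizeof(int), advances wptr
      SerializeValue(&wptr, lengths);  // copies size(), then the payload

      // Deserialization walks the same buffer, shrinking the length counter.
      void const* rptr = buffer.data();
      size_t remaining = buffer.size();
      int axis2 = 0;
      std::vector<int> lengths2;
      DeserializeValue(&rptr, &remaining, &axis2);     // axis2 == 1
      DeserializeValue(&rptr, &remaining, &lengths2);  // lengths2 == {2, 3}
      return remaining == 0 ? 0 : 1;
    }
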
@@ -14,10 +14,15 @@ #pragma once -#include #include #include #include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { template inline void SerializeValue(void** buffer, T const& value); @@ -26,7 +31,7 @@ template inline void DeserializeValue(void const** buffer, size_t* buffer_size, T* value); -namespace { +namespace details { template struct Serializer {}; @@ -36,10 +41,12 @@ struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> { static size_t SerializedSize(T const& value) { return sizeof(T); } + static void Serialize(void** buffer, T const& value) { std::memcpy(*buffer, &value, sizeof(T)); reinterpret_cast(*buffer) += sizeof(T); } + static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { assert(*buffer_size >= sizeof(T)); std::memcpy(value, *buffer, sizeof(T)); @@ -51,10 +58,12 @@ struct Serializer::value || template <> struct Serializer { static size_t SerializedSize(const char* value) { return strlen(value) + 1; } + static void Serialize(void** buffer, const char* value) { - std::strcpy(static_cast(*buffer), value); + std::strcpy(static_cast(*buffer), value); // NOLINT reinterpret_cast(*buffer) += strlen(value) + 1; } + static void Deserialize(void const** buffer, size_t* buffer_size, const char** value) { *value = static_cast(*buffer); @@ -73,39 +82,46 @@ struct Serializer, static size_t SerializedSize(std::vector const& value) { return sizeof(value.size()) + value.size() * sizeof(T); } + static void Serialize(void** buffer, std::vector const& value) { SerializeValue(buffer, value.size()); size_t nbyte = value.size() * sizeof(T); std::memcpy(*buffer, value.data(), nbyte); reinterpret_cast(*buffer) += nbyte; } + static void Deserialize(void const** buffer, size_t* buffer_size, std::vector* value) { size_t size; DeserializeValue(buffer, buffer_size, &size); value->resize(size); size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); + PADDLE_ENFORCE_GE(*buffer_size, nbyte); std::memcpy(value->data(), *buffer, nbyte); reinterpret_cast(*buffer) += nbyte; *buffer_size -= nbyte; } }; -} // namespace +} // namespace details template inline size_t SerializedSize(T const& value) { - return Serializer::SerializedSize(value); + return details::Serializer::SerializedSize(value); } template inline void SerializeValue(void** buffer, T const& value) { - return Serializer::Serialize(buffer, value); + return details::Serializer::Serialize(buffer, value); } template inline void DeserializeValue(void const** buffer, size_t* buffer_size, T* value) { - return Serializer::Deserialize(buffer, buffer_size, value); + return details::Serializer::Deserialize(buffer, buffer_size, value); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index bd6a44dcc1..4adea2db1e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,26 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
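
In the split plugin hunks below, initialize() turns the per-output lengths into prefix-sum segment offsets, so output i owns the slice [offsets[i], offsets[i+1]) along axis_. A toy illustration of that bookkeeping with made-up lengths, standalone rather than plugin code:

    #include <cassert>
    #include <vector>

    int main() {
      // Hypothetical split of a size-10 axis into three outputs.
      std::vector<int> output_length = {2, 3, 5};
      std::vector<int> segment_offsets(1, 0);
      for (size_t i = 0; i < output_length.size(); ++i) {
        segment_offsets.push_back(segment_offsets.back() + output_length[i]);
      }
      // segment_offsets == {0, 2, 5, 10}; output 1 covers [2, 5).
      assert(segment_offsets.back() == 10);
      return 0;
    }
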
-#include -#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { -nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, - const nvinfer1::Dims* inputDims, - int nbInputs) { - assert(nbInputs == 1); - assert(index < this->getNbOutputs()); - nvinfer1::Dims const& input_dims = inputDims[0]; - nvinfer1::Dims output_dims = input_dims; +nvinfer1::Dims SplitPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(num_inputs, 1); + PADDLE_ENFORCE_LT(index, this->getNbOutputs()); + + nvinfer1::Dims output_dims = input_dims[0]; output_dims.d[axis_] = output_length_.at(index); return output_dims; } int SplitPlugin::initialize() { + PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); + std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { segment_offsets.push_back(segment_offsets.back() + output_length_[i]); @@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, return cudaGetLastError() != cudaSuccess; } -} // tensorrt -} // inference -} // paddle +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 7281e40c33..b5b6e69992 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,61 +14,58 @@ #pragma once +#include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class SplitPlugin : public PluginTensorRT { - int axis_; - std::vector output_length_; - int nx_, ny_, nz_; - std::vector segment_offsets_; + public: + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) {} + + SplitPlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &output_length_); + } + + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); + } + + const char *getPluginType() const override { return "split"; } + int getNbOutputs() const override { return output_length_.size(); } + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; protected: - virtual size_t getSerializationSize() override { + size_t getSerializationSize() override { return SerializedSize(axis_) + SerializedSize(output_length_) + getBaseSerializationSize(); } - // TRT will call this func when we need to serialize the configuration of - // tensorrt. - // It should not be called by users. - virtual void serialize(void *buffer) override { + void serialize(void *buffer) override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); } - public: - SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) { - assert(axis <= nvinfer1::Dims::MAX_DIMS); - } - - // It was used for tensorrt deserialization. - // It should not be called by users. 
- SplitPlugin(void const *serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); - DeserializeValue(&serialData, &serialLength, &axis_); - DeserializeValue(&serialData, &serialLength, &output_length_); - } - - SplitPlugin *clone() const override { - return new SplitPlugin(axis_, output_length_); - } - - virtual const char *getPluginType() const override { return "split"; } - virtual int getNbOutputs() const override { return output_length_.size(); } - virtual nvinfer1::Dims getOutputDimensions(int index, - const nvinfer1::Dims *inputs, - int nbInputDims) override; - virtual int initialize() override; - virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; + int axis_; + std::vector output_length_; + int nx_, ny_, nz_; + std::vector segment_offsets_; }; -} // tensorrt -} // inference -} // paddle +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 08016d84b1..b0f4cff3ac 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -17,6 +17,7 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, input_dims_); @@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, data_format_); } -void PluginTensorRT::deserializeBase(void const*& serialData, - size_t& serialLength) { - DeserializeValue(&serialData, &serialLength, &input_dims_); - DeserializeValue(&serialData, &serialLength, &max_batch_size_); - DeserializeValue(&serialData, &serialLength, &data_type_); - DeserializeValue(&serialData, &serialLength, &data_format_); +void PluginTensorRT::deserializeBase(void const*& serial_data, + size_t& serial_length) { + DeserializeValue(&serial_data, &serial_length, &input_dims_); + DeserializeValue(&serial_data, &serial_length, &max_batch_size_); + DeserializeValue(&serial_data, &serial_length, &data_type_); + DeserializeValue(&serial_data, &serial_length, &data_format_); } size_t PluginTensorRT::getBaseSerializationSize() { @@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, (format == nvinfer1::PluginFormat::kNCHW)); } -void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, - int nbInputs, - const nvinfer1::Dims* outputDims, - int nbOutputs, nvinfer1::DataType type, - nvinfer1::PluginFormat format, - int maxBatchSize) { +void PluginTensorRT::configureWithFormat( + const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, int max_batch_size) { data_type_ = type; data_format_ = format; - input_dims_.assign(inputDims, inputDims + nbInputs); - max_batch_size_ = maxBatchSize; + input_dims_.assign(input_dims, input_dims + num_inputs); + max_batch_size_ = max_batch_size; } +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 4d85e955a4..86084829e1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -14,23 +14,30 @@ #pragma once -#include +#include 
#include -#include #include #include -#include "NvInfer.h" #include "paddle/fluid/inference/tensorrt/plugin/serialize.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(profile); namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() {} + // It was used for TensorRT deserialization. + // It should not be called by users. PluginTensorRT(const void* serialized_data, size_t length) {} + virtual ~PluginTensorRT() {} + nvinfer1::Dims const& getInputDims(int index) const { return input_dims_.at(index); } @@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::DataType getDataType() const { return data_type_; } nvinfer1::PluginFormat getDataFormat() const { return data_format_; } virtual const char* getPluginVersion() const { return "1"; } + + void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); } + std::vector& GetInputs() { return inputs_; } + + virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + + // Following functions are inherit from nvinfer1::IPluginExt + // Get the number of outputs from the layer + int getNbOutputs() const { return 1; } + // Get the dimension of an output tensor + virtual nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims* input_dims, + int num_inputs) = 0; + // Find the workspace size required by the layer size_t getWorkspaceSize(int) const override { return 0; } + + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + // Shutdown the layer. This is called when the engine is destroyed void terminate() override {} - virtual ~PluginTensorRT() {} + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() = 0; + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. + virtual void serialize(void* buffer) = 0; + // Check format support. The default is FLOAT32 and NCHW. bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; - void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, - const nvinfer1::Dims* outputDims, int nbOutputs, + // Configure the layer + void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, - int maxBatchSize) override; - - // *NOTE* The following functions need to be overrided in the subclass. - virtual nvinfer1::IPluginExt* clone() const = 0; - virtual const char* getPluginType() const = 0; - // Initialize the layer for execution. This is called when the engine is - // created. - int initialize() override { return 0; } - // Serialize the layer config to buffer. 
- virtual void serialize(void* buffer) = 0; - virtual size_t getSerializationSize() = 0; - virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, - void* workspace, cudaStream_t stream) = 0; + int max_batch_size) override; protected: // Deserialize input_dims, max_batch_size, data_type, data_format - void deserializeBase(void const*& serialData, size_t& serialLength); + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT size_t getBaseSerializationSize(); // Serialize input_dims, max_batch_size, data_type, data_format - void serializeBase(void*& buffer); + void serializeBase(void*& buffer); // NOLINT std::vector input_dims_; size_t max_batch_size_; nvinfer1::DataType data_type_; nvinfer1::PluginFormat data_format_; + + std::vector inputs_; }; +} // namespace plugin } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index a404691413..e66ae28057 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { LOG(INFO) << *reinterpret_cast(config); return; } - LOG(INFO) << *config; + LOG(INFO) << *reinterpret_cast(config); } void CompareResult(const std::vector &outputs,
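
To make the elementwise plugin's shape bookkeeping concrete, here is the (prev, midd, post) decomposition from ElementWisePlugin::initialize() worked through on the shapes used by the plugin test above (per-sample X dims (10, 3, 3), Y dims (10, 1, 1), axis = -1). The snippet is a standalone sketch of the arithmetic, not engine code:

    #include <cassert>

    int main() {
      // axis == -1 resolves to nbDims(X) - nbDims(Y) = 3 - 3 = 0, then
      // Y's trailing 1s are trimmed: (10, 1, 1) -> (10), nbDims = 1.
      int x_dims[] = {10, 3, 3};
      int y_dims[] = {10};
      int axis = 0, x_nb = 3, y_nb = 1;

      int prev = 1, midd = 1, post = 1;
      for (int i = 0; i < axis; ++i) prev *= x_dims[i];
      for (int i = 0; i < y_nb; ++i) midd *= y_dims[i];  // X must match here
      for (int i = axis + y_nb; i < x_nb; ++i) post *= x_dims[i];

      // prev == 1, so enqueue() takes the ColumnWiseKernel path:
      // out[b][m][p] = op(x[b][m][p], y[b][m]) for m < midd, p < post.
      assert(prev == 1 && midd == 10 && post == 9);
      return 0;
    }
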