Merge pull request #16104 from tensor-tang/refine/jit

refine jitkernels and test
7 years ago · ccc7c358b3
parent c49b7855fa 14d871121b
commit ccc7c358b3
45 changed files with 1616 additions and 1273 deletions
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    Tensor track;
    int* track_value =
        track.mutable_data<int>(emission_dims, platform::CPUPlace());
-    auto ker = jit::Get<jit::kCRFDecoding, jit::CRFDecodingTuples<T>,
-                        platform::CPUPlace>(tag_num);
+    auto ker =
+        jit::KernelFuncs<jit::CRFDecodingTuple<T>, platform::CPUPlace>::Cache()
+            .At(tag_num);
    ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
    T max_score = -std::numeric_limits<T>::max();
    int max_i = 0;
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
        constexpr int simd_width = 16;
        int C = c / simd_width;

-        auto multiply = jit::Get<jit::kNCHW16CMulNC, jit::NCHW16CMulNCTuples<T>,
-                                 platform::CPUPlace>(0);
+        auto multiply = jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>,
+                                         platform::CPUPlace>::Cache()
+                            .At(0);
 #pragma omp parallel for collapse(2)
        for (int ni = 0; ni < n; ni++) {
          for (int ci = 0; ci < C; ci++) {
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
                                  out_width, jit::SeqPoolType::kSum);
    for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
      attr.index_height = ids_lod[i + 1] - ids_lod[i];
-      auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
-                                  platform::CPUPlace>(attr);
+      auto emb_seqpool =
+          jit::KernelFuncs<jit::EmbSeqPoolTuple<T>, platform::CPUPlace>::Cache()
+              .At(attr);
      emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
                  &attr);
    }
@ -135,8 +136,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
      T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
      const T *d_output_data = d_output->data<T>();

-      auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
-                                 platform::CPUPlace>(out_width);
+      auto vbroadcast =
+          jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache()
+              .At(out_width);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
        const T *src = d_output_data + i * out_width;
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel<T> {
  const int total_T = x_dims[0];           \
  const int D3 = wh_dims[1]

-#define INIT_OTHER_DEFINES                                                     \
-  auto* h0 = ctx.Input<Tensor>("H0");                                          \
-  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
-  auto* bias = ctx.Input<Tensor>("Bias");                                      \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
-  const int M = x_dims[1];                                                     \
-  const int D = wh_dims[0];                                                    \
-  const int D2 = D * 2;                                                        \
-  const jit::gru_attr_t attr(                                                  \
-      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),         \
-      jit::to_kerneltype(ctx.Attr<std::string>("activation")));                \
-  jit::gru_t one_step;                                                         \
-  auto ComputeH1 =                                                             \
-      jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr);      \
-  auto ComputeHtPart1 =                                                        \
-      jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
-  auto ComputeHtPart2 =                                                        \
-      jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
-  const T* x_data = x->data<T>();                                              \
-  const T* wx_data = wx->data<T>();                                            \
-  const T* wh_data = wh->data<T>();                                            \
-  auto place = ctx.GetPlace();                                                 \
+#define INIT_OTHER_DEFINES                                                   \
+  auto* h0 = ctx.Input<Tensor>("H0");                                        \
+  auto* wx = ctx.Input<Tensor>("WeightX");                                   \
+  auto* bias = ctx.Input<Tensor>("Bias");                                    \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                        \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                            \
+  const int M = x_dims[1];                                                   \
+  const int D = wh_dims[0];                                                  \
+  const int D2 = D * 2;                                                      \
+  const jit::gru_attr_t attr(                                                \
+      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),       \
+      jit::to_kerneltype(ctx.Attr<std::string>("activation")));              \
+  jit::gru_t one_step;                                                       \
+  auto ComputeH1 =                                                           \
+      jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At(  \
+          attr);                                                             \
+  auto ComputeHtPart1 =                                                      \
+      jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
+          .At(attr);                                                         \
+  auto ComputeHtPart2 =                                                      \
+      jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
+          .At(attr);                                                         \
+  const T* x_data = x->data<T>();                                            \
+  const T* wx_data = wx->data<T>();                                          \
+  const T* wh_data = wh->data<T>();                                          \
+  auto place = ctx.GetPlace();                                               \
  T* xx_data = xx->mutable_data<T>(place)

  void SeqCompute(const framework::ExecutionContext& ctx) const {
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  const int D = wh_dims[0];                                 \
  const int D4 = wh_dims[1]

-#define INIT_OTHER_DEFINES                                                    \
-  const T* x_data = x->data<T>();                                             \
-  const T* wx_data = wx->data<T>();                                           \
-  const T* wh_data = wh->data<T>();                                           \
-  /* diagonal weight*/                                                        \
-  const T* wp_data = bias->data<T>() + D4;                                    \
-  /* for peephole only*/                                                      \
-  T* checked_cell_data = nullptr;                                             \
-  auto place = ctx.GetPlace();                                                \
-  if (use_peepholes) {                                                        \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                          \
-    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                   \
-    checked_cell_data = checked_cell->mutable_data<T>(place);                 \
-  }                                                                           \
-  const jit::lstm_attr_t attr(                                                \
-      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),        \
-      jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")),      \
-      jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),           \
-      use_peepholes);                                                         \
-  jit::lstm_t one_step;                                                       \
-  one_step.wp = wp_data;                                                      \
-  one_step.checked = checked_cell_data;                                       \
-  auto ComputeC1H1 =                                                          \
-      jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
-  auto ComputeCtHt =                                                          \
-      jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
+#define INIT_OTHER_DEFINES                                                     \
+  const T* x_data = x->data<T>();                                              \
+  const T* wx_data = wx->data<T>();                                            \
+  const T* wh_data = wh->data<T>();                                            \
+  /* diagonal weight*/                                                         \
+  const T* wp_data = bias->data<T>() + D4;                                     \
+  /* for peephole only*/                                                       \
+  T* checked_cell_data = nullptr;                                              \
+  auto place = ctx.GetPlace();                                                 \
+  if (use_peepholes) {                                                         \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                           \
+    auto* checked_cell = ctx.Output<Tensor>("CheckedCell");                    \
+    checked_cell_data = checked_cell->mutable_data<T>(place);                  \
+  }                                                                            \
+  const jit::lstm_attr_t attr(                                                 \
+      D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),         \
+      jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")),       \
+      jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),            \
+      use_peepholes);                                                          \
+  jit::lstm_t one_step;                                                        \
+  one_step.wp = wp_data;                                                       \
+  one_step.checked = checked_cell_data;                                        \
+  auto ComputeC1H1 =                                                           \
+      jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
+          attr);                                                               \
+  auto ComputeCtHt =                                                           \
+      jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
+          attr)

 // Wh GEMM
 #define GEMM_WH_ADDON(bs, prev, out)                                           \
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@ -82,9 +82,11 @@ template <typename T>
 static void fc_relu(const T* x, const T* w, const T* b, T* y,
                    const jit::matmul_attr_t& attr) {
  auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
+      jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
+          attr);
  auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+      jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
+          attr.n);
  matmul(x, w, y, &attr);
  T* dst = y;
  for (int i = 0; i < attr.m; ++i) {
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
      attr.type = jit::SeqPoolType::kSqrt;
    }
    auto seqpool =
-        jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
+        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
            attr);
    size_t n = ins.size();
    size_t dst_step_size = n * w;
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
    int o_numel = attr.m * attr.n;

    auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
-                                                                       attr.k);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            attr.m * attr.k);
    auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
-                                                                       attr.n);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            attr.k * attr.n);
    auto vsquare_xy =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
    auto vsub =
-        jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VSubTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
    auto vscal =
-        jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
+        jit::KernelFuncs<jit::VScalTuple<T>, platform::CPUPlace>::Cache().At(
+            o_numel);
    auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
+        jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
+            attr);

    const T* x_data = x->data<T>();
    const T* y_data = y->data<T>();
--- a/paddle/fluid/operators/jit/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/CMakeLists.txt
@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")

-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)

 file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
 list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@ -13,6 +13,7 @@
 * limitations under the License. */

 #include "paddle/fluid/operators/jit/gen/act.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"

@ -81,7 +82,7 @@ void VActJitCode::genCode() {
 #define DECLARE_ACT_CREATOR(name)                                            \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
-    bool UseMe(const int& attr) const override;                              \
+    bool CanBeUsed(const int& attr) const override;                          \
    size_t CodeSize(const int& d) const override;                            \
    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
 DECLARE_ACT_CREATOR(VTanh);

 // TODO(TJ): tuning use me
-bool VReluCreator::UseMe(const int& d) const {
+bool VReluCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }

-bool VSquareCreator::UseMe(const int& d) const {
+bool VSquareCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }

-bool VIdentityCreator::UseMe(const int& d) const {
+bool VIdentityCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }

-bool VExpCreator::UseMe(const int& d) const {
+bool VExpCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d < 32;
 }

-bool VSigmoidCreator::UseMe(const int& d) const {
+bool VSigmoidCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }

-bool VTanhCreator::UseMe(const int& d) const {
+bool VTanhCreator::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx);
 }

--- a/paddle/fluid/operators/jit/gen/blas.cc
+++ b/paddle/fluid/operators/jit/gen/blas.cc
@ -13,6 +13,7 @@
 * limitations under the License. */

 #include "paddle/fluid/operators/jit/gen/blas.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"

@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {

 class NCHW16CMulNCCreator : public JitCodeCreator<int> {
 public:
-  bool UseMe(const int& attr) const override {
+  bool CanBeUsed(const int& attr) const override {
    return platform::MayIUse(platform::avx512f);
  }
  size_t CodeSize(const int& d) const override { return 256 * 1024; }
@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
 #define DECLARE_BLAS_CREATOR(name)                                           \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
+    bool CanBeUsed(const int& attr) const override {                         \
      return platform::MayIUse(platform::avx) && attr <= 1024;               \
    }                                                                        \
    size_t CodeSize(const int& d) const override {                           \
--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@ -14,6 +14,7 @@

 #include "paddle/fluid/operators/jit/gen/embseqpool.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {

 class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
 public:
-  bool UseMe(const emb_seq_pool_attr_t& attr) const override {
+  bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx) &&
           attr.table_width % YMM_FLOAT_BLOCK == 0;
  }
--- a/paddle/fluid/operators/jit/gen/gru.cc
+++ b/paddle/fluid/operators/jit/gen/gru.cc
@ -14,6 +14,7 @@

 #include "paddle/fluid/operators/jit/gen/gru.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"

@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
  class name##Creator : public JitCodeCreator<gru_attr_t> {       \
   public:                                                        \
    /* TODO(TJ): enable more */                                   \
-    bool UseMe(const gru_attr_t& attr) const override {           \
+    bool CanBeUsed(const gru_attr_t& attr) const override {       \
      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
    }                                                             \
    size_t CodeSize(const gru_attr_t& attr) const override {      \
--- a/paddle/fluid/operators/jit/gen/hopv.cc
+++ b/paddle/fluid/operators/jit/gen/hopv.cc
@ -13,6 +13,7 @@
 * limitations under the License. */

 #include "paddle/fluid/operators/jit/gen/hopv.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"

@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
 #define DECLARE_HOP_CREATOR(name)                                            \
  class name##Creator : public JitCodeCreator<int> {                         \
   public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
+    bool CanBeUsed(const int& attr) const override {                         \
      return platform::MayIUse(platform::avx);                               \
    }                                                                        \
    size_t CodeSize(const int& d) const override {                           \
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
  virtual void genCode() = 0;

  size_t getSize() const override { return CodeGenerator::getSize(); }
-  const unsigned char* getCodeInternal() override {
+  const unsigned char* getCodeInternal() const override {
    const Xbyak::uint8* code = CodeGenerator::getCode();
    return code;
  }
--- a/paddle/fluid/operators/jit/gen/lstm.cc
+++ b/paddle/fluid/operators/jit/gen/lstm.cc
@ -14,6 +14,7 @@

 #include "paddle/fluid/operators/jit/gen/lstm.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"

@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
  class name##Creator : public JitCodeCreator<lstm_attr_t> {      \
   public:                                                        \
    /* TODO(TJ): enable more */                                   \
-    bool UseMe(const lstm_attr_t& attr) const override {          \
+    bool CanBeUsed(const lstm_attr_t& attr) const override {      \
      return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
    }                                                             \
    size_t CodeSize(const lstm_attr_t& attr) const override {     \
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@ -14,8 +14,8 @@

 #include "paddle/fluid/operators/jit/gen/matmul.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
-
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"

@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {

 class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
 public:
-  bool UseMe(const matmul_attr_t& attr) const override {
+  bool CanBeUsed(const matmul_attr_t& attr) const override {
    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
           attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
  }
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@ -13,6 +13,7 @@
 * limitations under the License. */

 #include "paddle/fluid/operators/jit/gen/seqpool.h"
+#include <memory>
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {

 class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
 public:
-  bool UseMe(const seq_pool_attr_t& attr) const override {
+  bool CanBeUsed(const seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx);
  }
  size_t CodeSize(const seq_pool_attr_t& attr) const override {
--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@ -14,6 +14,7 @@

 #include "paddle/fluid/operators/jit/gen/sgd.h"
 #include <stddef.h>  // offsetof
+#include <memory>
 #include <vector>
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
@ -104,7 +105,7 @@ void SgdJitCode::genCode() {

 class SgdCreator : public JitCodeCreator<sgd_attr_t> {
 public:
-  bool UseMe(const sgd_attr_t& attr) const override {
+  bool CanBeUsed(const sgd_attr_t& attr) const override {
    return platform::MayIUse(platform::avx) &&
           attr.grad_width % YMM_FLOAT_BLOCK == 0;
  }
--- a/paddle/fluid/operators/jit/gen/vbroadcast.cc
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc
@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {

 class VBroadcastCreator : public JitCodeCreator<int64_t> {
 public:
-  bool UseMe(const int64_t& w) const override {
+  bool CanBeUsed(const int64_t& w) const override {
    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
  }
  size_t CodeSize(const int64_t& w) const override {
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@ -31,7 +31,7 @@ namespace paddle {
 namespace operators {
 namespace jit {

-// refer do not need useme, it would be the last one.
+// refer do not need CanBeUsed, it would be the last one.
 void GenBase::dumpCode(const unsigned char* code) const {
  if (code) {
    static int counter = 0;
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@ -31,9 +31,10 @@ class GenBase : public Kernel {
  virtual ~GenBase() = default;
  virtual std::string name() const = 0;
  virtual size_t getSize() const = 0;
-  virtual const unsigned char* getCodeInternal() = 0;
+  virtual const unsigned char* getCodeInternal() const = 0;
+  const char* ImplType() const override { return "JitCode"; }
  template <typename Func>
-  Func getCode() {
+  Func getCode() const {
    const unsigned char* code = this->getCodeInternal();
    if (FLAGS_dump_jitcode) {
      this->dumpCode(code);
@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
  virtual ~JitCodeCreator() = default;

  // condition when this jit code can be used.
-  virtual bool UseMe(const Attr& attr) const = 0;
+  virtual bool CanBeUsed(const Attr& attr) const = 0;

  // estimate this code size
  virtual size_t CodeSize(const Attr& attr) const = 0;
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@ -16,6 +16,8 @@

 #include <iostream>
 #include <string>
+#include <unordered_map>
+#include <utility>  // for std::move
 #include <vector>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
@ -27,35 +29,34 @@ namespace paddle {
 namespace operators {
 namespace jit {

-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
-    std::is_same<typename KernelTuples::data_type, float>::value &&
+    std::is_same<typename KernelTuple::data_type, float>::value &&
        std::is_same<PlaceType, platform::CPUPlace>::value,
-    typename KernelTuples::func_type>::type
-GetJitCode(const typename KernelTuples::attr_type& attr) {
-  using Func = typename KernelTuples::func_type;
-  using Attr = typename KernelTuples::attr_type;
-  size_t key = JitCodeKey<Attr>(attr);
-  auto& codes = JitCodePool<KT>().Instance();
+    const Kernel*>::type
+GetJitCode(const typename KernelTuple::attr_type& attr) {
+  using Attr = typename KernelTuple::attr_type;
+  int64_t key = JitCodeKey<Attr>(attr);
+  auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
  if (codes.Has(key)) {
-    return codes.AllKernels().at(key)->template getCode<Func>();
+    return codes.AllKernels().at(key).get();
  }

  // creator is not related with attr, so can use KernelKey as key
-  KernelKey kkey(KT, PlaceType());
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
  // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
-  auto& creator_map = JitCodeCreatorPool().Instance().AllCreators();
+  auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
  auto iter = creator_map.find(kkey);
  if (iter != creator_map.end()) {
    auto& creators = iter->second;
    for (auto& cur : creators) {
      auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
-      if (i && i->UseMe(attr)) {
+      if (i && i->CanBeUsed(attr)) {
        auto p = i->CreateJitCode(attr);
        if (p) {
-          auto f = p->template getCode<Func>();
+          auto res = p.get();
          codes.Insert(key, std::move(p));
-          return f;
+          return res;
        }
      }
    }
@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
  return nullptr;
 }

-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
-    !std::is_same<typename KernelTuples::data_type, float>::value ||
+    !std::is_same<typename KernelTuple::data_type, float>::value ||
        !std::is_same<PlaceType, platform::CPUPlace>::value,
-    typename KernelTuples::func_type>::type
-GetJitCode(const typename KernelTuples::attr_type& attr) {
+    const Kernel*>::type
+GetJitCode(const typename KernelTuple::attr_type& attr) {
  return nullptr;
 }

 // Refer code do not related with attr, which is just for cast
 // Refer is always on CPUPlace
-template <KernelType KT, typename KernelTuples>
-inline typename KernelTuples::func_type GetRefer() {
-  auto& ref_pool = ReferKernelPool().Instance().AllKernels();
-  KernelKey kkey(KT, platform::CPUPlace());
+template <typename KernelTuple>
+inline const Kernel* GetReferKernel() {
+  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
+  KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
  auto ref_iter = ref_pool.find(kkey);
  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
                 "Every Kernel should have reference function.");
  auto& ref_impls = ref_iter->second;
  for (auto& impl : ref_impls) {
-    auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get());
+    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
    if (i) {
-      return i->GetFunc();
+      return i;
    }
  }
  return nullptr;
 }

-template <KernelType KT, typename KernelTuples,
-          typename PlaceType = platform::CPUPlace>
-typename KernelTuples::func_type Get(
-    const typename KernelTuples::attr_type& attr) {
-  auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr);
-  if (jitfunc) {
-    return jitfunc;
+template <typename KernelTuple>
+inline typename KernelTuple::func_type GetReferFunc() {
+  auto ker = GetReferKernel<KernelTuple>();
+  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
+  PADDLE_ENFORCE(p, "The Refer kernel should exsit");
+  return p->GetFunc();
+}
+
+// Return all Kernels that can be used
+template <typename KernelTuple, typename PlaceType>
+std::vector<const Kernel*> GetAllCandidateKernels(
+    const typename KernelTuple::attr_type& attr) {
+  // the search order shoudl be jitcode > more > refer
+  std::vector<const Kernel*> res;
+  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
+  if (jitker) {
+    res.emplace_back(jitker);
  }

-  // pool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KT, PlaceType());
-  auto& pool = KernelPool().Instance().AllKernels();
+  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
+  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
+  auto& pool = KernelPool::Instance().AllKernels();
  auto iter = pool.find(kkey);
  if (iter != pool.end()) {
    auto& impls = iter->second;
    for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
-      if (i && i->UseMe(attr)) {
-        return i->GetFunc();
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
+      if (i && i->CanBeUsed(attr)) {
+        res.emplace_back(i);
      }
    }
  }

  // The last implementation should be reference function on CPUPlace.
-  return GetRefer<KT, KernelTuples>();
+  auto ref = GetReferKernel<KernelTuple>();
+  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
+  res.emplace_back(ref);
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+std::vector<std::pair<std::string, typename KernelTuple::func_type>>
+GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
+  using Func = typename KernelTuple::func_type;
+  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
+  std::vector<std::pair<std::string, Func>> res;
+  for (auto k : kers) {
+    std::string name = k->ImplType();
+    if (name == "JitCode") {
+      auto i = dynamic_cast<const GenBase*>(k);
+      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
+      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
+    } else {
+      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
+      PADDLE_ENFORCE(i, "kernel cast can not fail.");
+      res.emplace_back(std::make_pair(name, i->GetFunc()));
+    }
+  }
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
+  std::vector<typename KernelTuple::func_type> res;
+  for (auto& i : funcs) {
+    res.emplace_back(i.second);
+  }
+  return res;
+}
+
+template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
+typename KernelTuple::func_type GetDefaultBestFunc(
+    const typename KernelTuple::attr_type& attr) {
+  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
+  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
+  // Here could do some runtime benchmark of this attr and return the best one.
+  // But yet just get the first one as the default best one,
+  // which is searched in order and tuned by offline.
+  return funcs[0];
 }

-template <KernelType KT, typename KernelTuples, typename PlaceType>
+template <typename KernelTuple, typename PlaceType>
 class KernelFuncs {
 public:
  KernelFuncs() = default;
  static KernelFuncs& Cache() {
-    static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
+    static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
    return g_func_cache;
  }

-  bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
-
-  void Insert(int key, typename KernelTuples::func_type func) {
-    funcs_.emplace(key, func);
-  }
-
-  typename KernelTuples::func_type At(int key) {
+  // the exposed interface to use
+  typename KernelTuple::func_type At(
+      const typename KernelTuple::attr_type& attr) {
+    // Maybe here is not good enough, not all kernels should have jitcode
+    int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
    if (Has(key)) {
      return funcs_.at(key);
    }
-    auto func = Get<KT, KernelTuples, PlaceType>(key);
+    // If do not have this attr in cache then get the default best
+    auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
    Insert(key, func);
    return func;
  }

+  typename KernelTuple::func_type operator[](
+      const typename KernelTuple::attr_type& attr) {
+    return At(attr);
+  }
+
+ protected:
+  bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
+  void Insert(int64_t key, typename KernelTuple::func_type func) {
+    funcs_.emplace(key, func);
+  }
+
 private:
-  std::unordered_map<int, typename KernelTuples::func_type> funcs_;
+  std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
  DISABLE_COPY_AND_ASSIGN(KernelFuncs);
 };

--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@ -62,26 +62,55 @@ typedef enum {
  kSqrt,
 } SeqPoolType;

+// x, y, z, n
 template <typename T>
-struct XYZNTuples {
+struct XYZNTuple {
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, const T*, T*, int);
 };

+// a, x, y, n
 template <typename T>
-struct AXYNTuples : public XYZNTuples<T> {};
+struct AXYNTuple : public XYZNTuple<T> {};

+// x, y, n
 template <typename T>
-struct XYNTuples {
+struct XYNTuple {
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, T*, int);
 };

-// x, return and int
+// x, returned value, n
 template <typename T>
-struct XRNTuples : public XYNTuples<T> {};
+struct XRNTuple : public XYNTuple<T> {};
+
+#define DECLARE_KERNELTUPLE(kernel_tuple, type)        \
+  template <typename T>                                \
+  struct type##Tuple : public kernel_tuple<T> {        \
+    static constexpr KernelType kernel_type = k##type; \
+  }
+
+// Tuple should be corresponding to the KernelType
+DECLARE_KERNELTUPLE(XYZNTuple, VMul);
+DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
+DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
+DECLARE_KERNELTUPLE(XYZNTuple, VSub);
+
+DECLARE_KERNELTUPLE(AXYNTuple, VScal);
+DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
+
+DECLARE_KERNELTUPLE(XYNTuple, VRelu);
+DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
+DECLARE_KERNELTUPLE(XYNTuple, VSquare);
+DECLARE_KERNELTUPLE(XYNTuple, VExp);
+DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
+DECLARE_KERNELTUPLE(XYNTuple, VTanh);
+DECLARE_KERNELTUPLE(XYNTuple, VCopy);
+
+DECLARE_KERNELTUPLE(XRNTuple, HMax);
+DECLARE_KERNELTUPLE(XRNTuple, HSum);

 typedef struct {
  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
 typedef struct lstm_attr_s lstm_attr_t;

 template <typename T>
-struct LSTMTuples {
+struct LSTMTuple {
  typedef T data_type;
  typedef lstm_attr_t attr_type;
  typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
 };

 template <typename T>
-struct GRUTuples {
+struct GRUTuple {
  typedef T data_type;
  typedef gru_attr_t attr_type;
  typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };

+DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt);
+DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1);
+
+DECLARE_KERNELTUPLE(GRUTuple, GRUH1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1);
+DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2);
+
+#undef DECLARE_KERNELTUPLE
+
 template <typename T>
-struct VBroadcastTuples {
+struct VBroadcastTuple {
+  static constexpr KernelType kernel_type = kVBroadcast;
  typedef T data_type;
  typedef int64_t attr_type;
  typedef void (*func_type)(const T*, T*, int64_t, int64_t);
@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
 } seq_pool_attr_t;

 template <typename T>
-struct SeqPoolTuples {
+struct SeqPoolTuple {
+  static constexpr KernelType kernel_type = kSeqPool;
  typedef T data_type;
  typedef seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
 } emb_seq_pool_attr_t;

 template <typename T>
-struct EmbSeqPoolTuples {
+struct EmbSeqPoolTuple {
+  static constexpr KernelType kernel_type = kEmbSeqPool;
  typedef T data_type;
  typedef emb_seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, const int64_t*, T*,
@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
 } sgd_attr_t;

 template <typename T>
-struct SgdTuples {
+struct SgdTuple {
+  static constexpr KernelType kernel_type = kSgd;
  typedef T data_type;
  typedef sgd_attr_t attr_type;
  typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
 } matmul_attr_t;

 template <typename T>
-struct MatMulTuples {
+struct MatMulTuple {
+  static constexpr KernelType kernel_type = kMatMul;
  typedef T data_type;
  typedef matmul_attr_t attr_type;
  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
 };

 template <typename T>
-struct CRFDecodingTuples {
+struct CRFDecodingTuple {
+  static constexpr KernelType kernel_type = kCRFDecoding;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
 };

 template <typename T>
-struct LayerNormTuples {
+struct LayerNormTuple {
+  static constexpr KernelType kernel_type = kLayerNorm;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
@ -236,7 +281,8 @@ struct LayerNormTuples {
 };

 template <typename T>
-struct SoftmaxTuples {
+struct SoftmaxTuple {
+  static constexpr KernelType kernel_type = kSoftmax;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, T*, int, int);
@ -244,7 +290,8 @@ struct SoftmaxTuples {

 // nChw16c = nChw16c .* NC
 template <typename T>
-struct NCHW16CMulNCTuples {
+struct NCHW16CMulNCTuple {
+  static constexpr KernelType kernel_type = kNCHW16CMulNC;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, const T*, T*, int, int);
@ -255,28 +302,29 @@ class Kernel {
 public:
  Kernel() = default;
  virtual ~Kernel() = default;
+  virtual const char* ImplType() const = 0;
  DISABLE_COPY_AND_ASSIGN(Kernel);
 };

-template <typename KernelTuples>
+template <typename KernelTuple>
 class KernelMore : public Kernel {
 public:
-  using T = typename KernelTuples::data_type;
-  using Func = typename KernelTuples::func_type;
-  using Attr = typename KernelTuples::attr_type;
+  using T = typename KernelTuple::data_type;
+  using Func = typename KernelTuple::func_type;
+  using Attr = typename KernelTuple::attr_type;
  virtual Func GetFunc() const { return func; }
-  virtual bool UseMe(const Attr& attr) const = 0;
-  virtual const char* ImplType() const = 0;
+  // specify this kernel can be used, means it should not fail if use it.
+  virtual bool CanBeUsed(const Attr& attr) const = 0;

 protected:
  Func func{nullptr};
 };

-template <typename KernelTuples>
-class ReferKernel : public KernelMore<KernelTuples> {
+template <typename KernelTuple>
+class ReferKernel : public KernelMore<KernelTuple> {
 public:
  // Refer code can always be used
-  bool UseMe(const typename KernelTuples::attr_type& attr) const override {
+  bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override {
    return true;
  }
  const char* ImplType() const override { return "Refer"; }
--- a/Show More
+++ b/Show More