parent 95fb31285c
commit d53c4756ad
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,90 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/jit_gen.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include "paddle/fluid/platform/cpu_info.h"

DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace gen {

constexpr Xbyak::Operand::Code g_abi_regs[] = {
    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};

constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);

void JitCode::preCode() {
  for (int i = 0; i < num_g_abi_regs; ++i) {
    push(Xbyak::Reg64(g_abi_regs[i]));
  }
  if (platform::MayIUse(platform::avx512f)) {
    mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
  }
}

void JitCode::postCode() {
  for (int i = 0; i < num_g_abi_regs; ++i) {
    pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i]));
  }
  ret();
}

void JitCode::dumpCode(const Xbyak::uint8 *code) const {
  if (code) {
    static int counter = 0;
    std::ostringstream filename;
    filename << "paddle_jitcode_" << name() << "." << counter << ".bin";
    counter++;
    std::ofstream fout(filename.str(), std::ios::out);
    if (fout.is_open()) {
      fout.write(reinterpret_cast<const char *>(code), getSize());
      fout.close();
    }
  }
}

Xbyak::Address JitCode::EVEX_compress_addr(Xbyak::Reg64 base, int offt,
                                           bool bcast) {
  int scale = 0;
  if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
    offt = offt - 2 * EVEX_max_8b_offt;
    scale = 1;
  } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
    offt = offt - 4 * EVEX_max_8b_offt;
    scale = 2;
  }
  auto re = Xbyak::RegExp() + base + offt;
  if (scale) {
    re = re + reg_EVEX_max_8b_offt * scale;
  }
  if (bcast) {
    return zword_b[re];
  } else {
    return zword[re];
  }
}

}  // namespace gen
}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
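Editor's note: a worked example of the offset folding in EVEX_compress_addr above, under the assumption that preCode() has loaded reg_EVEX_max_8b_offt with 2 * EVEX_max_8b_offt = 0x400 (constants taken from the header below; the concrete offsets are illustrative only):

// Illustration only, assuming EVEX_max_8b_offt == 0x200 and
// reg_EVEX_max_8b_offt holding 0x400 at runtime.
//
//   offt = 0x300  ->  0x200 <= 0x300 < 0x600, so offt -= 0x400, scale = 1
//                     address = base + (0x300 - 0x400) + 0x400 * 1
//                             = base + 0x300   (same address, but the
//                               displacement that remains in the encoding
//                               is small, which presumably lets the EVEX
//                               disp8*N compressed form be used)
//
//   offt = 0x700  ->  0x600 <= 0x700 < 0xA00, so offt -= 0x800, scale = 2
//                     address = base + (0x700 - 0x800) + 0x400 * 2
//                             = base + 0x700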
@@ -1,80 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <gflags/gflags.h>
#include <type_traits>
#include "paddle/fluid/platform/macros.h"

#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"

DECLARE_bool(dump_jitcode);

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace gen {

#define DECLARE_JIT_CODE(codename) \
  const char *name() const override { return #codename; }

// Application Binary Interface
constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
    abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
    abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX);

class JitCode : public Xbyak::CodeGenerator {
 public:
  explicit JitCode(size_t code_size = 256 * 1024, void *code_ptr = nullptr)
      : Xbyak::CodeGenerator(code_size, code_ptr) {}

  virtual ~JitCode() {}
  virtual const char *name() const = 0;
  virtual void generate() = 0;

  template <typename FUNC>
  const FUNC getCode() {
    this->generate();
    const Xbyak::uint8 *code = CodeGenerator::getCode();
    if (FLAGS_dump_jitcode) {
      this->dumpCode(code);
    }
    return reinterpret_cast<const FUNC>(code);
  }
  DISABLE_COPY_AND_ASSIGN(JitCode);

 protected:
  Xbyak::Reg64 param1{abi_param1};
  const int EVEX_max_8b_offt = 0x200;
  const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;

  void preCode();
  void postCode();
  void dumpCode(const Xbyak::uint8 *code) const;
  void L(const char *label) { Xbyak::CodeGenerator::L(label); }
  void L(const Xbyak::Label &label) { Xbyak::CodeGenerator::L(label); }
  // Enhanced vector extension
  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
                                    bool bcast = false);
};

}  // namespace gen
}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
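Editor's note: a minimal sketch of how a concrete generator builds on this base class. The class name VFooJitCode and its loop body are hypothetical, not part of the removed code; the pattern (implement generate(), bracket the emitted body with preCode()/postCode(), fetch a typed function pointer via getCode<FUNC>()) follows the interface shown above.

// Hypothetical illustration only (assumes the jit_gen.h declarations above).
class VFooJitCode : public JitCode {
 public:
  DECLARE_JIT_CODE(VFooJitCode);
  explicit VFooJitCode(size_t code_size = 256 * 1024) : JitCode(code_size) {}
  void generate() override {
    preCode();   // push callee-saved ABI registers (and set up EVEX offset reg)
    // ... emit Xbyak instructions reading abi_param1 / abi_param2 here ...
    postCode();  // pop the registers and ret()
  }
};

// Usage sketch: JIT the code once, then call it like a plain C function.
//   VFooJitCode jit;
//   auto fn = jit.getCode<void (*)(const float *, float *, int)>();
//   fn(x, y, n);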
@@ -1,39 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/jit_kernel.h"
#include <iostream>
#include <string>

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {

KernelPool& KernelPool::Instance() {
  static thread_local KernelPool g_jit_kernels;
  return g_jit_kernels;
}

std::shared_ptr<const Kernel> KernelPool::Get(const std::string& key) const {
  if (kers_.find(key) == kers_.end()) {
    return nullptr;
  }
  return kers_.at(key);
}

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
@@ -1,157 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <functional>
#include <memory>  // for shared_ptr
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/math/jit_kernel_impl.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/macros.h"

// Note: Only supported on CPU yet.
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {

// TODO(TJ): remove me
typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block;

class Kernel {
 public:
  Kernel() = default;
  virtual ~Kernel() = default;
  // TODO(TJ): below members should be deprecated.
  int num_{0};
  int end_{0};
  int rest_{0};
  DISABLE_COPY_AND_ASSIGN(Kernel);
};

class KernelPool {
 public:
  static KernelPool &Instance();

  template <typename Ker, typename... ARGS>
  std::shared_ptr<const Ker> Get(ARGS... args);

  std::shared_ptr<const Kernel> Get(const std::string &key) const;

 private:
  KernelPool() = default;
  std::unordered_map<std::string, std::shared_ptr<const Kernel>> kers_;

  DISABLE_COPY_AND_ASSIGN(KernelPool);
};

template <typename T>
class VMulKernel : public Kernel {
 public:
  void (*Compute)(const T *, const T *, T *, int);
};

template <typename T>
class VAddKernel : public Kernel {
 public:
  void (*Compute)(const T *, const T *, T *, int);
};

template <typename T>
class VAddReluKernel : public Kernel {
 public:
  void (*Compute)(const T *, const T *, T *, int);
};

template <typename T>
class VScalKernel : public Kernel {
 public:
  // y = a.*x
  void (*Compute)(const T *, const T *, T *, int);
};

template <typename T>
class VAddBiasKernel : public Kernel {
 public:
  // y = a.+x
  void (*Compute)(const T *, const T *, T *, int);
};

#ifdef PADDLE_WITH_MKLDNN
template <typename T>
class EltwiseMulnChw16cNCKernel : public Kernel {
 public:
  // nChw16c = nChw16c .* NC
  void (*Compute)(const float *, const float *, float *, int, int);
};
#endif

template <typename T>
class VActKernel : public Kernel {
 public:
  void (*Compute)(const T *, T *, int);
};

template <typename T>
class VReluKernel : public VActKernel<T> {};

template <typename T>
class VIdentityKernel : public VActKernel<T> {};

template <typename T>
class VExpKernel : public VActKernel<T> {};

template <typename T>
class VSigmoidKernel : public VActKernel<T> {};

template <typename T>
class VTanhKernel : public VActKernel<T> {};

template <typename T>
class LSTMKernel : public Kernel {
 public:
  // compute c1 and h1 without c0 or h0
  void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *);
  void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *);
};

template <typename T>
class GRUKernel : public Kernel {
 public:
  // compute h1 without h0
  void (*ComputeH1)(gru_t *, const gru_attr_t *);
  void (*ComputeHtPart1)(gru_t *, const gru_attr_t *);
  void (*ComputeHtPart2)(gru_t *, const gru_attr_t *);
};

template <typename T>
class CRFDecodeKernel : public Kernel {
 public:
  virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
                       int *track) const = 0;
};

template <typename T>
class LayerNormKernel : public Kernel {
 public:
  virtual void Compute(T *x, T *out, T *mean, T *var, const T *scale,
                       const T *bias, int height,
                       const float epsilon) const = 0;
};

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
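Editor's note: a hedged caller-side sketch of how a kernel is usually obtained from this pool. Namespace qualifiers are omitted for brevity, and it assumes a VMulKernel<float> implementation is registered elsewhere (e.g. in one of the suppressed blas files); the function name VMulExample and the data are placeholders. The templated Get caches one instance per string key, so repeated lookups are cheap.

// Hypothetical usage sketch (not from the removed files).
#include <vector>

void VMulExample() {
  const int d = 256;  // vector length
  std::vector<float> x(d, 1.0f), y(d, 2.0f), out(d);
  const auto ker = KernelPool::Instance().Get<VMulKernel<float>>(d);
  ker->Compute(x.data(), y.data(), out.data(), d);  // out[i] = x[i] * y[i]
}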
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,195 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"

#ifdef PADDLE_WITH_XBYAK
#include "paddle/fluid/operators/math/jit_code.h"
#endif

#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {

/* VExp JitKernel */
template <typename T>
class VExpKernelImpl : public VExpKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VExpKernelImpl(int d) : VExpKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 70 * 8;
      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::exp,
                                          sz > 4096 ? sz : 4096));
      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
      return;
    }
#endif
#ifdef PADDLE_WITH_MKLML
    if (useMKL(d)) {
      this->Compute = VExpMKL<T>;
      return;
    }
#endif
    this->Compute = refer::VExp<T>;
  }

#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VExpKernelImpl<float>::useJIT(int d) {
  return gen::VActJitCode::init(d, gen::operand_type::exp);
}
#endif

#ifdef PADDLE_WITH_MKLML
template <>
bool VExpKernelImpl<float>::useMKL(int d) {
  return d > 512;
}

template <>
bool VExpKernelImpl<double>::useMKL(int d) {
  return true;
}

#endif

/* VSigmoid JitKernel */
template <typename T>
class VSigmoidKernelImpl : public VSigmoidKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VSigmoidKernelImpl(int d) : VSigmoidKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 82 * 8;
      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::sigmoid,
                                          sz > 4096 ? sz : 4096));
      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
      return;
    }
#endif

#ifdef PADDLE_WITH_MKLML
    // strictly speaking, the MKL implementation is better than the reference
    if (useMKL(d)) {
      this->Compute = VSigmoidMKL<T>;
      return;
    }
#endif
    this->Compute = refer::VSigmoid<T>;
  }

#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VSigmoidKernelImpl<float>::useJIT(int d) {
  return gen::VActJitCode::init(d, gen::operand_type::sigmoid);
}
#endif

#ifdef PADDLE_WITH_MKLML
template <>
bool VSigmoidKernelImpl<float>::useMKL(int d) {
  return d > 512;
}

template <>
bool VSigmoidKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif

/* VTanh JitKernel */
template <typename T>
class VTanhKernelImpl : public VTanhKernel<T> {
 public:
  JITKERNEL_DECLARE_STATIC_FUNC;
  explicit VTanhKernelImpl(int d) : VTanhKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
    if (useJIT(d)) {
      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 84 * 8;
      jitcode_.reset(new gen::VActJitCode(d, gen::operand_type::tanh,
                                          sz > 4096 ? sz : 4096));
      this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
      return;
    }
#endif

#ifdef PADDLE_WITH_MKLML
    // strictly speaking, the MKL implementation is better than the reference
    if (useMKL(d)) {
      this->Compute = VTanhMKL<T>;
      return;
    }
#endif
    this->Compute = refer::VTanh<T>;
  }

#ifdef PADDLE_WITH_XBYAK

 private:
  std::unique_ptr<gen::VActJitCode> jitcode_{nullptr};
#endif
};

#ifdef PADDLE_WITH_XBYAK
template <>
bool VTanhKernelImpl<float>::useJIT(int d) {
  return gen::VActJitCode::init(d, gen::operand_type::tanh);
}
#endif

#ifdef PADDLE_WITH_MKLML
template <>
bool VTanhKernelImpl<float>::useMKL(int d) {
  return d > 512;
}

template <>
bool VTanhKernelImpl<double>::useMKL(int d) {
  return true;
}
#endif

REGISTER_JITKERNEL(vexp, VExpKernel);
REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel);
REGISTER_JITKERNEL(vtanh, VTanhKernel);

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
@@ -1,34 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <string>
#include <type_traits>

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {

#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
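Editor's note: a minimal sketch of how the SIGMOID_THRESHOLD_MIN/MAX constants above are typically applied, modelled on the reference (non-JIT) sigmoid kernel; the function name VSigmoidRef is illustrative and not part of the removed files. Inputs are clamped before exp() so the exponential cannot overflow.

// Sketch only, assuming the macros defined in the header above are visible.
#include <cmath>

template <typename T>
void VSigmoidRef(const T* x, T* y, int n) {
  const T min = SIGMOID_THRESHOLD_MIN;  // clamp lower bound
  const T max = SIGMOID_THRESHOLD_MAX;  // clamp upper bound
  for (int i = 0; i < n; ++i) {
    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
  }
}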
@@ -1,239 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <math.h>
#include <limits>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {

/* Layer Norm JitKernel */
template <typename T, platform::cpu_isa_t isa, jit_block>
class LayerNormKernelImpl : public LayerNormKernel<T> {
 public:
  explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
    this->num_ = right;
  }

  void Compute(T* x, T* out, T* mean, T* var, const T* scale, const T* bias,
               int height, const float epsilon) const override {
    // get mean
    for (int i = 0; i < height; i++) {
      T sum = 0.0;
      int offset = i * this->num_;
      for (int j = 0; j < this->num_; j++) {
        sum += x[offset + j];
      }
      mean[i] = sum / this->num_;
    }

    // get variance
    for (int i = 0; i < height; i++) {
      T sum = 0.0;
      int offset = i * this->num_;
      for (int j = 0; j < this->num_; j++) {
        sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]);
      }
      var[i] = sum / this->num_;
    }

    for (int i = 0; i < height; i++) {
      int offset = i * this->num_;
      T sqrt_var = sqrt(var[i] + (T)epsilon);
      for (int j = 0; j < this->num_; j++) {
        out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var;
      }
    }
    if (scale) {
      for (int i = 0; i < height; i++) {
        int offset = i * this->num_;
        for (int j = 0; j < this->num_; j++) {
          out[offset + j] *= scale[j];
        }
      }
    }

    if (bias) {
      for (int i = 0; i < height; i++) {
        int offset = i * this->num_;
        for (int j = 0; j < this->num_; j++) {
          out[offset + j] += bias[j];
        }
      }
    }
  }
};

#define INTRIAVX_FLOAT(isa, jit_block) \
  template <> \
  LayerNormKernelImpl<float, isa, jit_block>::LayerNormKernelImpl(int right) \
      : LayerNormKernel<float>() { \
    this->num_ = right; \
    this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
    this->end_ = this->num_ - this->rest_; \
  } \
  template <> \
  void LayerNormKernelImpl<float, isa, jit_block>::Compute( \
      float* x, float* out, float* mean, float* var, const float* scale, \
      const float* bias, int height, const float epsilon) const { \
    __m256 sum; \
    __m256 mean_vec, var_vec; \
    __m128 hi, lo; \
    __m256 tmp; \
    size_t offset; \
    size_t j; \
    size_t block = YMM_FLOAT_BLOCK; \
    __m256 reverse_num_vec = \
        _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_)); \
    __m256 epsilon_vec = _mm256_set1_ps(epsilon); \
    int rest_mask = \
        ((-1) & (~((~0U) >> (sizeof(int) * 8 - (YMM_FLOAT_BLOCK - rest_))))) & \
        0x0ff; \
    __m256i mask_vec = _mm256_set_epi32( \
        rest_mask & 0x80 ? 0xffffffff : 0, rest_mask & 0x40 ? 0xffffffff : 0, \
        rest_mask & 0x20 ? 0xffffffff : 0, rest_mask & 0x10 ? 0xffffffff : 0, \
        rest_mask & 0x8 ? 0xffffffff : 0, rest_mask & 0x4 ? 0xffffffff : 0, \
        rest_mask & 0x2 ? 0xffffffff : 0, rest_mask & 0x1 ? 0xffffffff : 0); \
    \
    for (int i = 0; i < height; ++i) { \
      offset = i * this->num_; \
      \
      /* get mean */ \
      sum = _mm256_setzero_ps(); \
      for (j = offset; j < end_ + offset; j += block) { \
        sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j)); \
      } \
      if (rest_ != 0) { \
        j = offset + this->num_ - block; \
        tmp = _mm256_loadu_ps((const float*)x + j); \
        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
        sum = _mm256_add_ps(sum, tmp); \
      } \
      hi = _mm256_extractf128_ps(sum, 1); \
      lo = _mm256_extractf128_ps(sum, 0); \
      sum = _mm256_add_ps( \
          sum, _mm256_insertf128_ps( \
                   _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); \
      sum = _mm256_hadd_ps(sum, sum); \
      sum = _mm256_hadd_ps(sum, sum); \
      mean_vec = _mm256_mul_ps(sum, reverse_num_vec); \
      mean[i] = *reinterpret_cast<float*>(&mean_vec); \
      \
      /* get variance */ \
      sum = _mm256_setzero_ps(); \
      for (j = offset; j < end_ + offset; j += block) { \
        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
        tmp = _mm256_mul_ps(tmp, tmp); \
        sum = _mm256_add_ps(sum, tmp); \
      } \
      if (rest_ != 0) { \
        j = offset + this->num_ - block; \
        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
        tmp = _mm256_mul_ps(tmp, tmp); \
        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
        sum = _mm256_add_ps(sum, tmp); \
      } \
      hi = _mm256_extractf128_ps(sum, 1); \
      lo = _mm256_extractf128_ps(sum, 0); \
      sum = _mm256_add_ps( \
          sum, _mm256_insertf128_ps( \
                   _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); \
      sum = _mm256_hadd_ps(sum, sum); \
      sum = _mm256_hadd_ps(sum, sum); \
      var_vec = _mm256_mul_ps(sum, reverse_num_vec); \
      var[i] = *reinterpret_cast<float*>(&var_vec); \
      \
      /* get x_norm and calculate output */ \
      for (j = offset; j < end_ + offset; j += block) { \
        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
        tmp = _mm256_div_ps( \
            tmp, _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); \
        _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp); \
      } \
      if (rest_ != 0) { \
        j = offset + num_ - block; \
        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
        tmp = _mm256_div_ps( \
            tmp, _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); \
        _mm256_storeu_ps(reinterpret_cast<float*>(out) + j, tmp); \
      } \
      \
      if (scale) { \
        if (rest_ != 0) { \
          j = offset + this->num_ - block; \
          tmp = _mm256_loadu_ps((const float*)out + j); \
        } \
        for (j = offset; j < end_ + offset; j += block) { \
          _mm256_storeu_ps( \
              reinterpret_cast<float*>(out) + j, \
              _mm256_mul_ps( \
                  _mm256_loadu_ps((const float*)out + j), \
                  _mm256_loadu_ps((const float*)scale + j - offset))); \
        } \
        if (rest_ != 0) { \
          j = offset + this->num_ - block; \
          _mm256_storeu_ps( \
              reinterpret_cast<float*>(out) + j, \
              _mm256_mul_ps( \
                  tmp, _mm256_loadu_ps((const float*)scale + j - offset))); \
        } \
      } \
      \
      if (bias) { \
        if (rest_ != 0) { \
          j = offset + this->num_ - block; \
          tmp = _mm256_loadu_ps((const float*)out + j); \
        } \
        for (j = offset; j < end_ + offset; j += block) { \
          _mm256_storeu_ps( \
              reinterpret_cast<float*>(out) + j, \
              _mm256_add_ps( \
                  _mm256_loadu_ps((const float*)out + j), \
                  _mm256_loadu_ps((const float*)bias + j - offset))); \
        } \
        if (rest_ != 0) { \
          j = offset + this->num_ - block; \
          _mm256_storeu_ps( \
              reinterpret_cast<float*>(out) + j, \
              _mm256_add_ps( \
                  tmp, _mm256_loadu_ps((const float*)bias + j - offset))); \
        } \
      } \
    } \
  }

#ifdef __AVX__
INTRIAVX_FLOAT(platform::avx, kEQ8);
INTRIAVX_FLOAT(platform::avx, kGT8LT16);
INTRIAVX_FLOAT(platform::avx, kEQ16);
INTRIAVX_FLOAT(platform::avx, kGT16);
INTRIAVX_FLOAT(platform::avx2, kEQ8);
INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
INTRIAVX_FLOAT(platform::avx2, kEQ16);
INTRIAVX_FLOAT(platform::avx2, kGT16);
INTRIAVX_FLOAT(platform::avx512f, kEQ8);
INTRIAVX_FLOAT(platform::avx512f, kGT8LT16);
INTRIAVX_FLOAT(platform::avx512f, kEQ16);
INTRIAVX_FLOAT(platform::avx512f, kGT16);
#endif

#undef INTRIAVX_FLOAT

REGISTER_JITKERNEL_DEPRECATED(layer_norm, LayerNormKernel);

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
@@ -1,179 +0,0 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {

#define JITKERNEL_DECLARE_STATIC_FUNC \
  static inline std::string name(int d) { \
    PADDLE_THROW("DType should be either float or double"); \
  } \
  static inline bool useJIT(int d) { return false; } \
  static inline bool useMKL(int d) { return false; }

#define JITKERNEL_DEFINE_NAME(ker_key, ker_class) \
  template <> \
  std::string ker_class##Impl<float>::name(int d) { \
    std::string key(#ker_key "f"); \
    if (useJIT(d)) { \
      /* only jit code needs to record d */ \
      return key + "jit" + std::to_string(d); \
    } else if (useMKL(d)) { \
      return key + "mkl"; \
    } else { \
      return key + "any"; \
    } \
  } \
  template <> \
  std::string ker_class##Impl<double>::name(int d) { \
    std::string key(#ker_key "d"); \
    /* jit code does not support double yet */ \
    if (useMKL(d)) { \
      return key + "mkl"; \
    } else { \
      return key + "any"; \
    } \
  }

#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
  template <> \
  std::shared_ptr<const ker_class<ker_dtype>> \
  KernelPool::Get<ker_class<ker_dtype>, int>(int d)

#define JITKERNEL_FIND_KEY(ker_class, ker_dtype) \
  std::string key = ker_class##Impl<ker_dtype>::name(d)

#define JITKERNEL_IMPL(ker_class, ker_dtype) \
  p = std::dynamic_pointer_cast<ker_class<ker_dtype>>( \
      std::make_shared<ker_class##Impl<ker_dtype>>(d))

#define REGISTER_JITKERNEL_WITH_DTYPE(ker_class, ker_dtype, marco_declare, \
                                      macro_find_key, macro_impl) \
  marco_declare(ker_class, ker_dtype) { \
    macro_find_key(ker_class, ker_dtype); \
    if (kers_.find(key) == kers_.end()) { \
      std::shared_ptr<ker_class<ker_dtype>> p; \
      macro_impl(ker_class, ker_dtype); \
      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
      return p; \
    } \
    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
        kers_.at(key)); \
  }

#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \
                                marco_declare, macro_find_key, macro_impl) \
  marco_define_name(ker_key, ker_class); \
  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare, \
                                macro_find_key, macro_impl); \
  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare, \
                                macro_find_key, macro_impl)

#define REGISTER_JITKERNEL(ker_key, ker_class) \
  REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
                          JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \
                          JITKERNEL_IMPL)

// TODO(TJ): the defines below are deprecated and will be removed soon
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \
  if (d < YMM_FLOAT_BLOCK) { \
    macro_(ker, dtype, isa, kLT8); \
  } else if (d == YMM_FLOAT_BLOCK) { \
    macro_(ker, dtype, isa, kEQ8); \
  } else if (d > YMM_FLOAT_BLOCK && d < ZMM_FLOAT_BLOCK) { \
    macro_(ker, dtype, isa, kGT8LT16); \
  } else if (d == ZMM_FLOAT_BLOCK) { \
    macro_(ker, dtype, isa, kEQ16); \
  } else { \
    macro_(ker, dtype, isa, kGT16); \
  }

#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
  if (platform::MayIUse(platform::avx512f)) { \
    SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \
  } else if (platform::MayIUse(platform::avx2)) { \
    SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \
  } else if (platform::MayIUse(platform::avx)) { \
    SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \
  } else { \
    SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \
  }

#define JITKERNEL_KEY(ker_key, dtype_key) \
  #ker_key #dtype_key + std::to_string(d)

#define JITKERNEL_NEW_IMPL_DEPRECATED(ker, dtype, isa, k) \
  p = std::dynamic_pointer_cast<ker<dtype>>( \
      std::make_shared<ker##Impl<dtype, isa, k>>(d))

#define JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, ker_dtype, \
                                        dtype_key, marco_declare, macro_key, \
                                        macro_impl) \
  marco_declare(ker_class, ker_dtype) { \
    std::string key = macro_key(ker_key, dtype_key); \
    if (kers_.find(key) == kers_.end()) { \
      std::shared_ptr<ker_class<ker_dtype>> p; \
      SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
      return p; \
    } \
    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
        kers_.at(key)); \
  }

#define REGISTER_JITKERNEL_DEPRECATED(ker_key, ker_class) \
  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, \
                                  JITKERNEL_DECLARE, JITKERNEL_KEY, \
                                  JITKERNEL_NEW_IMPL_DEPRECATED); \
  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d, \
                                  JITKERNEL_DECLARE, JITKERNEL_KEY, \
                                  JITKERNEL_NEW_IMPL_DEPRECATED)

#define REGISTER_JITKERNEL_ARGS_DEPRECATED(ker_key, ker_class, marco_declare, \
                                           macro_key, macro_impl) \
  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, marco_declare, \
                                  macro_key, macro_impl); \
  JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d, \
                                  marco_declare, macro_key, macro_impl)

#define FOR_EACH_ISA(macro_, block) \
  macro_(platform::avx512f, block); \
  macro_(platform::avx2, block); \
  macro_(platform::avx, block); \
  macro_(platform::isa_any, block)

#define FOR_EACH_BLOCK(macro_, isa) \
  macro_(isa, kLT8); \
  macro_(isa, kEQ8); \
  macro_(isa, kGT8LT16); \
  macro_(isa, kEQ16); \
  macro_(isa, kGT16)

#define FOR_EACH_ISA_BLOCK(macro_) \
  FOR_EACH_BLOCK(macro_, platform::avx512f); \
  FOR_EACH_BLOCK(macro_, platform::avx2); \
  FOR_EACH_BLOCK(macro_, platform::avx); \
  FOR_EACH_BLOCK(macro_, platform::isa_any)

}  // namespace jitkernel
}  // namespace math
}  // namespace operators
}  // namespace paddle
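Editor's note: a hedged, hand-abbreviated sketch of what REGISTER_JITKERNEL(vexp, VExpKernel) from the exp kernel file above roughly produces for the float instantiation, to show how the macros fit together. Namespace qualifiers are omitted and this is not a literal preprocessor expansion.

// JITKERNEL_DEFINE_NAME provides VExpKernelImpl<float>::name(d), which returns
// "vexpf" followed by "jit" + d, "mkl", or "any" depending on useJIT/useMKL.
// JITKERNEL_DECLARE + JITKERNEL_FIND_KEY + JITKERNEL_IMPL then specialize the
// pool getter, roughly as follows:
template <>
std::shared_ptr<const VExpKernel<float>>
KernelPool::Get<VExpKernel<float>, int>(int d) {
  std::string key = VExpKernelImpl<float>::name(d);        // JITKERNEL_FIND_KEY
  if (kers_.find(key) == kers_.end()) {
    std::shared_ptr<VExpKernel<float>> p;
    p = std::dynamic_pointer_cast<VExpKernel<float>>(       // JITKERNEL_IMPL
        std::make_shared<VExpKernelImpl<float>>(d));
    kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});
    return p;
  }
  return std::dynamic_pointer_cast<const VExpKernel<float>>(kers_.at(key));
}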
Some files were not shown because too many files have changed in this diff