Merge pull request #15304 from tensor-tang/fuse/second_order_mul_sub
Fuse second order mul sub and fuse repeated fc relu
commit a7fc3d42a0
@@ -0,0 +1,41 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

/**
 * Fuse repeated FC + ReLU subgraphs into a single
 * fusion_repeated_fc_relu op.
 */
class RepeatedFCReluFusePass : public FusePassBase {
 public:
  virtual ~RepeatedFCReluFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;

  const std::string name_scope_{"repeated_fc_relu_fuse"};
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
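The matching and rewriting logic lives in the pass's .cc file, which this hunk does not show. As a minimal sketch of the usual FusePassBase workflow (the pattern-building and node-rewiring steps below are placeholders, not this PR's actual implementation):

std::unique_ptr<ir::Graph> RepeatedFCReluFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  FusePassBase::Init(name_scope_, graph.get());
  GraphPatternDetector gpd;
  // Build a pattern that matches chains of fc -> relu -> fc -> relu ...
  // (helper omitted; real passes assemble it from PDNode predicates).
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    // Create one fusion_repeated_fc_relu node, wire it to the chain's X,
    // all W/Bias inputs and the final Out, then erase the matched nodes.
  };
  gpd(graph.get(), handler);
  return graph;
}

// Conventionally, the same file would register the pass, e.g.:
// REGISTER_PASS(repeated_fc_relu_fuse_pass,
//               paddle::framework::ir::RepeatedFCReluFusePass);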
@@ -0,0 +1,41 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

/**
 * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
 * (sign order matches the fused kernel's computation).
 */
class SquaredMatSubFusePass : public FusePassBase {
 public:
  virtual ~SquaredMatSubFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;

  const std::string name_scope_{"squared_mat_sub_fuse"};
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
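Why this particular expression is worth fusing: expanding it entrywise shows it isolates the second-order cross terms of the product (the "second order mul sub" of the PR title). For $A \in \mathbb{R}^{m \times k}$ and $B \in \mathbb{R}^{k \times n}$:

$$\left[(AB)^{\circ 2}\right]_{ij} = \Big(\sum_{p} a_{ip} b_{pj}\Big)^{2} = \sum_{p} a_{ip}^{2} b_{pj}^{2} + \sum_{p \neq q} a_{ip} a_{iq} b_{pj} b_{qj},$$

$$\left[A^{\circ 2} B^{\circ 2}\right]_{ij} = \sum_{p} a_{ip}^{2} b_{pj}^{2},$$

so the difference computed by the fused op is exactly $\sum_{p \neq q} a_{ip} a_{iq} b_{pj} b_{qj}$, scaled by `scalar`; it vanishes when $k = 1$. (Here $\cdot^{\circ 2}$ is the elementwise square, written `.^2` in the code comments.)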
@@ -0,0 +1,149 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"

namespace paddle {
namespace operators {

void FusionRepeatedFCReluOp::InferShape(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"),
                 "Input(X) of FusionRepeatedFCReluOp should not be null.");
  auto sz = ctx->Inputs("W").size();
  PADDLE_ENFORCE_GT(
      sz, 1UL,
      "Inputs(W) of FusionRepeatedFCReluOp should be larger than 1.");
  PADDLE_ENFORCE_EQ(ctx->Inputs("Bias").size(), sz,
                    "Size of inputs(Bias) of FusionRepeatedFCReluOp should be "
                    "equal to inputs(W) size.");
  PADDLE_ENFORCE_EQ(ctx->Outputs("ReluOut").size(), sz - 1,
                    "Size of output(ReluOut) of FusionRepeatedFCReluOp should "
                    "be equal to inputs(W) size - 1.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "Output(Out) of FusionRepeatedFCReluOp should not be null.");

  auto i_dims = ctx->GetInputDim("X");
  PADDLE_ENFORCE_EQ(i_dims.size(), 2UL, "Input shape size should be 2.");

  auto w_dims = ctx->GetInputsDim("W");
  auto b_dims = ctx->GetInputsDim("Bias");
  PADDLE_ENFORCE_EQ(w_dims.size(), b_dims.size(),
                    "Shape size of weight and bias should be equal.");
  PADDLE_ENFORCE_EQ(w_dims.size(), sz,
                    "Size of weights should be equal to inputs(W) size.");
  PADDLE_ENFORCE_EQ(i_dims[1], w_dims[0][0],
                    "Input width should be equal to weight height.");

  for (size_t i = 1; i < sz; ++i) {
    PADDLE_ENFORCE_EQ(w_dims[i].size(), 2UL,
                      "Every weight shape size should be 2.");
    PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1],
                      "The length of Bias must be equal to w_dims[i][1].");
  }
  ctx->SetOutputDim("Out", {i_dims[0], w_dims[sz - 1][1]});
  ctx->ShareLoD("X", /*->*/ "Out");
}

framework::OpKernelType FusionRepeatedFCReluOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  return framework::OpKernelType(framework::GetDataTypeOfVar(ctx.InputVar("X")),
                                 ctx.GetPlace());
}

void FusionRepeatedFCReluOpMaker::Make() {
  AddInput("X", "(LoDTensor) Input tensor of this operator.");
  AddInput("W", "(Tensor) The weight tensors of this operator.").AsDuplicable();
  AddInput("Bias", "(Tensor) The bias tensors of this operator.")
      .AsDuplicable();
  AddOutput("ReluOut", "(Tensor) The output tensor of each relu operator.")
      .AsDuplicable()
      .AsIntermediate();
  AddOutput("Out", "(LoDTensor) Output tensor of this operator.");
  AddComment(R"DOC(
  Fusion Repeated FC with Relu Operator.
)DOC");
}

// Compute one fused FC + ReLU layer: y = relu(x * w + b),
// where x is m x k, w is k x n, b has n elements, and y is m x n.
template <typename T>
static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n,
                    int k) {
  auto matmul =
      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
  auto addbias_relu =
      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n);
  matmul(x, w, y, m, n, k);
  T* dst = y;
  for (int i = 0; i < m; ++i) {
    // Add the bias to each output row and apply ReLU in place.
    addbias_relu(b, dst, dst, n);
    dst += n;
  }
}

template <typename T>
class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto in = ctx.Input<Tensor>("X");
    auto weights = ctx.MultiInput<Tensor>("W");
    auto biases = ctx.MultiInput<Tensor>("Bias");
    auto relus = ctx.MultiOutput<Tensor>("ReluOut");
    auto* out = ctx.Output<Tensor>("Out");
    auto place = ctx.GetPlace();
    int weight_sz = static_cast<int>(weights.size());

    // First layer: X -> ReluOut[0].
    auto i_dims = in->dims();
    auto w_dims = weights[0]->dims();
    int m = i_dims[0];
    int n = w_dims[1];
    int k = w_dims[0];
    relus[0]->Resize({m, n});
    fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
            relus[0]->mutable_data<T>(place), m, n, k);

    // Middle layers: ReluOut[i-1] -> ReluOut[i].
    for (int i = 1; i < weight_sz - 1; ++i) {
      auto i_dims = relus[i - 1]->dims();
      auto w_dims = weights[i]->dims();
      int m = i_dims[0];
      int n = w_dims[1];
      int k = w_dims[0];
      relus[i]->Resize({m, n});
      fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), m, n, k);
    }

    // Last layer writes directly to Out, which is why only
    // weight_sz - 1 intermediate ReluOut tensors are needed.
    auto i_dims_last = relus[weight_sz - 2]->dims();
    auto w_dims_last = weights[weight_sz - 1]->dims();
    m = i_dims_last[0];
    n = w_dims_last[1];
    k = w_dims_last[0];
    fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place), m, n,
            k);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_repeated_fc_relu, ops::FusionRepeatedFCReluOp,
                  ops::FusionRepeatedFCReluOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OP_CPU_KERNEL(fusion_repeated_fc_relu,
                       ops::FusionRepeatedFCReluKernel<float>,
                       ops::FusionRepeatedFCReluKernel<double>);
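To make the fused data flow concrete, here is a small self-contained reference of the same computation in plain C++. It has no Paddle or jit dependencies; fc_relu_ref and everything in main are illustrative stand-ins for the jit::kMatMul and jit::kVAddRelu kernels, not part of the operator:

#include <algorithm>
#include <cassert>
#include <vector>

// Reference semantics of one fused layer: y = relu(x * w + b).
// x: m x k, w: k x n, b: n, y: m x n, all row-major.
void fc_relu_ref(const std::vector<float>& x, const std::vector<float>& w,
                 const std::vector<float>& b, std::vector<float>& y, int m,
                 int n, int k) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * w[p * n + j];
      y[i * n + j] = std::max(acc + b[j], 0.f);
    }
  }
}

int main() {
  // Two stacked FC+ReLU layers: X (1x2) -> hidden (1x3) -> out (1x2).
  std::vector<float> x = {1.f, -2.f};
  std::vector<float> w0 = {1.f, 0.f, 1.f, 0.f, 1.f, 1.f};   // 2x3
  std::vector<float> b0 = {0.f, 0.5f, 0.f};
  std::vector<float> w1 = {1.f, 0.f, 0.f, 1.f, 1.f, -1.f};  // 3x2
  std::vector<float> b1 = {0.f, 0.f};
  std::vector<float> h(3), y(2);
  fc_relu_ref(x, w0, b0, h, 1, 3, 2);  // plays the role of ReluOut[0]
  fc_relu_ref(h, w1, b1, y, 1, 2, 3);  // plays the role of Out
  assert(y[0] >= 0.f && y[1] >= 0.f);  // ReLU guarantees non-negativity
  return 0;
}

Stacking fc_relu_ref calls mirrors how the kernel threads ReluOut[i-1] into layer i, with the final layer writing straight to Out.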
@@ -0,0 +1,41 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;

class FusionRepeatedFCReluOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusionRepeatedFCReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,137 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"

namespace paddle {
namespace operators {

void FusionSquaredMatSubOp::InferShape(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"),
                 "Input(X) of FusionSquaredMatSubOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Y"),
                 "Input(Y) of FusionSquaredMatSubOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("SquaredX"),
      "Output(SquaredX) of FusionSquaredMatSubOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("SquaredY"),
      "Output(SquaredY) of FusionSquaredMatSubOp should not be null.");
  PADDLE_ENFORCE(
      ctx->HasOutput("SquaredXY"),
      "Output(SquaredXY) of FusionSquaredMatSubOp should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "Output(Out) of FusionSquaredMatSubOp should not be null.");

  auto x_dims = ctx->GetInputDim("X");
  auto y_dims = ctx->GetInputDim("Y");
  PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
                    "Input tensors dims size should be equal.");
  PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input tensors should be matrices.");
  PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0],
                    "Input matrices should be multipliable: X's width must "
                    "equal Y's height.");

  ctx->SetOutputDim("SquaredX", x_dims);
  ctx->SetOutputDim("SquaredY", y_dims);
  ctx->SetOutputDim("SquaredXY", {x_dims[0], y_dims[1]});
  ctx->SetOutputDim("Out", {x_dims[0], y_dims[1]});
}

framework::OpKernelType FusionSquaredMatSubOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  return framework::OpKernelType(framework::GetDataTypeOfVar(ctx.InputVar("X")),
                                 ctx.GetPlace());
}

void FusionSquaredMatSubOpMaker::Make() {
  AddInput("X", "(Tensor) Input Mat A of this operator.");
  AddInput("Y", "(Tensor) Input Mat B of this operator.");
  AddOutput("SquaredX", "(Tensor) Squared X.").AsIntermediate();
  AddOutput("SquaredY", "(Tensor) Squared Y.").AsIntermediate();
  AddOutput("SquaredXY", "(Tensor) Squared X*Y.").AsIntermediate();
  AddOutput("Out", "(Tensor) Output tensor of this operator.");
  AddAttr<float>("scalar", "The scalar applied to the output matrix.")
      .SetDefault(1.f);
  AddComment(R"DOC(
  Fusion Squared Matrix Subtraction Operator.

    ( (X * Y).^2 - (X.^2 * Y.^2) ) .* scalar
)DOC");
}

template <typename T>
class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto x = ctx.Input<Tensor>("X");
    auto y = ctx.Input<Tensor>("Y");
    auto* squared_x = ctx.Output<Tensor>("SquaredX");
    auto* squared_y = ctx.Output<Tensor>("SquaredY");
    auto* squared_xy = ctx.Output<Tensor>("SquaredXY");
    auto* out = ctx.Output<Tensor>("Out");
    auto place = ctx.GetPlace();
    T scalar = static_cast<T>(ctx.Attr<float>("scalar"));

    auto x_dims = x->dims();
    auto y_dims = y->dims();
    int m = x_dims[0];
    int k = x_dims[1];
    int n = y_dims[1];
    int o_numel = m * n;

    auto vsquare_x =
        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(m * k);
    auto vsquare_y =
        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(k * n);
    auto vsquare_xy =
        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
    auto vsub =
        jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel);
    auto vscal =
        jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
    auto matmul =
        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);

    const T* x_data = x->data<T>();
    const T* y_data = y->data<T>();
    T* squared_x_data = squared_x->mutable_data<T>(place);
    T* squared_y_data = squared_y->mutable_data<T>(place);
    T* squared_xy_data = squared_xy->mutable_data<T>(place);
    T* o_data = out->mutable_data<T>(place);

    // SquaredXY = (X * Y).^2
    matmul(x_data, y_data, squared_xy_data, m, n, k);
    vsquare_xy(squared_xy_data, squared_xy_data, o_numel);

    // Out = X.^2 * Y.^2 (Out doubles as scratch for the second product).
    vsquare_x(x_data, squared_x_data, m * k);
    vsquare_y(y_data, squared_y_data, k * n);
    matmul(squared_x_data, squared_y_data, o_data, m, n, k);

    // Out = ( SquaredXY - Out ) .* scalar
    vsub(squared_xy_data, o_data, o_data, o_numel);
    vscal(&scalar, o_data, o_data, o_numel);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_squared_mat_sub, ops::FusionSquaredMatSubOp,
                  ops::FusionSquaredMatSubOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OP_CPU_KERNEL(fusion_squared_mat_sub,
                       ops::FusionSquaredMatSubKernel<float>,
                       ops::FusionSquaredMatSubKernel<double>);
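As a sanity check on the kernel's step ordering, a minimal self-contained sketch in plain C++ (illustrative names, no Paddle dependencies) computes the same quantity naively; the numbers also confirm the cross-term identity derived earlier:

#include <cassert>
#include <cmath>
#include <vector>

// Naive reference for Out = ((X*Y).^2 - (X.^2 * Y.^2)) .* scalar,
// X: m x k, Y: k x n, row-major.
std::vector<float> squared_mat_sub_ref(const std::vector<float>& x,
                                       const std::vector<float>& y, int m,
                                       int n, int k, float scalar) {
  std::vector<float> out(m * n);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float xy = 0.f, x2y2 = 0.f;
      for (int p = 0; p < k; ++p) {
        float xp = x[i * k + p], yp = y[p * n + j];
        xy += xp * yp;        // entry of X * Y
        x2y2 += xp * xp * yp * yp;  // entry of X.^2 * Y.^2
      }
      out[i * n + j] = (xy * xy - x2y2) * scalar;
    }
  }
  return out;
}

int main() {
  // X = [1 2], Y = [3; 4] (m = 1, k = 2, n = 1).
  // (X*Y).^2 = (1*3 + 2*4)^2 = 121; X.^2 * Y.^2 = 1*9 + 4*16 = 73.
  // Cross terms: 2 * (1 * 2 * 3 * 4) = 48 = 121 - 73.
  std::vector<float> x = {1.f, 2.f}, y = {3.f, 4.f};
  auto out = squared_mat_sub_ref(x, y, 1, 1, 2, 1.f);
  assert(std::fabs(out[0] - 48.f) < 1e-6f);
  return 0;
}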
@@ -0,0 +1,42 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;

// Computes ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
// (sign order matches the kernel in the .cc file).
class FusionSquaredMatSubOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusionSquaredMatSubOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle