cherry-pick from feature/anakin-engine: Add subgraph fuse support and anakin engine #16018

6 years ago · b21770a2aa
parent 084310f536
commit b21770a2aa
36 changed files with 1129 additions and 80 deletions
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
@ -1,8 +1,12 @@
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc activation.cc pool2d.cc concat.cc split.cc DEPS anakin_engine framework_proto scope op_registry)
-cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS}  anakin_op_converter mul_op)
-cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
-cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} activation_op anakin_op_converter
- SERIAL)
-cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter pool_op pooling)
-cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter concat_op concat_and_split)
-cc_test(test_anakin_split SRCS test_split_op.cc DEPS  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} anakin_op_converter split_op concat_and_split)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
+elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
+cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
+cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
+cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
+cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
+cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
+cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS
+anakin_op_converter elementwise_add_op)
+
+cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
--- a/paddle/fluid/inference/anakin/convert/activation.cc
+++ b/paddle/fluid/inference/anakin/convert/activation.cc
@ -45,15 +45,11 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
  auto output_name = op_desc.Output("Out").front();
  engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
  engine_->AddOpAttr(op_name, "type", anakin_op_type_);
-  if (op_type_ == "relu") {
-    engine_->AddOpAttr(op_name, "alpha", 0);
-  }
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
 REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
 REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
--- a/paddle/fluid/inference/anakin/convert/activation.h
+++ b/paddle/fluid/inference/anakin/convert/activation.h
@ -34,13 +34,8 @@ class ActivationOpConverter : public AnakinOpConverter {
 private:
  std::string op_type_;
  std::string anakin_op_type_;
-  std::map<std::string, std::string> anakin_ops_type_{
-      {"relu", "Relu"}, {"tanh", "TanH"}, {"sigmoid", "Sigmoid"}};
-};
-
-class ReluOpConverter : public ActivationOpConverter {
- public:
-  ReluOpConverter() : ActivationOpConverter("relu") {}
+  std::map<std::string, std::string> anakin_ops_type_{{"tanh", "TanH"},
+                                                      {"sigmoid", "Sigmoid"}};
 };

 class TanhOpConverter : public ActivationOpConverter {
@ -50,7 +45,7 @@ class TanhOpConverter : public ActivationOpConverter {

 class SigmoidOpConverter : public ActivationOpConverter {
 public:
-  SigmoidOpConverter() : ActivationOpConverter("tanh") {}
+  SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
 };
 }  // namespace anakin
 }  // namespace inference
--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ b/paddle/fluid/inference/anakin/convert/concat.cc
@ -32,8 +32,8 @@ void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
                                   const framework::Scope &scope,
                                   bool test_mode) {
  framework::OpDesc op_desc(op, nullptr);
-  auto input_names = op_desc.Input("X");
  int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  auto input_names = op_desc.Input("X");
  PADDLE_ENFORCE(axis > 0,
                 "The axis attr of Concat op should be large than 0 for trt");

--- a/paddle/fluid/inference/anakin/convert/conv2d.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d.cc
@ -51,10 +51,11 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);

  // const int n_output = weight_tensor->dims()[0];
-  const int n_input = weight_tensor->dims()[1];
+  // const int n_input = weight_tensor->dims()[1];
  const int filter_h = weight_tensor->dims()[2];
  const int filter_w = weight_tensor->dims()[3];
-  auto filter_num = n_input * filter_h * filter_w;
+  // auto filter_num = n_input * filter_h * filter_w ;
+  auto filter_num = weight_tensor->dims()[0];
  engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc
@ -0,0 +1,113 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
+
+  auto input_name = op_desc.Input("Input").front();
+  auto output_name = op_desc.Output("Output").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
+  engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
+
+  auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
+  PADDLE_ENFORCE_NOT_NULL(filter_v);
+  auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
+
+  auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
+  PADDLE_ENFORCE_NOT_NULL(b_v);
+  auto *b_t = b_v->GetMutable<framework::LoDTensor>();
+
+  std::unique_ptr<framework::LoDTensor> weight_tensor(
+      new framework::LoDTensor());
+  weight_tensor->Resize(filter_t->dims());
+  TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
+
+  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
+
+  // const int n_output = weight_tensor->dims()[0];
+  // const int n_input = weight_tensor->dims()[1];
+  const int filter_h = weight_tensor->dims()[2];
+  const int filter_w = weight_tensor->dims()[3];
+  // auto filter_num = n_input * filter_h * filter_w ;
+  auto filter_num = weight_tensor->dims()[0];
+  engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
+  auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
+  auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
+  auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+  engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
+  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+  engine_->AddOpAttr(op_name, "group", groups);
+  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "bias_term", true);
+
+  auto weight_shape = framework::vectorize2int(filter_t->dims());
+  Shape anakin_shape(weight_shape);
+  auto *weight1 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
+  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
+  weight1->d_tensor().set_shape(anakin_shape);
+  weight1->d_tensor().copy_from(weight1->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+
+  auto bias_shape = framework::vectorize2int(b_t->dims());
+  framework::LoDTensor bias_tensor;
+  bias_tensor.Resize(b_t->dims());
+  TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
+  auto *bias_data = bias_tensor.data<float>();
+  bias_shape.insert(bias_shape.begin(), 1);
+  bias_shape.insert(bias_shape.begin(), 1);
+  bias_shape.insert(bias_shape.begin(), 1);
+  // bias_shape.push_back(1);
+  // bias_shape.push_back(1);
+  Shape anakin_bias_shape(bias_shape);
+
+  auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+      anakin_bias_shape);
+  float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
+  std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
+  weight2->d_tensor().set_shape(anakin_bias_shape);
+  weight2->d_tensor().copy_from(weight2->h_tensor());
+  engine_->AddOpAttr(op_name, "weight_2", *weight2);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
--- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
+++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h
@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class Conv2dFusionOpConverter : public AnakinOpConverter {
+ public:
+  Conv2dFusionOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~Conv2dFusionOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/elementwise.cc
+++ b/paddle/fluid/inference/anakin/convert/elementwise.cc
@ -0,0 +1,57 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/elementwise.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::Precision;
+using anakin::saber::NV;
+using anakin::saber::X86;
+using anakin::saber::Shape;
+using anakin::PBlock;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
+                                           const framework::Scope &scope,
+                                           bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto x_name = op_desc.Input("X").front();
+  auto y_name = op_desc.Input("Y").front();
+  auto out_name = op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
+  std::string elementwise_type = "Add";
+  engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
+  std::vector<float> coeff = {1.0, 1.0};
+  engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
--- a/paddle/fluid/inference/anakin/convert/elementwise.h
+++ b/paddle/fluid/inference/anakin/convert/elementwise.h
@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class ElementwiseAddOpConverter : public AnakinOpConverter {
+ public:
+  ElementwiseAddOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ElementwiseAddOpConverter() {}
+
+ private:
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/fc.cc
+++ b/paddle/fluid/inference/anakin/convert/fc.cc
@ -14,6 +14,8 @@

 #include "paddle/fluid/inference/anakin/convert/fc.h"
 #include <algorithm>
+#include <string>
+#include <vector>

 using anakin::graph::GraphGlobalMem;
 using anakin::AK_FLOAT;
@ -24,28 +26,39 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void FcOpConverter::operator()(const framework::proto::OpDesc &op,
-                               const framework::Scope &scope, bool test_mode) {
+void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::Scope &scope,
+                                   bool test_mode) {
  framework::OpDesc op_desc(op, nullptr);
-  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
-  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+  auto input_names = op_desc.InputNames();
+  bool with_bias = input_names.size() == 3;
+
+  std::string w_name = "Y";
+  std::string i_name = "X";
+  if (with_bias) {
+    w_name = "W";
+    i_name = "Input";
+  }

-  auto x_name = op_desc.Input("X").front();
  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
-  auto *y_v = scope.FindVar(op_desc.Input("Y").front());
+
+  // get weights
+  auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
  PADDLE_ENFORCE_NOT_NULL(y_v);
  auto *y_t = y_v->GetMutable<framework::LoDTensor>();

-  auto input_name = op_desc.Input("X").front();
+  auto input_name = op_desc.Input(i_name).front();
  auto output_name = op_desc.Output("Out").front();

-  auto weight_shape = framework::vectorize2int(y_t->dims());
  engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
-  engine_->AddOpAttr(op_name, "bias_term", false);
+  engine_->AddOpAttr(op_name, "bias_term", with_bias);
  engine_->AddOpAttr(op_name, "axis", 1);
+
+  auto weight_shape = framework::vectorize2int(y_t->dims());
  int out_dim = weight_shape[1];
  engine_->AddOpAttr(op_name, "out_dim", out_dim);
+  const int w_m = weight_shape[0];
+  const int w_k = weight_shape[1];

  weight_shape.push_back(1);
  weight_shape.push_back(1);
@ -54,18 +67,54 @@ void FcOpConverter::operator()(const framework::proto::OpDesc &op,
  framework::LoDTensor weight_tensor;
  weight_tensor.Resize(y_t->dims());
  TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
+  auto *weight_data = weight_tensor.data<float>();
+  PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel());

+  std::vector<float> trans_weight_data(weight_tensor.numel());
+  for (int i = 0; i < w_m; i++) {
+    for (int j = 0; j < w_k; j++) {
+      trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
+    }
+  }
  auto *weight1 =
      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
  float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
-  std::copy_n(weight_tensor.data<float>(), weight_tensor.numel(), cpu_data);
+  std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
  weight1->d_tensor().set_shape(anakin_shape);
  weight1->d_tensor().copy_from(weight1->h_tensor());
  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+
+  // get bias
+  if (with_bias) {
+    auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
+    PADDLE_ENFORCE_NOT_NULL(b_v);
+    auto *b_t = b_v->GetMutable<framework::LoDTensor>();
+
+    auto bias_shape = framework::vectorize2int(b_t->dims());
+    framework::LoDTensor bias_tensor;
+    bias_tensor.Resize(b_t->dims());
+    TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
+    auto *bias_data = bias_tensor.data<float>();
+    bias_shape.insert(bias_shape.begin(), 1);
+    bias_shape.insert(bias_shape.begin(), 1);
+    bias_shape.insert(bias_shape.begin(), 1);
+    // bias_shape.push_back(1);
+    // bias_shape.push_back(1);
+    Shape anakin_bias_shape(bias_shape);
+
+    auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
+        anakin_bias_shape);
+    float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
+    std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
+    weight2->d_tensor().set_shape(anakin_bias_shape);
+    weight2->d_tensor().copy_from(weight2->h_tensor());
+    engine_->AddOpAttr(op_name, "weight_2", *weight2);
+  }
 }

 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

+REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
 REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
--- a/paddle/fluid/inference/anakin/convert/fc.h
+++ b/paddle/fluid/inference/anakin/convert/fc.h
@ -20,14 +20,26 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-class FcOpConverter : public AnakinOpConverter {
+class FcBaseOpConverter : public AnakinOpConverter {
 public:
-  FcOpConverter() = default;
+  FcBaseOpConverter() = default;

  virtual void operator()(const framework::proto::OpDesc &op,
                          const framework::Scope &scope,
                          bool test_mode) override;
-  virtual ~FcOpConverter() {}
+  virtual ~FcBaseOpConverter() {}
+};
+
+// with bias
+class FcOpConverter : public FcBaseOpConverter {
+ public:
+  FcOpConverter() = default;
+};
+
+// without bias
+class MulOpConverter : public FcBaseOpConverter {
+ public:
+  MulOpConverter() = default;
 };

 }  // namespace anakin
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@ -47,14 +47,6 @@ class AnakinOpConverter {
    std::string op_type = op_desc.Type();
    AnakinOpConverter *it = nullptr;

-    if (op_type == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-      std::string Y = op_desc.Input("Y")[0];
-      if (parameters.count(Y)) {
-        it = Registry<AnakinOpConverter>::Global().Lookup("fc");
-      }
-    }
-
    if (!it) {
      it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
    }
--- a/paddle/fluid/inference/anakin/convert/relu.cc
+++ b/paddle/fluid/inference/anakin/convert/relu.cc
@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/relu.h"
+#include <algorithm>
+#include <map>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
+                                 const framework::Scope &scope,
+                                 bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+
+  engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "alpha", 0);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
--- a/paddle/fluid/inference/anakin/convert/relu.h
+++ b/paddle/fluid/inference/anakin/convert/relu.h
@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class ReluOpConverter : public AnakinOpConverter {
+ public:
+  ReluOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ReluOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
@ -41,16 +41,13 @@ static void test_activation_op(const std::string &op_type) {
  validator.Execute(5);
 }

-TEST(relu_op, test) { test_activation_op("relu"); }
 TEST(sigm_op, test) { test_activation_op("sigmoid"); }
 TEST(tanh_op, test) { test_activation_op("tanh"); }
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

-USE_OP(relu);
 USE_OP(sigmoid);
 USE_OP(tanh);
-USE_ANAKIN_CONVERTER(relu);
 USE_ANAKIN_CONVERTER(sigmoid);
 USE_ANAKIN_CONVERTER(tanh);
--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
@ -25,10 +25,10 @@ TEST(concat_op, test) {
  std::unordered_set<std::string> parameters({""});
  framework::Scope scope;
  AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("concat_x1", {1, 10, 3, 1});
-  validator.DeclInputVar("concat_x2", {1, 3, 3, 1});
-  validator.DeclInputVar("concat_x3", {1, 7, 3, 1});
-  validator.DeclOutputVar("concat_out", {1, 20, 3, 1});
+  validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
+  validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
+  validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
+  validator.DeclOutputVar("concat_out", {1, 6, 1, 1});

  // Prepare Op description
  framework::OpDesc desc;
--- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
@ -28,9 +28,9 @@ TEST(conv2d_op, test) {
  std::unordered_set<std::string> parameters({"conv2d-Y"});
  framework::Scope scope;
  AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("conv2d-X", {1, 2, 5, 5});
-  validator.DeclParamVar("conv2d-Y", {3, 2, 3, 3});
-  validator.DeclOutputVar("conv2d-Out", {1, 3, 5, 5});
+  validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
+  validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
+  validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});

  // Prepare Op description
  framework::OpDesc desc;
@ -40,7 +40,7 @@ TEST(conv2d_op, test) {
  desc.SetOutput("Output", {"conv2d-Out"});

  const std::vector<int> strides({1, 1});
-  const std::vector<int> paddings({1, 1});
+  const std::vector<int> paddings({0, 0});
  const std::vector<int> dilations({1, 1});
  const int groups = 1;

--- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+TEST(elementwise_op, native) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("elementwise_add_x", {1, 1, 2, 2});
+  validator.DeclInputVar("elementwise_y", {1, 1, 2, 2});
+  validator.DeclOutputVar("elementwise_out", {1, 1, 2, 2});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add_x"});
+  desc.SetInput("Y", {"elementwise_y"});
+  desc.SetOutput("Out", {"elementwise_out"});
+
+  int axis = -1;
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+  validator.Execute(1);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(elementwise_add);
+USE_ANAKIN_CONVERTER(elementwise_add);
--- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
@ -27,9 +27,9 @@ TEST(fc_op, test) {
  std::unordered_set<std::string> parameters({"mul_y"});
  framework::Scope scope;
  AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("mul_x", {1, 1, 1, 1});
-  validator.DeclParamVar("mul_y", {1, 2});
-  validator.DeclOutputVar("mul_out", {1, 1, 1, 2});
+  validator.DeclInputVar("mul_x", {1, 1, 2, 2});
+  validator.DeclParamVar("mul_y", {4, 2});
+  validator.DeclOutputVar("mul_out", {1, 2});

  // Prepare Op description
  framework::OpDesc desc;
@ -37,8 +37,8 @@ TEST(fc_op, test) {
  desc.SetInput("X", {"mul_x"});
  desc.SetInput("Y", {"mul_y"});
  desc.SetOutput("Out", {"mul_out"});
-  int num_flatten_dims = 3;
-  desc.SetAttr("x_num_col_dims", num_flatten_dims);
+  // int num_flatten_dims = 3;
+  // desc.SetAttr("x_num_col_dims", num_flatten_dims);
  validator.SetOp(*desc.Proto());

  validator.Execute(10);
--- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc
@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#include "paddle/fluid/inference/anakin/convert/relu.h"
+#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+static void test_activation_op(const std::string &op_type) {
+  auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
+  PADDLE_ENFORCE(converter != nullptr);
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("act-X", {10, 6, 1, 1});
+  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
+  framework::OpDesc desc;
+  desc.SetType(op_type);
+  desc.SetInput("X", {"act-X"});
+  desc.SetOutput("Out", {"act-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(5);
+}
+
+TEST(sigm_op, test) { test_activation_op("relu"); }
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(relu);
+USE_ANAKIN_CONVERTER(relu);
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@ -161,10 +161,6 @@ class AnakinConvertValidation {
      framework::TensorToVector(*tensor, ctx, &fluid_out);
      fluid_outputs.push_back(fluid_out);

-      // size_t fluid_out_size = fluid_out.size();
-      /*for (size_t i = 0; i < fluid_out_size; i++) {
-        std::cout << fluid_out[i] << std::endl;
-      }*/
      outputs.insert({output, tensor});
    }

@ -180,8 +176,7 @@ class AnakinConvertValidation {
      size_t anakin_out_size = anakin_out.size();
      auto fluid_out = fluid_outputs[i_output++];
      for (size_t i = 0; i < anakin_out_size; i++) {
-        LOG(INFO) << "Output[" << i << "]: anakin[" << anakin_out[i] << "], "
-                  << "fluid[" << fluid_out[i] << "]";
+        EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3);
      }
    }
  }
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
@ -68,29 +68,34 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
    auto *tensor = input.second;
    auto *data = tensor->data<float>();
    auto shape = framework::vectorize2int(tensor->dims());
-    ::anakin::saber::Shape anakin_shape(shape);
    auto *anakin_input = net_->get_in(input.first);
+    auto anakin_input_shape = anakin_input->valid_shape();
+    PADDLE_ENFORCE(tensor->numel(), anakin_input_shape.count(),
+                   "the fluid input size should be equal to anakin");
    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_shape);
-    anakin_input->share_from(tmp_anakin_tensor);
+                                                       anakin_input_shape);
+    anakin_input->copy_from(tmp_anakin_tensor);
  }

  for (const auto &output : outputs) {
    auto *tensor = output.second;
    auto *data = tensor->data<float>();
    auto shape = framework::vectorize2int(tensor->dims());
-    ::anakin::saber::Shape anakin_shape(shape);
    auto *anakin_output = net_->get_out(output.first);
+    auto anakin_output_shape = anakin_output->valid_shape();
+    PADDLE_ENFORCE(tensor->numel(), anakin_output_shape.count(),
+                   "the fluid output size should be equal to anakin");
    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_shape);
+                                                       anakin_output_shape);
    anakin_output->share_from(tmp_anakin_tensor);
  }
  net_->prediction();
+  cudaDeviceSynchronize();
 }

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
-  PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph.");
+  PADDLE_ENFORCE(graph_->Freeze_v3(), "Freeze anakin subgraph.");
 }

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
--- a/paddle/fluid/inference/anakin/engine.h
+++ b/paddle/fluid/inference/anakin/engine.h
@ -46,6 +46,9 @@ namespace anakin {
 template <typename TargetT, ::anakin::Precision PrecisionType,
          ::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC>
 class AnakinEngine {
+  using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
+  using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
+
 public:
  explicit AnakinEngine(bool need_summary = false);
  ~AnakinEngine();
@ -61,16 +64,15 @@ class AnakinEngine {
    PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value),
                   "Add operation's attribution.");
  }
-
+  NetT *Net() { return net_.get(); }
  std::unique_ptr<AnakinEngine> Clone();
  void Freeze();
  void Optimize();
+  void Save(std::string path) { graph_->save(path); }
  void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
               const std::map<std::string, framework::LoDTensor *> &outputs);

 private:
-  using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
-  using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
  std::unique_ptr<GraphT> graph_;
  std::unique_ptr<NetT> net_;
 };
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@ -20,7 +20,18 @@ namespace anakin {

 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
-  SimpleOpTypeSetTeller() {}
+  SimpleOpTypeSetTeller() {
+    // teller_set.insert("mul");
+    teller_set.insert("fc");
+    teller_set.insert("conv2d_fusion");
+    teller_set.insert("split");
+    teller_set.insert("relu");
+    teller_set.insert("pool2d");
+    teller_set.insert("elementwise_add");
+    teller_set.insert("concat");
+    teller_set.insert("tanh");
+    // teller_set.insert("conv2d");
+  }

  bool operator()(const std::string& op_type,
                  const framework::OpDesc& desc) override {
@ -28,7 +39,7 @@ struct SimpleOpTypeSetTeller : public Teller {
  }

 private:
-  std::unordered_set<std::string> teller_set{{"mul"}};
+  std::unordered_set<std::string> teller_set;
 };

 bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@ -13,8 +13,11 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@ -63,7 +66,14 @@ void IRPassManager::CreatePasses(Argument *argument,
    } else if (pass_name == "cpu_quantize_pass") {
      pass->Set("quant_var_scales",
                new VarQuantScale(argument->quant_var_scales()));
-    } else if (pass_name == "tensorrt_subgraph_pass") {
+    }
+
+    if (pass_name == "anakin_subgraph_pass") {
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+    }
+
+    if (pass_name == "tensorrt_subgraph_pass") {
      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",
--- a/Show More
+++ b/Show More