From d83187dba87bf106abb71ed559f645cc79a7933a Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Tue, 4 Sep 2018 11:07:43 +0800
Subject: [PATCH 01/13] enable lac analysis test

---
 .../fluid/inference/analysis/CMakeLists.txt   | 14 +++-
 .../inference/analysis/analyzer_lac_tester.cc | 70 ++++++++++++++++---
 paddle/fluid/inference/api/CMakeLists.txt     |  2 +-
 3 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index cc0dd0d492..eb4908da24 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -105,6 +105,18 @@ if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING)
 endif()

 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
+  analysis_predictor
+  # ir
+  fc_fuse_pass
+  fc_lstm_fuse_pass
+  seq_concat_fc_fuse_pass
+  graph_viz_pass
+  infer_clean_graph_pass
+  graph_pattern_detector
+  infer_clean_graph_pass
+  attention_lstm_fuse_pass
+  paddle_inference_api
+  pass
   ARGS --infer_model=${LAC_INSTALL_DIR}/model
        --infer_data=${LAC_INSTALL_DIR}/data.txt)
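The tester diff that follows feeds the LAC model LoD (level-of-detail) sequence data, where a LoD vector holds cumulative sequence offsets; that is what lets GetOneBatch check the batch size against lod.size() - 1. A self-contained sketch of that invariant (illustrative only, not part of the patch):

#include <cassert>
#include <vector>

// A level-0 LoD stores cumulative offsets: {0, 4, 10} means two sequences,
// words [0, 4) and [4, 10), packed back to back in one tensor.
int BatchSizeFromLoD(const std::vector<size_t> &lod) {
  return static_cast<int>(lod.size()) - 1;
}

int main() {
  std::vector<size_t> lod = {0, 4, 10};
  assert(BatchSizeFromLoD(lod) == 2);  // the same check GetOneBatch enforces
  return 0;
}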
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index e2f7253ac0..2aef25603f 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/inference/analysis/analyzer.h"
-#include 
 #include 
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
@@ -102,6 +102,7 @@ struct DataRecord {
     return data;
   }
 };
+
 void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                  int batch_size) {
   auto one_batch = data->NextBatch();
@@ -114,12 +115,14 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
   input_slots->assign({input_tensor});
 }
+
 static void PrintTime(const double latency, const int bs, const int repeat) {
   LOG(INFO) << "===========profile result===========";
   LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
             << ", avg latency: " << latency / repeat << "ms";
   LOG(INFO) << "=====================================";
 }
+
 void BenchAllData(const std::string &model_path, const std::string &data_file,
                   const int batch_size, const int repeat) {
   NativeConfig config;
@@ -147,36 +150,64 @@ void BenchAllData(const std::string &model_path, const std::string &data_file,
   }
   PrintTime(sum, batch_size, repeat);
 }
+
 const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
                                 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
                                 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
                                 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+
 void TestLACPrediction(const std::string &model_path,
                        const std::string &data_file, const int batch_size,
-                       const int repeat, bool test_all_data) {
-  if (test_all_data) {
-    BenchAllData(model_path, data_file, batch_size, repeat);
-    return;
-  }
+                       const int repeat, bool test_all_data,
+                       bool use_analysis = false) {
   NativeConfig config;
   config.model_dir = model_path;
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
+  std::vector<PaddleTensor> input_slots, outputs_slots, ref_outputs_slots;
   DataRecord data(data_file, batch_size);
   GetOneBatch(&input_slots, &data, batch_size);
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  std::unique_ptr<PaddlePredictor> predictor;
+  if (use_analysis) {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
+            config);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }
   for (int i = 0; i < FLAGS_burning; i++) {
     predictor->Run(input_slots, &outputs_slots);
   }
   Timer timer;
+  if (test_all_data) {
+    double sum = 0;
+    for (int i = 0; i < repeat; i++) {
+      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+        GetOneBatch(&input_slots, &data, batch_size);
+        timer.tic();
+        predictor->Run(input_slots, &outputs_slots);
+        sum += timer.toc();
+      }
+    }
+    PrintTime(sum, batch_size, repeat);
+    return;
+  }
   timer.tic();
   for (int i = 0; i < repeat; i++) {
     predictor->Run(input_slots, &outputs_slots);
  }
   PrintTime(timer.toc(), batch_size, repeat);
+
+  // check result
+  if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+  }
+
   EXPECT_EQ(outputs_slots.size(), 1UL);
   auto &out = outputs_slots[0];
   size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
@@ -188,12 +219,33 @@ void TestLACPrediction(const std::string &model_path,
   for (size_t i = 0; i < batch1_size; ++i) {
     EXPECT_EQ(pdata[i], lac_ref_data[i]);
   }
+
+  if (use_analysis) {
+    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], pdata[i]);
+    }
+  }
 }
+
 TEST(Analyzer_LAC, native) {
   LOG(INFO) << "LAC with native";
   TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
                     FLAGS_repeat, FLAGS_test_all_data);
 }
+
+TEST(Analyzer_LAC, analysis) {
+  LOG(INFO) << "LAC with analysis";
+  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+                    FLAGS_repeat, FLAGS_test_all_data, true);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index adfe439244..a94c79a698 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -47,7 +47,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)

 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
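Patch 01 works entirely through the shared PaddlePredictor interface, so the native and analysis engines are swappable at the call site. A hedged sketch of the caller-side pattern the tester relies on; the model path and input name are placeholders, not values taken from the patch:

#include <cstdint>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::NativeConfig config;
  config.model_dir = "/path/to/lac/model";  // placeholder
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;

  auto predictor = paddle::CreatePaddlePredictor<
      paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);

  std::vector<int64_t> ids = {24, 25, 38};  // word ids for one sequence
  paddle::PaddleTensor input;
  input.name = "word";  // assumed input name
  input.shape = {static_cast<int>(ids.size()), 1};
  input.lod = {{0, ids.size()}};  // a single sequence covering all ids
  input.dtype = paddle::PaddleDType::INT64;
  input.data = paddle::PaddleBuf(ids.data(), ids.size() * sizeof(int64_t));

  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return 0;
}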
From 09016df8df61cff85a58c0dfd5a29e4feb575a97 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 5 Sep 2018 21:03:53 +0800
Subject: [PATCH 02/13] make analyzer run

---
 paddle/fluid/inference/analysis/CMakeLists.txt     | 14 +-------------
 .../inference/analysis/analyzer_lac_tester.cc      | 10 ++++++++--
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 43201fb0bb..dce74ee3f9 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -93,19 +93,7 @@ if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
 endif()

 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
-  analysis_predictor
-  # ir
-  fc_fuse_pass
-  fc_lstm_fuse_pass
-  seq_concat_fc_fuse_pass
-  graph_viz_pass
-  infer_clean_graph_pass
-  graph_pattern_detector
-  infer_clean_graph_pass
-  attention_lstm_fuse_pass
-  paddle_inference_api
-  pass
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
   ARGS --infer_model=${LAC_INSTALL_DIR}/model
        --infer_data=${LAC_INSTALL_DIR}/data.txt)

diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 2aef25603f..5efee95030 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -16,6 +16,7 @@
 #include 
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -170,9 +171,14 @@ void TestLACPrediction(const std::string &model_path,
   GetOneBatch(&input_slots, &data, batch_size);
   std::unique_ptr<PaddlePredictor> predictor;
   if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.model_dir = model_path;
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
     predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(
-            config);
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
     predictor =
         CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
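At this point in the series, AnalysisConfig simply extends NativeConfig with IR switches, which is why the hunk above only needs to set the extra enable_ir_optim field. A minimal sketch of the analysis-side construction, under that assumption:

#include <memory>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Build an analysis predictor; all NativeConfig fields carry over.
std::unique_ptr<paddle::PaddlePredictor> MakeAnalysisPredictor(
    const std::string &model_dir) {
  paddle::AnalysisConfig cfg;
  cfg.model_dir = model_dir;
  cfg.use_gpu = false;
  cfg.device = 0;
  cfg.specify_input_name = true;
  cfg.enable_ir_optim = true;  // turn on the Analyzer's IR fuse passes
  return paddle::CreatePaddlePredictor<
      paddle::AnalysisConfig, paddle::PaddleEngineKind::kAnalysis>(cfg);
}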
+ +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::string GenNodeName(const std::string& prefix, const std::string& name) { + return prefix + "/" + name; +} + +void BuildPattern(PDPattern* pattern, const std::string& name_scope, + bool with_fc_bias) { + PDNode* x = pattern->NewNode(name_scope, "x") + ->assert_is_op_input("mul") + ->assert_var_not_persistable(); + auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias); + fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. + patterns::GRU(pattern, name_scope, fc_out); + VLOG(3) << "\n" << pattern->DotString(); +} + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + BuildPattern(pattern, name_scope, with_fc_bias); + + // Create New OpDesc + auto gru_creater = [&](int gru, int x, int weight_x, int weight_h, int bias, + int hidden, int fc_bias) { +#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); + GET_NODE(x); + GET_NODE(weight_x); + GET_NODE(weight_h); + GET_NODE(bias); + GET_NODE(hidden); + GET_NODE(gru); + + OpDesc op_desc; + op_desc.SetType("fusion_gru"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); + SET_IN(X, x); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + SET_IN(Bias, bias); +#undef SET_IN + if (with_fc_bias) { + // Add FC-bias with LSTM-bias and create a new weight + PADDLE_ENFORCE(scope); + const std::string& new_bias_var = name_scope + "_bias.new"; + auto* bias_var = scope->Var(new_bias_var); + PADDLE_ENFORCE(bias_var); + auto* bias_tensor = bias_var->GetMutable(); + auto* gru_bias_var = scope->FindVar(bias_n->Name()); + PADDLE_ENFORCE(gru_bias_var); + const auto& gru_bias_tenosr = gru_bias_var->Get(); + bias_tensor->Resize(gru_bias_tenosr.dims()); + + GET_NODE(fc_bias); + auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + // new bias = fc bias + gru bias + auto* data = bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; + } + op_desc.SetInput("Bias", {new_bias_var}); + } +#undef GET_NODE + + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + + // Create temp variables. + // TODO(TJ): clean code + scope->Var(name_scope + "/ReorderedH0.new") + ->GetMutable(); + scope->Var(name_scope + "/XX.new")->GetMutable(); + scope->Var(name_scope + "/BatchedInput.new") + ->GetMutable(); + scope->Var(name_scope + "/BatchedOut.new") + ->GetMutable(); + op_desc.SetOutput("ReorderedH0", {name_scope + "/ReorderedH0.new"}); + op_desc.SetOutput("XX", {name_scope + "/XX.new"}); + op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"}); + op_desc.SetOutput("BatchedOut", {name_scope + "/BatchedOut.new"}); + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + + IR_NODE_LINK_TO(x_n, op); + IR_NODE_LINK_TO(weight_x_n, op); + IR_NODE_LINK_TO(weight_h_n, op); + IR_NODE_LINK_TO(bias_n, op); + IR_NODE_LINK_TO(op, hidden_n); + // h0? 
+    return op;
+  };
+
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+#define GET_NODE(name__)                                \
+  std::string name__##key = name_scope + "/" + #name__; \
+  auto* name__##n = pattern->RetrieveNode(name__##key); \
+  PADDLE_ENFORCE(name__##n);                            \
+  PADDLE_ENFORCE(subgraph.count(name__##n));            \
+  Node* name__##_n = subgraph.at(name__##n);            \
+  int name__ __attribute__((unused)) = name__##_n->id();
+
+    GET_NODE(x);
+    GET_NODE(w);
+    GET_NODE(mul);
+    GET_NODE(fc_out);
+    GET_NODE(Weight);
+    GET_NODE(gru);
+    GET_NODE(Bias);
+    GET_NODE(Hidden);
+
+    if (with_fc_bias) {
+      GET_NODE(fc_bias);
+      GET_NODE(elementwise_add);
+      gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, gru_n, elementwise_add_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      gru_creater(gru, x, w, Weight, Bias, Hidden, -1);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes({mul_n, gru_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+#undef GET_NODE
+
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 false /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulGRUFusePass);
+REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCGRUFusePass);
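The bias handling in gru_creater rests on simple arithmetic: gru(x·Wx + b_fc) adds b_fc and the GRU's own bias to the same pre-activation, so the two vectors can be summed once offline. The folding in isolation (a sketch; the pass performs this element-wise over the LoDTensor data):

#include <cstddef>
#include <vector>

// fused_bias[i] = fc_bias[i] + gru_bias[i], exactly what the loop in the
// pass computes before wiring the new "Bias" input of fusion_gru.
std::vector<float> FoldBias(const std::vector<float> &fc_bias,
                            const std::vector<float> &gru_bias) {
  std::vector<float> fused(gru_bias.size());
  for (size_t i = 0; i < fused.size(); ++i) {
    fused[i] = fc_bias[i] + gru_bias[i];
  }
  return fused;
}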
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
new file mode 100644
index 0000000000..63e1c72bfb
--- /dev/null
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include 
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// The MulGRUFusePass and FCGRUFusePass will fuse to the same FusionGRU op.
+
+class FCGRUFusePass : public FusePassBase {
+ public:
+  virtual ~FCGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_gru_fuse"};
+};
+
+// Just FC without bias
+class MulGRUFusePass : public FusePassBase {
+ public:
+  virtual ~MulGRUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  const std::string name_scope_{"fc_nobias_gru_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 434bee4cce..8dfe36f781 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -565,6 +565,7 @@ PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
   return fc_out;
 }
+
 PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
                        PDNode* x) {
   x->assert_is_op_input("lstm", "Input");
@@ -589,6 +590,32 @@ PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
   lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
   return Hidden;
 }
+
+PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope,
+                      PDNode* x) {
+  x->assert_is_op_input("gru", "Input");
+  auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru");
+#define NEW_NODE(arg__, io__)                        \
+  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+                    ->assert_is_op_##io__("gru", #arg__);
+
+  NEW_NODE(Weight, input);
+  // TODO(Superjomn): upgrade the fuse framework to support optional.
+  // H0 and bias are optional
+  NEW_NODE(Bias, input);  // also optional
+  // NEW_NODE(H0, input);
+
+  NEW_NODE(Hidden, output);
+  // below are intermediate
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchResetHiddenPrev, output);
+  NEW_NODE(BatchHidden, output);
+
+  gru_op->LinksFrom({x, Weight, Bias});
+  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
+  return Hidden;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index eacea1750f..71e4c36d9b 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -298,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,

 PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);

+PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x);
+
 }  // namespace patterns

 #define IR_NODE_LINK_TO(a, b) \

From 74f95b8da05a6a7f7487222b8f004f40a3156c05 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 6 Sep 2018 11:09:23 +0800
Subject: [PATCH 04/13] fix redefine macro

---
 .../framework/ir/graph_pattern_detector.cc   | 35 +++++++++----------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 8dfe36f781..8b1e653ec8 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -566,25 +566,26 @@ PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
   return fc_out;
 }

+#define NEW_NODE(op__, arg__, io__)                  \
+  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+                    ->assert_is_op_##io__(#op__, #arg__);
+
 PDNode* patterns::LSTM(PDPattern* pattern, const std::string&
name_scope, PDNode* x) { x->assert_is_op_input("lstm", "Input"); auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm"); -#define NEW_NODE(arg__, io__) \ - auto* arg__ = pattern->NewNode(name_scope, #arg__) \ - ->assert_is_op_##io__("lstm", #arg__); // Currently, the H0 and C0 are optional // TODO(Superjomn) upgrade the fuse framework to support optional. // NEW_NODE(H0, input); // NEW_NODE(C0, input); - NEW_NODE(Weight, input); - NEW_NODE(Bias, input); + NEW_NODE(lstm, Weight, input); + NEW_NODE(lstm, Bias, input); - NEW_NODE(Hidden, output); - NEW_NODE(Cell, output); - NEW_NODE(BatchGate, output); - NEW_NODE(BatchCellPreAct, output); + NEW_NODE(lstm, Hidden, output); + NEW_NODE(lstm, Cell, output); + NEW_NODE(lstm, BatchGate, output); + NEW_NODE(lstm, BatchCellPreAct, output); lstm_op->LinksFrom({x, Weight, Bias}); lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); @@ -595,26 +596,24 @@ PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x) { x->assert_is_op_input("gru", "Input"); auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru"); -#define NEW_NODE(arg__, io__) \ - auto* arg__ = pattern->NewNode(name_scope, #arg__) \ - ->assert_is_op_##io__("gru", #arg__); - NEW_NODE(Weight, input); + NEW_NODE(gru, Weight, input); // TODO(Superjomn): upgrade the fuse framework to support optional. // H0 and bias are optional - NEW_NODE(Bias, input); // also optional + NEW_NODE(gru, Bias, input); // also optional // NEW_NODE(H0, input); - NEW_NODE(Hidden, output); + NEW_NODE(gru, Hidden, output); // below are intermediate - NEW_NODE(BatchGate, output); - NEW_NODE(BatchResetHiddenPrev, output); - NEW_NODE(BatchHidden, output); + NEW_NODE(gru, BatchGate, output); + NEW_NODE(gru, BatchResetHiddenPrev, output); + NEW_NODE(gru, BatchHidden, output); gru_op->LinksFrom({x, Weight, Bias}); gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); return Hidden; } +#undef NEW_NODE } // namespace ir } // namespace framework From 4d774953c6cb584f084129746b4d2aea0e59237a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 6 Sep 2018 11:53:25 +0800 Subject: [PATCH 05/13] enable fc gru fuse pass --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 18 ++++++------- .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 11 ++++---- paddle/fluid/inference/analysis/analyzer.h | 4 +++ .../inference/analysis/analyzer_lac_tester.cc | 25 +++++++++++++++++++ paddle/fluid/inference/api/CMakeLists.txt | 1 + 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f5235f70ad..6c7f972589 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -24,6 +24,7 @@ pass_library(fc_fuse_pass) pass_library(attention_lstm_fuse_pass) pass_library(infer_clean_graph_pass) pass_library(fc_lstm_fuse_pass) +pass_library(fc_gru_fuse_pass) pass_library(seq_concat_fc_fuse_pass) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 1e7b49620c..4a08beee7d 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -20,12 +20,8 @@ namespace paddle { namespace framework { namespace ir { -std::string GenNodeName(const std::string& prefix, const std::string& name) { - return prefix + "/" + name; -} - 
-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
+static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                         bool with_fc_bias) {
   PDNode* x = pattern->NewNode(name_scope, "x")
                   ->assert_is_op_input("mul")
                   ->assert_var_not_persistable();
@@ -35,8 +31,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
   VLOG(3) << "\n" << pattern->DotString();
 }

-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
-                bool with_fc_bias) {
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
@@ -108,7 +104,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
     auto* op = graph->CreateOpNode(&op_desc);
     PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+    // auto* scope = graph->Get<Scope*>(kParamScopeAttr);

     IR_NODE_LINK_TO(x_n, op);
     IR_NODE_LINK_TO(weight_x_n, op);
@@ -189,5 +185,5 @@ std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle

-REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulGRUFusePass);
-REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCGRUFusePass);
+REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
+REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 0d69dfa79a..5fa3fcb9dc 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -19,12 +19,13 @@ namespace paddle {
 namespace framework {
 namespace ir {

-std::string GenNodeName(const std::string& prefix, const std::string& name) {
+static std::string GenNodeName(const std::string& prefix,
+                               const std::string& name) {
   return prefix + "/" + name;
 }

-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
+static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                         bool with_fc_bias) {
   PDNode* x = pattern->NewNode(name_scope, "x")
                   ->assert_is_op_input("mul")
                   ->assert_var_not_persistable();
@@ -34,8 +35,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
   // LOG(INFO) << "\n" << pattern->DotString();
 }

-int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
-                bool with_fc_bias) {
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
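The static qualifiers added above are not cosmetic: fc_gru_fuse_pass.cc and fc_lstm_fuse_pass.cc both defined helpers named GenNodeName, BuildPattern, and BuildFusion with external linkage, so linking both objects into one binary produced duplicate symbols. A compressed illustration of the fix (two hypothetical translation units, not the actual files):

// a.cc -- internal linkage: the symbol never leaves this object file.
static int BuildFusion() { return 1; }

// b.cc -- an identically named helper no longer collides at link time.
static int BuildFusion() { return 2; }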
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 3fdd2b9ec7..7800fc90b1 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,6 +36,8 @@ limitations under the License. */
 */
 #include 
+#include 
+#include 
 #include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
@@ -66,6 +68,8 @@ class Analyzer : public OrderedRegistry {
       "attention_lstm_fuse_pass",  //
       "fc_lstm_fuse_pass",         //
       "mul_lstm_fuse_pass",        //
+      "fc_gru_fuse_pass",          //
+      "mul_gru_fuse_pass",         //
       "seq_concat_fc_fuse_pass",   //
       "fc_fuse_pass",              //
   }};
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 5efee95030..a6e8351c4f 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include 
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -237,6 +238,30 @@ void TestLACPrediction(const std::string &model_path,
     for (size_t i = 0; i < size; ++i) {
       EXPECT_EQ(pdata_ref[i], pdata[i]);
     }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
+    LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
+
+    // ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+    // LOG(INFO) << fuse_statis.at("fc_gru_fuse");
   }
 }
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index e976b9397d..330ea04495 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -50,6 +50,7 @@ cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_a
     pass
     fc_fuse_pass
     fc_lstm_fuse_pass
+    fc_gru_fuse_pass
     seq_concat_fc_fuse_pass
     graph_viz_pass
     infer_clean_graph_pass
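How a name added to all_ir_passes becomes a running pass: REGISTER_PASS records a factory in a global registry, and the IR pass manager looks passes up by string. A hedged sketch of that lookup (the exact registry call shape is an assumption based on framework/ir/pass.h):

#include <memory>
#include <string>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

std::unique_ptr<paddle::framework::ir::Graph> ApplyPassByName(
    const std::string &name,
    std::unique_ptr<paddle::framework::ir::Graph> graph) {
  // e.g. name == "fc_gru_fuse_pass", registered by REGISTER_PASS above
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(name);
  return pass->Apply(std::move(graph));
}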
From 6b104c90d353409c2aacd34321bc6cf5407eb0e5 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 6 Sep 2018 13:51:34 +0800
Subject: [PATCH 06/13] fix profile

---
 .../inference/analysis/analyzer_lac_tester.cc | 19 +++++++------------
 .../fluid/inference/api/analysis_predictor.cc | 13 +++++++++++++
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index a6e8351c4f..1df1ade25f 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -178,6 +178,7 @@ void TestLACPrediction(const std::string &model_path,
     cfg.device = 0;
     cfg.specify_input_name = true;
     cfg.enable_ir_optim = true;
+    cfg.ir_passes.push_back("fc_gru_fuse_pass");
     predictor =
         CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
@@ -208,13 +209,6 @@ void TestLACPrediction(const std::string &model_path,
   PrintTime(timer.toc(), batch_size, repeat);

   // check result
-  if (use_analysis) {
-    // run once for comparison as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-  }
-
   EXPECT_EQ(outputs_slots.size(), 1UL);
   auto &out = outputs_slots[0];
   size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
@@ -228,6 +222,10 @@ void TestLACPrediction(const std::string &model_path,
   }

   if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
     EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
     auto &ref_out = ref_outputs_slots[0];
     size_t ref_size =
@@ -256,12 +254,9 @@ void TestLACPrediction(const std::string &model_path,
   }
   LOG(INFO) << "has num ops: " << num_ops;
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
-  LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
+
-  // ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  // LOG(INFO) << fuse_statis.at("fc_gru_fuse");
+  LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
+  // LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index a8fa677202..82d673fd15 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -22,12 +22,25 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(profile);

 namespace paddle {

 bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope>& parent_scope) {
   VLOG(3) << "Predictor::init()";
+#if !defined(_WIN32)
+  if (FLAGS_profile) {
+    LOG(WARNING) << "Profiler is activated, which might affect the performance";
+    LOG(INFO) << "You can turn it off by setting the gflag '-profile false'";
+    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
+                                           : platform::ProfilerState::kCPU;
+    platform::EnableProfiler(tracking_device);
+  }
+#endif
+
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
     LOG(WARNING) << "ir optimize only supports CPU currently";

From ca30127e0a048de5e56e249d54d8836422ac2140 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 7 Sep 2018 00:03:13 +0800
Subject: [PATCH 07/13] fix compile error undef registrar pass

---
 paddle/fluid/inference/analysis/analyzer.h |  1 -
 paddle/fluid/inference/api/CMakeLists.txt  | 14 +++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 6189548a7b..399afbe64a 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -38,7 +38,6 @@ limitations under the License.
*/ #include #include #include - #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/pass_manager.h" diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index b69948f40a..f944c9fdec 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -44,7 +44,19 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api + analysis + ir_pass_manager + pass + fc_fuse_pass + fc_lstm_fuse_pass + fc_gru_fuse_pass + seq_concat_fc_fuse_pass + graph_viz_pass + infer_clean_graph_pass + graph_pattern_detector + infer_clean_graph_pass + attention_lstm_fuse_pass) cc_test(test_paddle_inference_api SRCS api_tester.cc From 7eebb905235c5780350d92902776a4e0c267c87f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 11:53:19 +0800 Subject: [PATCH 08/13] fix conflicts --- paddle/fluid/inference/analysis/analyzer_lac_tester.cc | 2 +- paddle/fluid/inference/api/helper.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index 5740faa746..7917152428 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -192,7 +192,7 @@ void TestLACPrediction(const std::string &model_path, sum += timer.toc(); } } - PrintTime(sum, batch_size, repeat); + PrintTime(batch_size, repeat, 1, 0, sum / batch_size); return; } timer.tic(); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 2c2ac656e8..0ab2542f34 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -124,9 +124,11 @@ std::string DescribeTensor(const PaddleTensor &tensor) { void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency) { + LOG(INFO) << "====================================="; LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms"; + LOG(INFO) << "====================================="; } } // namespace inference From c9bd2d50f1d9c0db255ebc132b7c74438f3b3bba Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 12:51:36 +0800 Subject: [PATCH 09/13] refine fc and gru pattern --- .../framework/ir/graph_pattern_detector.cc | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 37566b7621..69a323a8bd 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -519,50 +519,41 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, bool with_bias) { - // Create Operators - PDNode* elementwise_add_op{nullptr}; + // mul op auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul"); - if (with_bias) { - elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") 
- ->assert_is_op("elementwise_add"); - } - // Create variables - // w auto* mul_weight_var = pattern->NewNode(name_scope, "w") ->AsInput() ->assert_is_persistable_var() - ->assert_is_op_nth_input("mul", "Y", 0); - PDNode* mul_out_var{nullptr}; + ->assert_is_op_input("mul", "Y"); + + PDNode* fc_out{nullptr}; if (with_bias) { + PDNode* elementwise_add_op{nullptr}; + PDNode *mul_out_var{nullptr}, *bias{nullptr}; + elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") + ->assert_is_op("elementwise_add"); // intermediate variable, will be removed in the IR after fuse. mul_out_var = pattern->NewNode(name_scope, "mul_out") ->AsIntermediate() ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add"); - } - PDNode *bias{nullptr}, *fc_out{nullptr}; - if (with_bias) { + ->assert_is_op_input("elementwise_add", "X"); // bias bias = pattern->NewNode(name_scope, "fc_bias") - ->assert_is_op_input("elementwise_add") - ->AsInput(); + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("elementwise_add", "Y"); // output fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("elementwise_add"); + ->assert_is_op_output("elementwise_add", "Out"); + mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var}); + elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); } else { fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("mul"); - } - - if (with_bias) { - mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var}); - elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); - } else { + ->assert_is_op_output("mul", "Out"); mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); } - return fc_out; } @@ -609,6 +600,10 @@ PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, NEW_NODE(gru, BatchResetHiddenPrev, output); NEW_NODE(gru, BatchHidden, output); + BatchGate->AsIntermediate(); + BatchResetHiddenPrev->AsIntermediate(); + BatchHidden->AsIntermediate(); + gru_op->LinksFrom({x, Weight, Bias}); gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); return Hidden; From df0c695618696378c8320dd85661fdaa276e7407 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 12:53:15 +0800 Subject: [PATCH 10/13] fix fusion gru pass and enable it --- paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 98 +++++++++++-------- .../inference/analysis/analyzer_lac_tester.cc | 1 - 2 files changed, 56 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 4a08beee7d..90d8d5c042 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -28,7 +28,7 @@ static void BuildPattern(PDPattern* pattern, const std::string& name_scope, auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias); fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. patterns::GRU(pattern, name_scope, fc_out); - VLOG(3) << "\n" << pattern->DotString(); + VLOG(3) << "fc_gru pattern \n" << pattern->DotString(); } static int BuildFusion(Graph* graph, const std::string& name_scope, @@ -51,65 +51,72 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, OpDesc op_desc; op_desc.SetType("fusion_gru"); + +#define NEW_NAME(x) name_scope + "/at." 
#x ".new" #define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); SET_IN(X, x); SET_IN(WeightX, weight_x); SET_IN(WeightH, weight_h); - SET_IN(Bias, bias); + if (with_fc_bias) { + op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()}); + } else { + SET_IN(Bias, bias); + } #undef SET_IN + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + +#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)}) + SET_IMTERMEDIATE_OUT(ReorderedH0); + SET_IMTERMEDIATE_OUT(XX); + SET_IMTERMEDIATE_OUT(BatchedInput); + SET_IMTERMEDIATE_OUT(BatchedOut); +#undef SET_IMTERMEDIATE_OUT + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + PADDLE_ENFORCE(scope); if (with_fc_bias) { - // Add FC-bias with LSTM-bias and create a new weight - PADDLE_ENFORCE(scope); - const std::string& new_bias_var = name_scope + "_bias.new"; - auto* bias_var = scope->Var(new_bias_var); - PADDLE_ENFORCE(bias_var); - auto* bias_tensor = bias_var->GetMutable(); + // Fusion GRU bias = fcbias + grubias + auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name()); + auto* out_bias_tensor = + fusion_bias_var->GetMutable(); + PADDLE_ENFORCE(fusion_bias_var); + GET_NODE(fc_bias); + PADDLE_ENFORCE(fc_bias_n); auto* gru_bias_var = scope->FindVar(bias_n->Name()); + auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); PADDLE_ENFORCE(gru_bias_var); + PADDLE_ENFORCE(fc_bias_var); const auto& gru_bias_tenosr = gru_bias_var->Get(); - bias_tensor->Resize(gru_bias_tenosr.dims()); - - GET_NODE(fc_bias); - auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); const auto& fc_bias_tensor = fc_bias_var->Get(); // new bias = fc bias + gru bias - auto* data = bias_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < bias_tensor->numel(); i++) { + out_bias_tensor->Resize(gru_bias_tenosr.dims()); + auto* data = out_bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < out_bias_tensor->numel(); i++) { data[i] = fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; } - op_desc.SetInput("Bias", {new_bias_var}); } #undef GET_NODE - op_desc.SetInput("H0", {}); - op_desc.SetOutput("Hidden", {hidden_n->Name()}); - op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); - // TODO(TJ): This should be a option for infer - op_desc.SetAttr("use_seq", true); - - // Create temp variables. 
-    // TODO(TJ): clean code
-    scope->Var(name_scope + "/ReorderedH0.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/XX.new")->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchedInput.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchedOut.new")
-        ->GetMutable<framework::LoDTensor>();
-    op_desc.SetOutput("ReorderedH0", {name_scope + "/ReorderedH0.new"});
-    op_desc.SetOutput("XX", {name_scope + "/XX.new"});
-    op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
-    op_desc.SetOutput("BatchedOut", {name_scope + "/BatchedOut.new"});
-
-    auto* op = graph->CreateOpNode(&op_desc);
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    // auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define NEW_IMTERMEDIATE_OUT(key) \
+  scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
+    NEW_IMTERMEDIATE_OUT(ReorderedH0);
+    NEW_IMTERMEDIATE_OUT(XX);
+    NEW_IMTERMEDIATE_OUT(BatchedInput);
+    NEW_IMTERMEDIATE_OUT(BatchedOut);
+#undef NEW_NAME
+#undef NEW_IMTERMEDIATE_OUT

     IR_NODE_LINK_TO(x_n, op);
     IR_NODE_LINK_TO(weight_x_n, op);
     IR_NODE_LINK_TO(weight_h_n, op);
-    IR_NODE_LINK_TO(bias_n, op);
+    IR_NODE_LINK_TO(bias_n, op);  // actually should link to new bias if have
     IR_NODE_LINK_TO(op, hidden_n);
     // h0?
@@ -127,26 +134,33 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
     GET_NODE(x);
-    GET_NODE(w);
+    GET_NODE(w);  // fc weight
     GET_NODE(mul);
     GET_NODE(fc_out);
     GET_NODE(Weight);
     GET_NODE(gru);
     GET_NODE(Bias);
     GET_NODE(Hidden);
+    // nodes need be removed
+    GET_NODE(BatchGate);
+    GET_NODE(BatchResetHiddenPrev);
+    GET_NODE(BatchHidden);

     if (with_fc_bias) {
+      GET_NODE(mul_out);
       GET_NODE(fc_bias);
       GET_NODE(elementwise_add);
       gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias);
       // Remove unneeded nodes.
       std::unordered_set<const Node*> marked_nodes(
-          {mul_n, gru_n, elementwise_add_n});
+          {mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n,
+           BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
       gru_creater(gru, x, w, Weight, Bias, Hidden, -1);
       // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes({mul_n, gru_n});
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
       GraphSafeRemoveNodes(graph, marked_nodes);
     }
 #undef GET_NODE
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 7917152428..56f773bf21 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -171,7 +171,6 @@ void TestLACPrediction(const std::string &model_path,
     cfg.device = 0;
     cfg.specify_input_name = true;
     cfg.enable_ir_optim = true;
-    cfg.ir_passes.push_back("fc_gru_fuse_pass");
     predictor =
         CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
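Patch 10's handler shows the general shape of a fuse pass: a pattern is registered, the detector calls back once per match with a map from pattern nodes to matched graph nodes, and the handler rewires then deletes the matched interior. Schematically (a sketch against the GraphPatternDetector API, not tied to any one revision):

using paddle::framework::ir::Graph;
using paddle::framework::ir::GraphPatternDetector;
using paddle::framework::ir::GraphSafeRemoveNodes;

void RunFuse(Graph *graph) {
  GraphPatternDetector gpd;
  // 1) describe the subgraph to match: x -> mul -> (elementwise_add) -> gru
  BuildPattern(gpd.mutable_pattern(), "fc_gru_fuse", /*with_fc_bias=*/true);
  // 2) per match: build fusion_gru, relink I/O, drop the dead interior nodes
  gpd(graph, [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) {
    // ... create the fused op and link its external inputs/outputs ...
    // GraphSafeRemoveNodes(g, marked_nodes);
  });
}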
From acfdbf029330e60037e4fff7cee9c00d99f031c5 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 7 Sep 2018 15:56:51 +0800
Subject: [PATCH 11/13] enable ner analysis test and refine lac

---
 .../fluid/inference/analysis/CMakeLists.txt   |  2 +-
 .../inference/analysis/analyzer_lac_tester.cc | 14 ++--
 .../inference/analysis/analyzer_ner_tester.cc | 74 ++++++++++++++++---
 .../inference/analysis/analyzer_tester.cc     |  2 -
 paddle/fluid/inference/api/CMakeLists.txt     | 15 +---
 paddle/fluid/inference/api/helper.h           |  6 +-
 6 files changed, 74 insertions(+), 39 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index ef55a0c28a..a115bc8f4a 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -81,7 +81,7 @@ if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
 endif()

 inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
   ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)

diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
index 56f773bf21..4ff7251473 100644
--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -15,11 +15,9 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -160,7 +158,7 @@ void TestLACPrediction(const std::string &model_path,
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots, ref_outputs_slots;
+  std::vector<PaddleTensor> input_slots, outputs_slots;
   DataRecord data(data_file, batch_size);
   GetOneBatch(&input_slots, &data, batch_size);
   std::unique_ptr<PaddlePredictor> predictor;
@@ -217,6 +215,7 @@ void TestLACPrediction(const std::string &model_path,
     // run once for comparison as reference
     auto ref_predictor =
         CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
     ref_predictor->Run(input_slots, &ref_outputs_slots);
     EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
     auto &ref_out = ref_outputs_slots[0];
@@ -246,9 +245,10 @@ void TestLACPrediction(const std::string &model_path,
   }
   LOG(INFO) << "has num ops: " << num_ops;
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-  LOG(INFO) << "fc fuse num:" << fuse_statis.at("fc_fuse");
-  // LOG(INFO) << "fc gru fuse num:" << fuse_statis.at("fc_gru_fuse");
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+  EXPECT_EQ(num_ops, 11);
 }
diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
index eaae09b051..f5c5d73aeb 100644
--- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
@@ -13,12 +13,11 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/analyzer.h"
-#include 
 #include 
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -112,7 +111,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                        48, 39, 38, 16, 25};

-void TestChineseNERPrediction() {
+void TestChineseNERPrediction(bool use_analysis) {
   NativeConfig config;
   config.prog_file = FLAGS_infer_model + "/__model__";
   config.param_file = FLAGS_infer_model + "/param";
   config.use_gpu = false;
   config.device = 0;
   config.specify_input_name = true;

-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  std::vector<PaddleTensor> input_slots;
-  std::vector<PaddleTensor> outputs;
+  std::vector<PaddleTensor> input_slots, outputs;
+  std::unique_ptr<PaddlePredictor> predictor;
   Timer timer;
+  if (use_analysis) {
+    AnalysisConfig cfg;
+    cfg.prog_file = FLAGS_infer_model + "/__model__";
+    cfg.param_file = FLAGS_infer_model + "/param";
+    cfg.use_gpu = false;
+    cfg.device = 0;
+    cfg.specify_input_name = true;
+    cfg.enable_ir_optim = true;
+    predictor =
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+  } else {
+    predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  }

   if (FLAGS_test_all_data) {
     LOG(INFO) << "test all data";
@@ -165,10 +176,51 @@ void TestChineseNERPrediction() {
   for (size_t i = 0; i < std::min(11UL, size); i++) {
     PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
   }
+
+  if (use_analysis) {
+    // run once for comparison as reference
+    auto ref_predictor =
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+    std::vector<PaddleTensor> ref_outputs_slots;
+    ref_predictor->Run(input_slots, &ref_outputs_slots);
+    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
+    auto &ref_out = ref_outputs_slots[0];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+    for (size_t i = 0; i < size; ++i) {
+      EXPECT_EQ(pdata_ref[i], result[i]);
+    }
+
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2); + EXPECT_EQ(num_ops, 14); + } } -// Directly infer with the original model. -TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); } +TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); } + +TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); } } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 4cf26d3c70..a496ae41aa 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -283,7 +283,6 @@ void TestDituRNNPrediction(bool use_analysis, bool activate_ir, base_predictor->Run(input_slots, &base_outputs); - LOG(INFO) << "===========profile result==========="; if (num_threads == 1) { // Prepare inputs. Timer timer; @@ -324,7 +323,6 @@ void TestDituRNNPrediction(bool use_analysis, bool activate_ir, threads[i].join(); } } - LOG(INFO) << "====================================="; if (use_analysis && activate_ir) { AnalysisPredictor *analysis_predictor = diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index f944c9fdec..5df486f345 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -44,20 +44,7 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api - analysis - ir_pass_manager - pass - fc_fuse_pass - fc_lstm_fuse_pass - fc_gru_fuse_pass - seq_concat_fc_fuse_pass - graph_viz_pass - infer_clean_graph_pass - graph_pattern_detector - infer_clean_graph_pass - attention_lstm_fuse_pass) - +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 0ab2542f34..f6893be428 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -124,11 +124,9 @@ std::string DescribeTensor(const PaddleTensor &tensor) { void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency) { - LOG(INFO) << "====================================="; - LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat + LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid - << ", latency: " << latency << "ms"; - LOG(INFO) << "====================================="; + << ", latency: " << latency << "ms ======"; } } // namespace inference From 3ea19b759649feabf45860e4e4c808c26845c3c7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 18:48:45 +0800 Subject: [PATCH 12/13] fix bug and fc pass ut --- paddle/fluid/framework/ir/graph_pattern_detector.cc | 9 ++++----- paddle/fluid/inference/analysis/analyzer_lac_tester.cc | 1 + paddle/fluid/inference/analysis/analyzer_ner_tester.cc | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 69a323a8bd..5ca7509515 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -536,22 +536,21 @@ PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, mul_out_var = pattern->NewNode(name_scope, "mul_out") ->AsIntermediate() ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add", "X"); + ->assert_is_op_input("elementwise_add"); // bias bias = pattern->NewNode(name_scope, "fc_bias") ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("elementwise_add", "Y"); + ->assert_is_op_input("elementwise_add"); // output fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output("elementwise_add"); mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var}); elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); } else { fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() - ->assert_is_op_output("mul", "Out"); + ->assert_is_op_output("mul"); mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); } return fc_out; diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index 4ff7251473..b906b32cf5 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc index f5c5d73aeb..661b047ed7 100644 --- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" From 5a2fc5b52f20b0c905a38ebd0fe206f88dadd649 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 7 Sep 2018 22:54:42 +0800 Subject: [PATCH 13/13] fix print error --- paddle/fluid/inference/analysis/analyzer_lac_tester.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index b906b32cf5..522d870db8 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -182,6 +182,7 @@ void TestLACPrediction(const std::string &model_path, Timer timer; if (test_all_data) { double sum = 0; + LOG(INFO) << "Total number of samples: " << data.datasets.size(); for (int i = 0; i < repeat; i++) { for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { GetOneBatch(&input_slots, &data, batch_size); @@ -190,7 +191,9 @@ void TestLACPrediction(const std::string &model_path, sum += timer.toc(); } } - 
PrintTime(batch_size, repeat, 1, 0, sum / batch_size); + PrintTime(batch_size, repeat, 1, 0, sum / repeat); + LOG(INFO) << "Average latency of each sample: " + << sum / repeat / data.datasets.size() << " ms"; return; } timer.tic();
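For reference, the arithmetic patch 13 corrects, in isolation: sum accumulates wall time across repeat sweeps over the whole dataset, so dividing by batch_size reported a meaningless number. Per-sweep latency is sum / repeat, and per-sample latency divides once more by the dataset size. A self-contained check:

#include <cstdio>

double PerSweepMs(double sum_ms, int repeat) { return sum_ms / repeat; }

double PerSampleMs(double sum_ms, int repeat, int num_samples) {
  return PerSweepMs(sum_ms, repeat) / num_samples;
}

int main() {
  // 10 sweeps over 300 samples taking 1200 ms in total: 0.4 ms per sample.
  std::printf("%.3f ms/sample\n", PerSampleMs(1200.0, 10, 300));
  return 0;
}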