Refine infer api test (#13472)

* refine analyzer_nlp_tester * refine analyzer_rnn/vis_tester
7 years ago · b75887514e
parent d4570f041f
commit b75887514e
7 changed files with 383 additions and 420 deletions
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@ -103,108 +103,74 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({input_tensor});
 }

-const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
-                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
-                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
-                                14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-
-void TestLACPrediction(const std::string &model_path,
-                       const std::string &data_file, const int batch_size,
-                       const int repeat, bool use_analysis = false) {
-  AnalysisConfig cfg;
-  cfg.model_dir = model_path;
-  cfg.use_gpu = false;
-  cfg.device = 0;
-  cfg.specify_input_name = true;
-  cfg.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg with the settings used by the LAC tests in this file.
+  cfg->model_dir = FLAGS_infer_model;  // model directory comes from the --infer_model flag
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}

-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  GetOneBatch(&input_slots, &data, batch_size);
-  std::unique_ptr<PaddlePredictor> predictor;
-  if (use_analysis) {
-    predictor =
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
-  } else {
-    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  }
-  for (int i = 0; i < FLAGS_burning; i++) {
-    predictor->Run(input_slots, &outputs_slots);
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1;
+  LOG(INFO) << "number of samples: " << epoch;
+  for (int bid = 0; bid < epoch; ++bid) {
+    GetOneBatch(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
  }
-  Timer timer;
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      GetOneBatch(&input_slots, &data, batch_size);
-      input_slots_all.emplace_back(input_slots);
-    }
-    LOG(INFO) << "total number of samples: " << data.datasets.size();
-    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
-    return;
-  }
-  timer.tic();
-  for (int i = 0; i < repeat; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
+}

-  // check result
-  EXPECT_EQ(outputs_slots.size(), 1UL);
-  auto &out = outputs_slots[0];
-  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                [](int a, int b) { return a * b; });
-  size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-  PADDLE_ENFORCE_GT(size, 0);
-  EXPECT_GE(size, batch1_size);
-  int64_t *pdata = static_cast<int64_t *>(out.data.data());
-  for (size_t i = 0; i < batch1_size; ++i) {
-    EXPECT_EQ(pdata[i], lac_ref_data[i]);
-  }
+// Easy for profiling independently.
+TEST(Analyzer_LAC, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  if (use_analysis) {
-    // run once for comparion as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-    std::vector<PaddleTensor> ref_outputs_slots;
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    CompareResult(ref_outputs_slots, outputs_slots);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    const int64_t lac_ref_data[] = {
+        24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
+        44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
+        15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+    PADDLE_ENFORCE_GE(size, batch1_size);
+    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
+    for (size_t i = 0; i < batch1_size; ++i) {
+      EXPECT_EQ(pdata[i], lac_ref_data[i]);
    }
-    LOG(INFO) << "has num ops: " << num_ops;
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
-    EXPECT_EQ(num_ops, 11);
  }
 }

-TEST(Analyzer_LAC, native) {
-  LOG(INFO) << "LAC with native";
-  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat);
+// Check the fuse status: fc_fuse and fc_gru_fuse must be reported with the expected counts.
+TEST(Analyzer_LAC, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;  // passed to GetFuseStatis as an out-pointer, checked below
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));  // ASSERT (not EXPECT) so the .at() lookups below never see a missing key
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+  EXPECT_EQ(num_ops, 11);
 }

-TEST(Analyzer_LAC, analysis) {
-  LOG(INFO) << "LAC with analysis";
-  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, true);
+// Compare the results obtained with NativeConfig and with AnalysisConfig on the same inputs.
+TEST(Analyzer_LAC, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // one batch, or every batch when --test_all_data is set
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }

 }  // namespace analysis
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@ -95,97 +95,73 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-// the first inference result
-const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
-                                       48, 39, 38, 16, 25};
-
-void TestChineseNERPrediction(bool use_analysis) {
-  AnalysisConfig cfg;
-  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  cfg.param_file = FLAGS_infer_model + "/param";
-  cfg.use_gpu = false;
-  cfg.device = 0;
-  cfg.specify_input_name = true;
-  cfg.enable_ir_optim = true;
-
-  std::vector<PaddleTensor> input_slots, outputs;
-  std::unique_ptr<PaddlePredictor> predictor;
-  Timer timer;
-  if (use_analysis) {
-    predictor =
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
-  } else {
-    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  }
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg with the settings used by the Chinese NER tests in this file.
+  cfg->prog_file = FLAGS_infer_model + "/__model__";  // program and parameter files both live under --infer_model
+  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}

-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
-      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-      input_slots_all.emplace_back(input_slots);
-    }
-    LOG(INFO) << "total number of samples: " << data.num_samples;
-    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
-    return;
-  }
-  // Prepare inputs.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-
-  timer.tic();
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    predictor->Run(input_slots, &outputs);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
  }
-  PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);
+}

-  PADDLE_ENFORCE(outputs.size(), 1UL);
-  auto &out = outputs[0];
-  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                [](int a, int b) { return a * b; });
-  PADDLE_ENFORCE_GT(size, 0);
-  int64_t *result = static_cast<int64_t *>(out.data.data());
-  for (size_t i = 0; i < std::min(11UL, size); i++) {
-    PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
-  }
+// Easy for profiling independently.
+TEST(Analyzer_Chinese_ner, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  if (use_analysis) {
-    // run once for comparion as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-    std::vector<PaddleTensor> ref_outputs_slots;
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    CompareResult(ref_outputs_slots, outputs);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
+                                           48, 39, 38, 16, 25};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
+    for (size_t i = 0; i < std::min(11UL, size); i++) {
+      EXPECT_EQ(result[i], chinese_ner_result_data[i]);
    }
-    LOG(INFO) << "has num ops: " << num_ops;
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
-    EXPECT_EQ(num_ops, 14);
  }
 }

-TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
+// Check the fuse status
+TEST(Analyzer_Chinese_ner, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);

-TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
+  EXPECT_EQ(num_ops, 14);
+}
+
+// Compare the results obtained with NativeConfig and with AnalysisConfig on the same inputs.
+TEST(Analyzer_Chinese_ner, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // one batch, or num_samples / batch_size batches when --test_all_data is set
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}

 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@ -25,6 +25,7 @@ struct DataRecord {
  std::vector<size_t> lod1, lod2, lod3;
  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
      rnn_minute_datas;
+  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
@ -97,6 +98,7 @@ struct DataRecord {
      week_data_all.push_back(std::move(week_data));
      minute_data_all.push_back(std::move(minute_data));
    }
+    num_samples = num_lines;
  }
 };
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@ -147,89 +149,72 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-// Test with a really complicate model.
-void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = activate_ir;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default
-  config.ir_passes.clear();  // Do not exclude any pass.
-
-  int batch_size = FLAGS_batch_size;
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg with the settings used by the rnn1 tests in this file.
+  cfg->prog_file = FLAGS_infer_model + "/__model__";  // program and parameter files both live under --infer_model
+  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+  cfg->ir_passes.clear();  // Do not exclude any pass.
+}

-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  // Prepare inputs.
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}

-  base_predictor->Run(input_slots, &base_outputs);
+// Easy for profiling independently.
+TEST(Analyzer_rnn1, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  input_slots_all.emplace_back(input_slots);
-  if (num_threads == 1) {
-    TestOneThreadPrediction(config, input_slots_all, &outputs);
-    CompareResult(outputs, base_outputs);
-  } else {
-    // only return the output of first thread
-    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
-  }
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+}

-  if (use_analysis && activate_ir) {
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
+// Check the fuse status
+TEST(Analyzer_rnn1, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);

-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+  EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
+  EXPECT_EQ(num_ops,
+            13);  // After graph optimization, only 13 operators exist.
+}

-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-    EXPECT_EQ(num_ops,
-              13);  // After graph optimization, only 13 operators exists.
-  }
+// Compare the results obtained with NativeConfig and with AnalysisConfig on the same inputs.
+TEST(Analyzer_rnn1, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // one batch, or num_samples / batch_size batches when --test_all_data is set
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }

-// Inference with analysis and IR, easy for profiling independently.
-TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
+// Test Multi-Thread.
+TEST(Analyzer_rnn1, multi_thread) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-// Other unit-tests of RNN1, test different options of use_analysis,
-// activate_ir and multi-threads.
-TEST(Analyzer, RNN_tests) {
-  int num_threads[2] = {1, 4};
-  for (auto i : num_threads) {
-    // Directly infer with the original model.
-    TestRNN1Prediction(false, false, i);
-    // Inference with the original model with the analysis turned on, the
-    // analysis module will transform the program to a data flow graph.
-    TestRNN1Prediction(true, false, i);
-    // Inference with analysis and IR. The IR module will fuse some large
-    // kernels.
-    TestRNN1Prediction(true, true, i);
-  }
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
 }

 }  // namespace inference
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@ -41,6 +24,7 @@ struct DataRecord {
  std::vector<size_t> lod;
  std::vector<std::vector<float>> rnn_link_data;
  std::vector<float> result_data;
+  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
@ -100,6 +84,7 @@ struct DataRecord {
        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
      }
    }
+    num_samples = num_lines / 2;
  }
 };
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@ -118,64 +103,58 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({feed_tensor});
 }

-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<float> &base_result) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_result[i], 1e-3);
-    }
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg with the settings used by the rnn2 tests in this file.
+  cfg->prog_file = FLAGS_infer_model + "/__model__";  // program and parameter files both live under --infer_model
+  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {  // Appends one entry per batch to *inputs.
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;  // integer division: a trailing partial batch is not counted
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);  // input_slots is reused each iteration, so a copy is stored
   }
 }
-// Test with a really complicate model.
-void TestRNN2Prediction() {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = true;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default

-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
+// Easy for profiling independently.
+TEST(Analyzer_rnn2, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-  Timer timer1;
-  timer1.tic();
-  for (int i = 0; i < num_times; i++) {
-    base_predictor->Run(input_slots, &base_outputs);
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *result = static_cast<float *>(outputs[0].data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
+    }
  }
-  PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
+}

-  Timer timer2;
-  timer2.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
-  PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_rnn2, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);

-  CompareResult(base_outputs, data.result_data);
-  CompareResult(outputs, data.result_data);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }

-TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
-
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@ -46,54 +46,63 @@ struct DataReader {
  std::unique_ptr<std::ifstream> file;
 };

-void Main(int batch_size) {
-  // shape --
-  // Create Predictor --
-  AnalysisConfig config;
-  config.model_dir = FLAGS_infer_model;
-  config.use_gpu = false;
-  config.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg with the settings used by the text classification tests in this file.
+  cfg->model_dir = FLAGS_infer_model;  // model directory comes from the --infer_model flag
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}

-  std::vector<PaddleTensor> input_slots, output_slots;
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  std::vector<PaddleTensor> input_slots;
  DataReader reader(FLAGS_infer_data);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    int num_batches = 0;
-    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
-      input_slots_all.emplace_back(input_slots);
-      ++num_batches;
-    }
-    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
-    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
-    return;
+  int num_batches = 0;
+  while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
+    (*inputs).emplace_back(input_slots);
+    ++num_batches;
+    if (!FLAGS_test_all_data) return;
  }
+  LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+}

-  // one batch starts
-  // data --
-  reader.NextBatch(&input_slots, FLAGS_batch_size);
-  input_slots_all.emplace_back(input_slots);
-  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
+// Easy for profiling independently.
+TEST(Analyzer_Text_Classification, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  // Get output
-  LOG(INFO) << "get outputs " << output_slots.size();
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-  for (auto &output : output_slots) {
-    LOG(INFO) << "output.shape: " << to_string(output.shape);
-    // no lod ?
-    CHECK_EQ(output.lod.size(), 0UL);
-    LOG(INFO) << "output.dtype: " << output.dtype;
-    std::stringstream ss;
-    for (int i = 0; i < 5; i++) {
-      ss << static_cast<float *>(output.data.data())[i] << " ";
+  if (FLAGS_num_threads == 1) {
+    // Get output
+    LOG(INFO) << "get outputs " << outputs.size();
+    for (auto &output : outputs) {
+      LOG(INFO) << "output.shape: " << to_string(output.shape);
+      // no lod ?
+      CHECK_EQ(output.lod.size(), 0UL);
+      LOG(INFO) << "output.dtype: " << output.dtype;
+      std::stringstream ss;
+      for (int i = 0; i < 5; i++) {
+        ss << static_cast<float *>(output.data.data())[i] << " ";
+      }
+      LOG(INFO) << "output.data summary: " << ss.str();
+      // one batch ends
    }
-    LOG(INFO) << "output.data summary: " << ss.str();
-    // one batch ends
  }
 }

-TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+// Compare the results obtained with NativeConfig and with AnalysisConfig on the same inputs.
+TEST(Analyzer_Text_Classification, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // stops after the first batch unless --test_all_data is set
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}

 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@ -49,84 +49,83 @@ Record ProcessALine(const std::string &line) {
  return record;
 }

-/*
- * Use the native and analysis fluid engine to inference the demo.
- * ocr, mobilenet and se_resnext50
- */
-void TestVisualPrediction(bool use_mkldnn) {
-  std::unique_ptr<PaddlePredictor> predictor;
-  AnalysisConfig cfg;
-  cfg.param_file = FLAGS_infer_model + "/__params__";
-  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  cfg.use_gpu = false;
-  cfg._use_mkldnn = use_mkldnn;
-  cfg.device = 0;
-  cfg.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->param_file = FLAGS_infer_model + "/__params__";
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->enable_ir_optim = true;
+  cfg->specify_input_name = true;
  // TODO(TJ): fix fusion gru
-  cfg.ir_passes.push_back("fc_gru_fuse_pass");
+  cfg->ir_passes.push_back("fc_gru_fuse_pass");
 #ifdef PADDLE_WITH_MKLDNN
+  cfg->_use_mkldnn = true;
  // disable mkldnn fuse since it should have some bugs
-  cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
+  cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
 #endif
-  predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+}

-  // Only have single batch of data.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
  std::string line;
  std::ifstream file(FLAGS_infer_data);
  std::getline(file, line);
  auto record = ProcessALine(line);
-  file.close();

-  // Inference.
  PaddleTensor input;
  input.shape = record.shape;
-  input.data =
-      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  input.dtype = PaddleDType::FLOAT32;
+  size_t input_size = record.data.size() * sizeof(float);
+  input.data.Resize(input_size);
+  memcpy(input.data.data(), record.data.data(), input_size);
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}

-  std::vector<PaddleTensor> outputs_slots;
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    predictor->Run({input}, &outputs_slots);
-  }
-  PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
-            timer.toc() / FLAGS_repeat);
-
-  VLOG(3) << "output.size " << outputs_slots.size();
-
-  // run native as reference
-  auto ref_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  std::vector<PaddleTensor> ref_outputs_slots;
-  ref_predictor->Run({input}, &ref_outputs_slots);
-  CompareResult(outputs_slots, ref_outputs_slots);
-  // print what are fused
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
-  auto &fuse_statis = analysis_predictor->analysis_argument()
-                          .Get<std::unordered_map<std::string, int>>(
-                              framework::ir::kFuseStatisAttr);
-  for (auto &item : fuse_statis) {
-    LOG(INFO) << "fused " << item.first << " " << item.second;
-  }
-  int num_ops = 0;
-  for (auto &node :
-       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-    if (node->IsFunction()) {
-      ++num_ops;
+// Easy for profiling independently.
+//  ocr, mobilenet and se_resnext50
+TEST(Analyzer_vis, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    const float ocr_result_data[] = {
+        5.273636460856323538e-08, 3.296741795111302054e-07,
+        1.873261190610264748e-08, 3.403730275408634043e-08,
+        3.383312474625199684e-08};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *result = static_cast<float *>(outputs[0].data.data());
+    for (size_t i = 0; i < std::min(5UL, size); i++) {
+      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
    }
  }
-  LOG(INFO) << "has num ops: " << num_ops;
 }

-TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, analysis_mkldnn) {
-  TestVisualPrediction(/*use_mkldnn*/ true);
+// Runs GetFuseStatis for the vis model; neither the returned map nor num_ops is asserted on here.
+TEST(Analyzer_vis, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  int num_ops;  // passed to GetFuseStatis as an out-pointer; not read afterwards
+  GetFuseStatis(cfg, &num_ops);
+}
+
+// Compare the results obtained with NativeConfig and with AnalysisConfig on the same inputs.
+TEST(Analyzer_vis, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // a single record built from the first line of --infer_data
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }
-#endif

 }  // namespace analysis
 }  // namespace inference
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@ -15,6 +15,7 @@
 #pragma once

 #include <gtest/gtest.h>
+#include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
@ -28,17 +29,18 @@
 // Command-line flags read by the tester helpers in this header (e.g.
 // FLAGS_batch_size, FLAGS_repeat, FLAGS_use_analysis).
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+DEFINE_bool(use_analysis, true,  // default argument of TestPrediction
+            "Running the inference program in analysis mode.");

 namespace paddle {
 namespace inference {

 void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<PaddleTensor> &ref_outputs) {
-  EXPECT_GT(outputs.size(), 0);
+  EXPECT_GT(outputs.size(), 0UL);
  EXPECT_EQ(outputs.size(), ref_outputs.size());
  for (size_t i = 0; i < outputs.size(); i++) {
    auto &out = outputs[i];
@ -72,14 +74,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
  }
 }

+// Creates the predictor used by the testers: the analysis engine when
+// `use_analysis` is true (the default), the native engine otherwise. The same
+// `config` object is handed to whichever factory is selected.
+std::unique_ptr<PaddlePredictor> GetPrediction(AnalysisConfig config,
+                                               bool use_analysis = true) {
+  if (!use_analysis) {
+    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+        config);
+  }
+  return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+      config);
+}
+
+// Returns the number of elements described by `out.shape`, i.e. the product
+// of all its dimensions (1 for an empty shape).
+// The product is accumulated in size_t rather than int so that large tensors
+// do not overflow a signed int before the result is widened.
+size_t GetSize(const PaddleTensor &out) {
+  return std::accumulate(out.shape.begin(), out.shape.end(),
+                         static_cast<size_t>(1),
+                         [](size_t acc, size_t dim) { return acc * dim; });
+}
+
+// Builds a predictor from `config` via GetPrediction, logs every entry of its
+// fuse statistics, stores the number of function nodes found in `main_dfg`
+// into `*num_ops`, and returns a copy of the fuse statistics map.
+// If the predictor cannot be viewed as an AnalysisPredictor, a test failure
+// is recorded, `*num_ops` is set to 0 and an empty map is returned.
+std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
+                                                   int *num_ops) {
+  auto predictor = GetPrediction(config);
+  AnalysisPredictor *analysis_predictor =
+      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  if (analysis_predictor == nullptr) {
+    // dynamic_cast yields nullptr for a null or mismatching predictor; report
+    // it instead of dereferencing a null pointer below.
+    ADD_FAILURE() << "predictor is not an AnalysisPredictor";
+    *num_ops = 0;
+    return std::unordered_map<std::string, int>();
+  }
+  auto &fuse_statis = analysis_predictor->analysis_argument()
+                          .Get<std::unordered_map<std::string, int>>(
+                              framework::ir::kFuseStatisAttr);
+  for (const auto &item : fuse_statis) {
+    LOG(INFO) << "fused " << item.first << " " << item.second;
+  }
+  // Count only the nodes that report themselves as functions.
+  int num = 0;
+  for (auto &node :
+       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+    if (node->IsFunction()) {
+      ++num;
+    }
+  }
+  *num_ops = num;
+  return fuse_statis;
+}
+
 void TestOneThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
-    std::vector<PaddleTensor> *outputs) {
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+  auto predictor = GetPrediction(config, use_analysis);
  Timer timer;
  timer.tic();
  for (int i = 0; i < num_times; i++) {
@ -93,7 +131,8 @@ void TestOneThreadPrediction(

 void TestMultiThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
-    std::vector<PaddleTensor> *outputs, int num_threads) {
+    std::vector<PaddleTensor> *outputs, int num_threads,
+    bool use_analysis = true) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
@ -101,9 +140,7 @@ void TestMultiThreadPrediction(
  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
  // because AttentionLSTM's hard-coded node id will be damaged.
  for (int tid = 0; tid < num_threads; ++tid) {
-    predictors.emplace_back(
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-            config));
+    predictors.emplace_back(GetPrediction(config, use_analysis));
  }
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
@ -129,13 +166,25 @@ void TestMultiThreadPrediction(

 // Runs inference on `inputs`, dispatching to TestOneThreadPrediction when
 // num_threads == 1 and to TestMultiThreadPrediction otherwise. `use_analysis`
 // is logged and forwarded to the selected runner; when the caller omits it,
 // the value of FLAGS_use_analysis is used.
 void TestPrediction(AnalysisConfig config,
                    const std::vector<std::vector<PaddleTensor>> inputs,
-                    std::vector<PaddleTensor> *outputs, int num_threads) {
+                    std::vector<PaddleTensor> *outputs, int num_threads,
+                    bool use_analysis = FLAGS_use_analysis) {
+  LOG(INFO) << "use_analysis: " << use_analysis;
  if (num_threads == 1) {
-    TestOneThreadPrediction(config, inputs, outputs);
+    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
-    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads,
+                              use_analysis);
  }
 }

+// Runs the same inputs single-threaded, first with use_analysis=false and then
+// with use_analysis=true, and checks the analysis outputs against the native
+// ones with CompareResult.
+void CompareNativeAndAnalysis(
+    AnalysisConfig config,
+    const std::vector<std::vector<PaddleTensor>> inputs) {
+  std::vector<PaddleTensor> reference_outputs;
+  std::vector<PaddleTensor> optimized_outputs;
+
+  TestOneThreadPrediction(config, inputs, &reference_outputs,
+                          /*use_analysis=*/false);
+  TestOneThreadPrediction(config, inputs, &optimized_outputs,
+                          /*use_analysis=*/true);
+
+  CompareResult(optimized_outputs, reference_outputs);
+}
+
 }  // namespace inference
 }  // namespace paddle