From 98fb8e58fd4fb91423d414d67f2a2684b6841020 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 25 May 2018 11:57:44 +0800
Subject: [PATCH 01/19] test infer nlp

---
 paddle/fluid/inference/io.cc                  |  2 +-
 .../fluid/inference/tests/book/CMakeLists.txt |  1 +
 .../tests/book/test_inference_nlp.cc          | 85 +++++++++++++++++++
 paddle/fluid/inference/tests/test_helper.h    |  3 +
 4 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/inference/tests/book/test_inference_nlp.cc

diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 65db7c7b50..98780b6881 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -117,7 +117,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
   std::string program_desc_str;
   VLOG(3) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, &program_desc_str);
-
+  // LOG(INFO) << program_desc_str;
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
 
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index dbb81462b8..90357f99d1 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -35,6 +35,7 @@ inference_test(image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
+inference_test(nlp)
 #inference_test(rnn_encoder_decoder)
 #inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
new file mode 100644
index 0000000000..0d6d0adfb2
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+TEST(inference, understand_sentiment) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor words; + + paddle::framework::LoD lod{{0, 83}}; + int64_t word_dict_len = 198392; + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + /* + std::vector srcdata{ + 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, + 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, + 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, + 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, + 198391, + 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, + 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, + 198391, + 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, + 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, + 42468, + 4376, 10251, 10760}; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = + words.mutable_data({static_cast(srcdata.size()), 1}, + paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); + */ + LOG(INFO) << "number of input size:" << words.numel(); + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + int repeat = 100; + // Run inference on CPU + TestInference(dirname, cpu_feeds, + cpu_fetchs1, repeat); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be6..1f5551567c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -182,6 +182,9 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); + // std::string binary_str; + // inference_program->Proto()->SerializeToString(&binary_str); + // LOG(INFO) << binary_str; if (use_mkldnn) { EnableMKLDNN(inference_program); } From 602e28bf1c30cd72e7378d6dc1071423086bdc73 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 14:38:01 +0800 Subject: [PATCH 02/19] use the actual data --- .../tests/book/test_inference_nlp.cc | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 0d6d0adfb2..27bdd5528e 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -29,32 +31,28 @@ TEST(inference, understand_sentiment) { // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc paddle::framework::LoDTensor words; - - paddle::framework::LoD lod{{0, 83}}; - int64_t word_dict_len = 198392; - SetupLoDTensor(&words, lod, static_cast(0), - static_cast(word_dict_len - 1)); /* - std::vector srcdata{ - 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, - 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, - 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, - 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, - 198391, - 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, - 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, - 198391, - 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, - 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, - 42468, - 4376, 10251, 10760}; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = - words.mutable_data({static_cast(srcdata.size()), 1}, - paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - */ + paddle::framework::LoD lod{{0, 83}}; + int64_t word_dict_len = 198392; + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + */ + std::vector srcdata{ + 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, + 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, + 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, + 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, 198391, + 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, + 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, 198391, + 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, + 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, + 4376, 10251, 10760}; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(srcdata.size()), 1}, paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); + LOG(INFO) << "number of input size:" << words.numel(); std::vector cpu_feeds; cpu_feeds.push_back(&words); From ce20dfa236a0bf874d8580a7861b7a85dffdf74c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 15:17:06 +0800 Subject: [PATCH 03/19] enable more choices --- .../tests/book/test_inference_nlp.cc | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 27bdd5528e..c942b43f17 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,6 +19,10 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(repeat, 100, "Running the inference program repeat times"); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); +DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); +DEFINE_bool(prepare_context, true, "Prepare Context before executor"); TEST(inference, understand_sentiment) { if (FLAGS_dirname.empty()) { @@ -61,10 +65,29 @@ TEST(inference, understand_sentiment) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - int repeat = 100; // Run inference on CPU - TestInference(dirname, cpu_feeds, - cpu_fetchs1, repeat); + const bool model_combined = false; + if (FLAGS_prepare_vars) { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } else { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } LOG(INFO) << output1.lod(); LOG(INFO) << output1.dims(); From 400f5e7c3ce21ba63bee62a599a82c4a0bbc299d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 21:08:49 +0800 Subject: [PATCH 04/19] add threads test --- .../tests/book/test_inference_nlp.cc | 135 +++++++++--------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index c942b43f17..ca02e38ede 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include // NOLINT #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -31,76 +32,74 @@ TEST(inference, understand_sentiment) { LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; std::string dirname = FLAGS_dirname; - - // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - paddle::framework::LoDTensor words; - /* - paddle::framework::LoD lod{{0, 83}}; - int64_t word_dict_len = 198392; - SetupLoDTensor(&words, lod, static_cast(0), - static_cast(word_dict_len - 1)); - */ - std::vector srcdata{ - 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, - 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, - 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, - 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, 198391, - 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, - 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, 198391, - 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, - 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, - 4376, 10251, 10760}; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = words.mutable_data( - {static_cast(srcdata.size()), 1}, paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - - LOG(INFO) << "number of input size:" << words.numel(); - std::vector cpu_feeds; - cpu_feeds.push_back(&words); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU const bool model_combined = false; - if (FLAGS_prepare_vars) { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } else { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } - LOG(INFO) << output1.lod(); - LOG(INFO) << output1.dims(); + int total_work = 100; + int num_threads = 10; + int work_per_thread = total_work / num_threads; + std::vector> infer_threads; + for (int i = 0; i < num_threads; ++i) { + infer_threads.emplace_back(new std::thread([&, i]() { + for (int j = 0; j < work_per_thread; ++j) { + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor words; + /* + paddle::framework::LoD lod{{0, 83}}; + int64_t word_dict_len = 198392; + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + */ + std::vector srcdata{ + 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, + 1550, 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, + 396, 6463, 6463, 1550, 198391, 6463, 22564, 1612, 291, + 68, 164, 784, 784, 1550, 198391, 6463, 13659, 3362, + 42468, 6189, 2209, 198391, 6463, 2209, 2209, 198391, 6463, + 2209, 1062, 3029, 1831, 3029, 1065, 2281, 100, 11216, + 1110, 56, 10869, 9811, 100, 198391, 6463, 100, 9280, + 100, 288, 40031, 1680, 1335, 100, 1550, 9280, 7265, + 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, 4376, + 10251, 10760}; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(srcdata.size()), 1}, + paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); -#ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); + LOG(INFO) << "number of input size:" << words.numel(); + std::vector cpu_feeds; + cpu_feeds.push_back(&words); - // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.lod(); - LOG(INFO) << output2.dims(); + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); - CheckError(output1, output2); -#endif + // Run inference on CPU + if (FLAGS_prepare_vars) { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } else { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + } + })); + } } From c00843f4e8860d7abff0077168942fa99ef37154 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 29 May 2018 17:43:01 +0800 Subject: [PATCH 05/19] enable multi-threads --- .../tests/book/test_inference_nlp.cc | 12 ++++++++ paddle/fluid/inference/tests/test_helper.h | 29 ------------------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index ca02e38ede..6ff8a18cdb 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -25,6 +25,12 @@ DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_bool(prepare_context, true, "Prepare Context before executor"); +inline double get_current_ms() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; +} + TEST(inference, understand_sentiment) { if (FLAGS_dirname.empty()) { LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; @@ -102,4 +108,10 @@ TEST(inference, understand_sentiment) { } })); } + auto start_ms = get_current_ms(); + for 
(int i = 0; i < num_threads; ++i) { + infer_threads[i]->join(); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "total: " << stop_ms - start_ms << " ms"; } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 1f5551567c..dd3a7a584a 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -156,27 +156,10 @@ void TestInference(const std::string& dirname, auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); - // Profile the performance - paddle::platform::ProfilerState state; - if (paddle::platform::is_cpu_place(place)) { - state = paddle::platform::ProfilerState::kCPU; - } else { -#ifdef PADDLE_WITH_CUDA - state = paddle::platform::ProfilerState::kAll; - // The default device_id of paddle::platform::CUDAPlace is 0. - // Users can get the device_id using: - // int device_id = place.GetDeviceId(); - paddle::platform::SetDeviceId(0); -#else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); -#endif - } - // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; // Enable the profiler - paddle::platform::EnableProfiler(state); { paddle::platform::RecordEvent record_event( "init_program", @@ -189,10 +172,6 @@ void TestInference(const std::string& dirname, EnableMKLDNN(inference_program); } } - // Disable the profiler and print the timing information - paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, - "load_program_profiler"); - paddle::platform::ResetProfiler(); // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = @@ -233,9 +212,6 @@ void TestInference(const std::string& dirname, true, CreateVars); } - // Enable the profiler - paddle::platform::EnableProfiler(state); - // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { paddle::platform::RecordEvent record_event( @@ -252,11 +228,6 @@ void TestInference(const std::string& dirname, CreateVars); } } - - // Disable the profiler and print the timing information - paddle::platform::DisableProfiler( - paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); - paddle::platform::ResetProfiler(); } delete scope; From 77599415ba1b93715fa0626e147865c088970ee6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 30 May 2018 12:15:10 +0800 Subject: [PATCH 06/19] enable read dataset --- .../tests/book/test_inference_nlp.cc | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 6ff8a18cdb..95cdeb4ad1 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -14,7 +14,12 @@ limitations under the License. 
*/ #include #include +#include +#include +#include +#include #include // NOLINT +#include #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -31,16 +36,37 @@ inline double get_current_ms() { return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } +void read_data( + std::vector>* out, + const std::string& filename = "/home/tangjian/paddle-tj/out.ids.txt") { + using namespace std; // NOLINT + fstream fin(filename); + string line; + out->clear(); + while (getline(fin, line)) { + istringstream iss(line); + vector ids; + string field; + while (getline(iss, field, ' ')) { + ids.push_back(stoi(field)); + } + out->push_back(ids); + } +} + TEST(inference, understand_sentiment) { if (FLAGS_dirname.empty()) { LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - + std::vector> inputdatas; + read_data(&inputdatas); + LOG(INFO) << "---------- dataset size: " << inputdatas.size(); LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; std::string dirname = FLAGS_dirname; + const bool model_combined = false; - int total_work = 100; - int num_threads = 10; + int total_work = 10; + int num_threads = 2; int work_per_thread = total_work / num_threads; std::vector> infer_threads; for (int i = 0; i < num_threads; ++i) { From 4d11c8e9c64f65b6701edb1ba44cefdff0423acb Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 15:11:46 +0800 Subject: [PATCH 07/19] retest single thread --- .../tests/book/test_inference_nlp.cc | 224 +++++++++++------- 1 file changed, 143 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 95cdeb4ad1..e216e9dbe6 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -30,16 +30,19 @@ DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_bool(prepare_context, true, "Prepare Context before executor"); +DEFINE_int32(num_threads, 1, "Number of threads should be used"); + inline double get_current_ms() { struct timeval time; gettimeofday(&time, NULL); return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } -void read_data( - std::vector>* out, - const std::string& filename = "/home/tangjian/paddle-tj/out.ids.txt") { +// return size of total words +size_t read_datasets(std::vector* out, + const std::string& filename) { using namespace std; // NOLINT + size_t sz = 0; fstream fin(filename); string line; out->clear(); @@ -50,94 +53,153 @@ void read_data( while (getline(iss, field, ' ')) { ids.push_back(stoi(field)); } - out->push_back(ids); + if (ids.size() >= 1024 || out->size() >= 100) { + continue; + } + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, ids.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(ids.size()), 1}, paddle::platform::CPUPlace()); + memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t)); + out->emplace_back(words); + sz += ids.size(); } + return sz; +} + +void test_multi_threads() { + /* + size_t jobs_per_thread = std::min(inputdatas.size() / FLAGS_num_threads, + inputdatas.size()); + std::vector workers(FLAGS_num_threads, jobs_per_thread); + workers[FLAGS_num_threads - 1] += inputdatas.size() % FLAGS_num_threads; + + std::vector> infer_threads; + + for (size_t i = 0; i < workers.size(); ++i) { + infer_threads.emplace_back(new std::thread([&, i]() { + size_t 
start = i * jobs_per_thread; + for (size_t j = start; j < start + workers[i]; ++j ) { + // 0. Call `paddle::framework::InitDevices()` initialize all the + devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor words; + auto& srcdata = inputdatas[j]; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(srcdata.size()), 1}, + paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); + + LOG(INFO) << "thread id: " << i << ", words size:" << words.numel(); + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + if (FLAGS_prepare_vars) { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } else { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } + //LOG(INFO) << output1.lod(); + //LOG(INFO) << output1.dims(); + } + })); + } + auto start_ms = get_current_ms(); + for (int i = 0; i < FLAGS_num_threads; ++i) { + infer_threads[i]->join(); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "total: " << stop_ms - start_ms << " ms";*/ } -TEST(inference, understand_sentiment) { +TEST(inference, nlp) { if (FLAGS_dirname.empty()) { LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - std::vector> inputdatas; - read_data(&inputdatas); - LOG(INFO) << "---------- dataset size: " << inputdatas.size(); LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; std::string dirname = FLAGS_dirname; + std::vector datasets; + size_t num_total_words = + read_datasets(&datasets, "/home/tangjian/paddle-tj/out.ids.txt"); + LOG(INFO) << "Number of dataset samples(seq len<1024): " << datasets.size(); + LOG(INFO) << "Total number of words: " << num_total_words; + const bool model_combined = false; - int total_work = 10; - int num_threads = 2; - int work_per_thread = total_work / num_threads; - std::vector> infer_threads; - for (int i = 0; i < num_threads; ++i) { - infer_threads.emplace_back(new std::thread([&, i]() { - for (int j = 0; j < work_per_thread; ++j) { - // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - paddle::framework::LoDTensor words; - /* - paddle::framework::LoD lod{{0, 83}}; - int64_t word_dict_len = 198392; - SetupLoDTensor(&words, lod, static_cast(0), - static_cast(word_dict_len - 1)); - */ - std::vector srcdata{ - 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, - 1550, 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, - 396, 6463, 6463, 1550, 198391, 6463, 22564, 1612, 291, - 68, 164, 784, 784, 1550, 198391, 6463, 13659, 3362, - 42468, 6189, 2209, 198391, 6463, 2209, 2209, 198391, 6463, - 2209, 1062, 3029, 1831, 3029, 1065, 2281, 100, 11216, - 1110, 56, 10869, 9811, 100, 198391, 6463, 100, 9280, - 100, 288, 40031, 1680, 1335, 100, 1550, 9280, 7265, - 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, 4376, - 10251, 10760}; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = words.mutable_data( - {static_cast(srcdata.size()), 1}, - paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - - LOG(INFO) << "number of input size:" << words.numel(); - std::vector cpu_feeds; - cpu_feeds.push_back(&words); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - if (FLAGS_prepare_vars) { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } else { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } - LOG(INFO) << output1.lod(); - LOG(INFO) << output1.dims(); - } - })); + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // 1. Define place, executor, scope + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. 
Initialize the inference_program and load parameters + std::unique_ptr inference_program; + inference_program = InitProgram(&executor, scope, dirname, model_combined); + if (FLAGS_use_mkldnn) { + EnableMKLDNN(inference_program); } - auto start_ms = get_current_ms(); - for (int i = 0; i < num_threads; ++i) { - infer_threads[i]->join(); + + if (FLAGS_num_threads > 1) { + test_multi_threads(); + } else { + if (FLAGS_prepare_vars) { + executor.CreateVariables(*inference_program, scope, 0); + } + // always prepare context and burning first time + std::unique_ptr ctx; + ctx = executor.Prepare(*inference_program, 0); + + // preapre fetch + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + // prepare feed + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + std::map feed_targets; + + // for data and run + auto start_ms = get_current_ms(); + for (size_t i = 0; i < datasets.size(); ++i) { + feed_targets[feed_target_names[0]] = &(datasets[i]); + executor.RunPreparedContext(ctx.get(), scope, &feed_targets, + &fetch_targets, !FLAGS_prepare_vars); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 + << " min, avg time per seq: " + << (stop_ms - start_ms) / datasets.size() << " ms"; } - auto stop_ms = get_current_ms(); - LOG(INFO) << "total: " << stop_ms - start_ms << " ms"; + delete scope; } From d13dd3b6a7ee81d4c106035ec0bad2c581ea795c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 16:04:47 +0800 Subject: [PATCH 08/19] revert profiling --- paddle/fluid/inference/tests/test_helper.h | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index dd3a7a584a..1f5551567c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -156,10 +156,27 @@ void TestInference(const std::string& dirname, auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kAll; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; // Enable the profiler + paddle::platform::EnableProfiler(state); { paddle::platform::RecordEvent record_event( "init_program", @@ -172,6 +189,10 @@ void TestInference(const std::string& dirname, EnableMKLDNN(inference_program); } } + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "load_program_profiler"); + paddle::platform::ResetProfiler(); // 3. 
Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = @@ -212,6 +233,9 @@ void TestInference(const std::string& dirname, true, CreateVars); } + // Enable the profiler + paddle::platform::EnableProfiler(state); + // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { paddle::platform::RecordEvent record_event( @@ -228,6 +252,11 @@ void TestInference(const std::string& dirname, CreateVars); } } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); + paddle::platform::ResetProfiler(); } delete scope; From 708bec2e56c6a856f628ad8b650b0bf04a3df975 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 16:33:54 +0800 Subject: [PATCH 09/19] add test --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index e216e9dbe6..990d45964e 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -200,6 +200,14 @@ TEST(inference, nlp) { LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 << " min, avg time per seq: " << (stop_ms - start_ms) / datasets.size() << " ms"; + { // just for test + auto* scope = new paddle::framework::Scope(); + paddle::framework::LoDTensor outtensor; + TestInference( + dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, + false); + delete scope; + } } delete scope; } From 733718c3e724fdd84355010e76ddd17e5b60ef2c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 19:12:23 +0800 Subject: [PATCH 10/19] remove the ugly test --- .../inference/tests/book/test_inference_nlp.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 990d45964e..5241661fb3 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -53,7 +53,7 @@ size_t read_datasets(std::vector* out, while (getline(iss, field, ' ')) { ids.push_back(stoi(field)); } - if (ids.size() >= 1024 || out->size() >= 100) { + if (ids.size() >= 1024 ) { continue; } @@ -200,14 +200,14 @@ TEST(inference, nlp) { LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 << " min, avg time per seq: " << (stop_ms - start_ms) / datasets.size() << " ms"; - { // just for test - auto* scope = new paddle::framework::Scope(); - paddle::framework::LoDTensor outtensor; - TestInference( - dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, - false); - delete scope; - } +// { // just for test +// auto* scope = new paddle::framework::Scope(); +// paddle::framework::LoDTensor outtensor; +// TestInference( +// dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, +// false); +// delete scope; +// } } delete scope; } From 5387562576de020a35f864a07f14802b68ee398d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 14:07:41 +0800 Subject: [PATCH 11/19] add multi-thread test --- .../tests/book/test_inference_nlp.cc | 157 ++++++++---------- 1 file changed, 72 insertions(+), 85 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc 
b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5241661fb3..4e92d6a17b 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -15,11 +15,7 @@ limitations under the License. */ #include #include #include -#include -#include -#include #include // NOLINT -#include #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -41,19 +37,18 @@ inline double get_current_ms() { // return size of total words size_t read_datasets(std::vector* out, const std::string& filename) { - using namespace std; // NOLINT size_t sz = 0; - fstream fin(filename); - string line; + std::fstream fin(filename); + std::string line; out->clear(); while (getline(fin, line)) { - istringstream iss(line); - vector ids; - string field; + std::istringstream iss(line); + std::vector ids; + std::string field; while (getline(iss, field, ' ')) { ids.push_back(stoi(field)); } - if (ids.size() >= 1024 ) { + if (ids.size() >= 1024) { continue; } @@ -69,72 +64,61 @@ size_t read_datasets(std::vector* out, return sz; } -void test_multi_threads() { - /* - size_t jobs_per_thread = std::min(inputdatas.size() / FLAGS_num_threads, - inputdatas.size()); - std::vector workers(FLAGS_num_threads, jobs_per_thread); - workers[FLAGS_num_threads - 1] += inputdatas.size() % FLAGS_num_threads; - - std::vector> infer_threads; - - for (size_t i = 0; i < workers.size(); ++i) { - infer_threads.emplace_back(new std::thread([&, i]() { - size_t start = i * jobs_per_thread; - for (size_t j = start; j < start + workers[i]; ++j ) { - // 0. Call `paddle::framework::InitDevices()` initialize all the - devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - paddle::framework::LoDTensor words; - auto& srcdata = inputdatas[j]; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = words.mutable_data( - {static_cast(srcdata.size()), 1}, - paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - - LOG(INFO) << "thread id: " << i << ", words size:" << words.numel(); - std::vector cpu_feeds; - cpu_feeds.push_back(&words); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - if (FLAGS_prepare_vars) { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } else { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } - //LOG(INFO) << output1.lod(); - //LOG(INFO) << output1.dims(); - } - })); - } - auto start_ms = get_current_ms(); - for (int i = 0; i < FLAGS_num_threads; ++i) { - infer_threads[i]->join(); +void ThreadRunInfer( + const int tid, paddle::framework::Executor* executor, + paddle::framework::Scope* scope, + const std::unique_ptr& inference_program, + const std::vector>& jobs) { + auto copy_program = std::unique_ptr( + new paddle::framework::ProgramDesc(*inference_program)); + std::string feed_holder_name = "feed_" + paddle::string::to_string(tid); + std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid); + 
copy_program->SetFeedHolderName(feed_holder_name); + copy_program->SetFetchHolderName(fetch_holder_name); + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + copy_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + copy_program->GetFetchTargetNames(); + + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + std::map feed_targets; + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + + auto& inputs = jobs[tid]; + auto start_ms = get_current_ms(); + for (size_t i = 0; i < inputs.size(); ++i) { + feed_targets[feed_target_names[0]] = inputs[i]; + executor->Run(*copy_program, scope, &feed_targets, &fetch_targets, true, + true, feed_holder_name, fetch_holder_name); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() + << " samples, avg time per sample: " + + << (stop_ms - start_ms) / inputs.size() << " ms"; +} + +void bcast_datasets( + const std::vector& datasets, + std::vector>* jobs, + const int num_threads) { + size_t s = 0; + jobs->resize(num_threads); + while (s < datasets.size()) { + for (auto it = jobs->begin(); it != jobs->end(); it++) { + it->emplace_back(&datasets[s]); + s++; + if (s >= datasets.size()) { + break; + } } - auto stop_ms = get_current_ms(); - LOG(INFO) << "total: " << stop_ms - start_ms << " ms";*/ + } } TEST(inference, nlp) { @@ -166,7 +150,18 @@ TEST(inference, nlp) { } if (FLAGS_num_threads > 1) { - test_multi_threads(); + std::vector> jobs; + bcast_datasets(datasets, &jobs, FLAGS_num_threads); + std::vector> threads; + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads.emplace_back(new std::thread(ThreadRunInfer, i, &executor, scope, + std::ref(inference_program), + std::ref(jobs))); + } + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads[i]->join(); + } + } else { if (FLAGS_prepare_vars) { executor.CreateVariables(*inference_program, scope, 0); @@ -200,14 +195,6 @@ TEST(inference, nlp) { LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 << " min, avg time per seq: " << (stop_ms - start_ms) / datasets.size() << " ms"; -// { // just for test -// auto* scope = new paddle::framework::Scope(); -// paddle::framework::LoDTensor outtensor; -// TestInference( -// dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, -// false); -// delete scope; -// } } delete scope; } From a4822ed897cebe6a27bd61d82c5a1b43022d3760 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 14:37:35 +0800 Subject: [PATCH 12/19] add thread setting --- .../tests/book/test_inference_nlp.cc | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 4e92d6a17b..fba64efece 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,6 +19,10 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#ifdef PADDLE_WITH_MKLML +#include +#include +#endif DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); @@ -149,6 +153,14 @@ TEST(inference, nlp) { EnableMKLDNN(inference_program); } +#ifdef PADDLE_WITH_MKLML + // only use 1 core per thread + omp_set_dynamic(0); + omp_set_num_threads(1); + mkl_set_num_threads(1); +#endif + + double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { std::vector> jobs; bcast_datasets(datasets, &jobs, FLAGS_num_threads); @@ -158,9 +170,11 @@ TEST(inference, nlp) { std::ref(inference_program), std::ref(jobs))); } + start_ms = get_current_ms(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads[i]->join(); } + stop_ms = get_current_ms(); } else { if (FLAGS_prepare_vars) { @@ -185,16 +199,18 @@ TEST(inference, nlp) { std::map feed_targets; // for data and run - auto start_ms = get_current_ms(); + start_ms = get_current_ms(); for (size_t i = 0; i < datasets.size(); ++i) { feed_targets[feed_target_names[0]] = &(datasets[i]); executor.RunPreparedContext(ctx.get(), scope, &feed_targets, &fetch_targets, !FLAGS_prepare_vars); } - auto stop_ms = get_current_ms(); - LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 - << " min, avg time per seq: " - << (stop_ms - start_ms) / datasets.size() << " ms"; + stop_ms = get_current_ms(); } + + LOG(INFO) << "Total inference time with " << FLAGS_num_threads + << " threads : " << (stop_ms - start_ms) / 1000.0 + << " sec, avg time per seq: " + << (stop_ms - start_ms) / datasets.size() << " ms"; delete scope; } From 4a24c238c15212dd921bd0199beca6fc145cd62a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 15:43:34 +0800 Subject: [PATCH 13/19] refine code --- paddle/fluid/inference/io.cc | 2 +- .../tests/book/test_inference_nlp.cc | 86 +++++++++---------- paddle/fluid/inference/tests/test_helper.h | 3 - 3 files changed, 42 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 98780b6881..65db7c7b50 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -117,7 +117,7 @@ std::unique_ptr Load(framework::Executor* executor, std::string program_desc_str; VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, &program_desc_str); - // LOG(INFO) << program_desc_str; + std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index fba64efece..962358d761 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -24,23 +24,22 @@ limitations under the License. 
*/ #include #endif -DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(modelpath, "", "Directory of the inference model."); +DEFINE_string(datafile, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); -DEFINE_bool(prepare_context, true, "Prepare Context before executor"); - DEFINE_int32(num_threads, 1, "Number of threads should be used"); -inline double get_current_ms() { +inline double GetCurrentMs() { struct timeval time; gettimeofday(&time, NULL); return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } // return size of total words -size_t read_datasets(std::vector* out, - const std::string& filename) { +size_t LoadData(std::vector* out, + const std::string& filename) { size_t sz = 0; std::fstream fin(filename); std::string line; @@ -68,6 +67,23 @@ size_t read_datasets(std::vector* out, return sz; } +void SplitData( + const std::vector& datasets, + std::vector>* jobs, + const int num_threads) { + size_t s = 0; + jobs->resize(num_threads); + while (s < datasets.size()) { + for (auto it = jobs->begin(); it != jobs->end(); it++) { + it->emplace_back(&datasets[s]); + s++; + if (s >= datasets.size()) { + break; + } + } + } +} + void ThreadRunInfer( const int tid, paddle::framework::Executor* executor, paddle::framework::Scope* scope, @@ -80,7 +96,6 @@ void ThreadRunInfer( copy_program->SetFeedHolderName(feed_holder_name); copy_program->SetFetchHolderName(fetch_holder_name); - // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = copy_program->GetFeedTargetNames(); const std::vector& fetch_target_names = @@ -95,51 +110,32 @@ void ThreadRunInfer( PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); auto& inputs = jobs[tid]; - auto start_ms = get_current_ms(); + auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; executor->Run(*copy_program, scope, &feed_targets, &fetch_targets, true, true, feed_holder_name, fetch_holder_name); } - auto stop_ms = get_current_ms(); + auto stop_ms = GetCurrentMs(); LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() << " samples, avg time per sample: " - << (stop_ms - start_ms) / inputs.size() << " ms"; } -void bcast_datasets( - const std::vector& datasets, - std::vector>* jobs, - const int num_threads) { - size_t s = 0; - jobs->resize(num_threads); - while (s < datasets.size()) { - for (auto it = jobs->begin(); it != jobs->end(); it++) { - it->emplace_back(&datasets[s]); - s++; - if (s >= datasets.size()) { - break; - } - } - } -} - TEST(inference, nlp) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_modelpath.empty() || FLAGS_datafile.empty()) { + LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model " + << "--datafile=path/to/your/data"; } - LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; + LOG(INFO) << "Model Path: " << FLAGS_modelpath; + LOG(INFO) << "Data File: " << FLAGS_datafile; std::vector datasets; - size_t num_total_words = - read_datasets(&datasets, "/home/tangjian/paddle-tj/out.ids.txt"); - LOG(INFO) << "Number of dataset samples(seq len<1024): " << datasets.size(); + size_t num_total_words = LoadData(&datasets, FLAGS_datafile); + LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size(); 
LOG(INFO) << "Total number of words: " << num_total_words; const bool model_combined = false; - // 0. Call `paddle::framework::InitDevices()` initialize all the devices // 1. Define place, executor, scope auto place = paddle::platform::CPUPlace(); @@ -148,13 +144,14 @@ TEST(inference, nlp) { // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; - inference_program = InitProgram(&executor, scope, dirname, model_combined); + inference_program = + InitProgram(&executor, scope, FLAGS_modelpath, model_combined); if (FLAGS_use_mkldnn) { EnableMKLDNN(inference_program); } #ifdef PADDLE_WITH_MKLML - // only use 1 core per thread + // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); mkl_set_num_threads(1); @@ -163,24 +160,23 @@ TEST(inference, nlp) { double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { std::vector> jobs; - bcast_datasets(datasets, &jobs, FLAGS_num_threads); + SplitData(datasets, &jobs, FLAGS_num_threads); std::vector> threads; for (int i = 0; i < FLAGS_num_threads; ++i) { threads.emplace_back(new std::thread(ThreadRunInfer, i, &executor, scope, std::ref(inference_program), std::ref(jobs))); } - start_ms = get_current_ms(); + start_ms = GetCurrentMs(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads[i]->join(); } - stop_ms = get_current_ms(); - + stop_ms = GetCurrentMs(); } else { if (FLAGS_prepare_vars) { executor.CreateVariables(*inference_program, scope, 0); } - // always prepare context and burning first time + // always prepare context std::unique_ptr ctx; ctx = executor.Prepare(*inference_program, 0); @@ -198,14 +194,14 @@ TEST(inference, nlp) { PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); std::map feed_targets; - // for data and run - start_ms = get_current_ms(); + // feed data and run + start_ms = GetCurrentMs(); for (size_t i = 0; i < datasets.size(); ++i) { feed_targets[feed_target_names[0]] = &(datasets[i]); executor.RunPreparedContext(ctx.get(), scope, &feed_targets, &fetch_targets, !FLAGS_prepare_vars); } - stop_ms = get_current_ms(); + stop_ms = GetCurrentMs(); } LOG(INFO) << "Total inference time with " << FLAGS_num_threads diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 1f5551567c..01b8dc0be6 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -182,9 +182,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - // std::string binary_str; - // inference_program->Proto()->SerializeToString(&binary_str); - // LOG(INFO) << binary_str; if (use_mkldnn) { EnableMKLDNN(inference_program); } From 3206bcd9291833518289e73e37513cdbc29e96c7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 16:24:55 +0800 Subject: [PATCH 14/19] refine log and add QPS --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 962358d761..378e1620a0 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -202,11 +202,13 @@ TEST(inference, nlp) { &fetch_targets, !FLAGS_prepare_vars); } stop_ms = GetCurrentMs(); + LOG(INFO) << "Tid: 0, process " << datasets.size() + << 
" samples, avg time per sample: " + << (stop_ms - start_ms) / datasets.size() << " ms"; } LOG(INFO) << "Total inference time with " << FLAGS_num_threads << " threads : " << (stop_ms - start_ms) / 1000.0 - << " sec, avg time per seq: " - << (stop_ms - start_ms) / datasets.size() << " ms"; + << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000); delete scope; } From 7e9f0790e0366ef8db3f48f83635400d4742ad71 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 17:24:54 +0800 Subject: [PATCH 15/19] fix scope in thread --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 378e1620a0..f7788ccbf4 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -91,6 +91,8 @@ void ThreadRunInfer( const std::vector>& jobs) { auto copy_program = std::unique_ptr( new paddle::framework::ProgramDesc(*inference_program)); + auto& sub_scope = scope->NewScope(); + std::string feed_holder_name = "feed_" + paddle::string::to_string(tid); std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid); copy_program->SetFeedHolderName(feed_holder_name); @@ -113,10 +115,11 @@ void ThreadRunInfer( auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; - executor->Run(*copy_program, scope, &feed_targets, &fetch_targets, true, - true, feed_holder_name, fetch_holder_name); + executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets, + true, true, feed_holder_name, fetch_holder_name); } auto stop_ms = GetCurrentMs(); + scope->DeleteScope(&sub_scope); LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() << " samples, avg time per sample: " << (stop_ms - start_ms) / inputs.size() << " ms"; From eaeb76c419fbad9b7d3dd083666f80d84f89f55f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 19:35:49 +0800 Subject: [PATCH 16/19] add some comments --- .../tests/book/test_inference_nlp.cc | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index f7788ccbf4..c4d7b0bbf0 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -37,7 +37,8 @@ inline double GetCurrentMs() { return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } -// return size of total words +// Load the input word index data from file and save into LodTensor. +// Return the size of words. size_t LoadData(std::vector* out, const std::string& filename) { size_t sz = 0; @@ -67,6 +68,8 @@ size_t LoadData(std::vector* out, return sz; } +// Split input data samples into small pieces jobs as balanced as possible, +// according to the number of threads. 
void SplitData( const std::vector& datasets, std::vector>* jobs, @@ -116,7 +119,8 @@ void ThreadRunInfer( for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets, - true, true, feed_holder_name, fetch_holder_name); + true /*create_local_scope*/, true /*create_vars*/, + feed_holder_name, fetch_holder_name); } auto stop_ms = GetCurrentMs(); scope->DeleteScope(&sub_scope); @@ -143,12 +147,13 @@ TEST(inference, nlp) { // 1. Define place, executor, scope auto place = paddle::platform::CPUPlace(); auto executor = paddle::framework::Executor(place); - auto* scope = new paddle::framework::Scope(); + std::unique_ptr scope( + new paddle::framework::Scope()); // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; inference_program = - InitProgram(&executor, scope, FLAGS_modelpath, model_combined); + InitProgram(&executor, scope.get(), FLAGS_modelpath, model_combined); if (FLAGS_use_mkldnn) { EnableMKLDNN(inference_program); } @@ -166,9 +171,9 @@ TEST(inference, nlp) { SplitData(datasets, &jobs, FLAGS_num_threads); std::vector> threads; for (int i = 0; i < FLAGS_num_threads; ++i) { - threads.emplace_back(new std::thread(ThreadRunInfer, i, &executor, scope, - std::ref(inference_program), - std::ref(jobs))); + threads.emplace_back( + new std::thread(ThreadRunInfer, i, &executor, scope.get(), + std::ref(inference_program), std::ref(jobs))); } start_ms = GetCurrentMs(); for (int i = 0; i < FLAGS_num_threads; ++i) { @@ -177,7 +182,7 @@ TEST(inference, nlp) { stop_ms = GetCurrentMs(); } else { if (FLAGS_prepare_vars) { - executor.CreateVariables(*inference_program, scope, 0); + executor.CreateVariables(*inference_program, scope.get(), 0); } // always prepare context std::unique_ptr ctx; @@ -201,7 +206,7 @@ TEST(inference, nlp) { start_ms = GetCurrentMs(); for (size_t i = 0; i < datasets.size(); ++i) { feed_targets[feed_target_names[0]] = &(datasets[i]); - executor.RunPreparedContext(ctx.get(), scope, &feed_targets, + executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets, &fetch_targets, !FLAGS_prepare_vars); } stop_ms = GetCurrentMs(); @@ -209,9 +214,7 @@ TEST(inference, nlp) { << " samples, avg time per sample: " << (stop_ms - start_ms) / datasets.size() << " ms"; } - LOG(INFO) << "Total inference time with " << FLAGS_num_threads << " threads : " << (stop_ms - start_ms) / 1000.0 << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000); - delete scope; } From 38f8182df63d33ff619297d95f5a4431bf8d5362 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 20:41:18 +0800 Subject: [PATCH 17/19] work around with dummy test --- .../fluid/inference/tests/book/CMakeLists.txt | 8 ++++++- .../tests/book/test_inference_nlp.cc | 21 ++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 90357f99d1..b33df2942a 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -35,7 +35,13 @@ inference_test(image_classification ARGS vgg resnet) inference_test(label_semantic_roles) inference_test(recognize_digits ARGS mlp conv) inference_test(recommender_system) -inference_test(nlp) #inference_test(rnn_encoder_decoder) #inference_test(understand_sentiment ARGS conv) inference_test(word2vec) + +# This is an unly work around to make this test run 
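+# (it pairs the recognize_digits MLP model from the python tests with the
+# dummy-data fallback added below, so the binary can still execute when no
+# real NLP model or data file is supplied)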
+cc_test(test_inference_nlp
+  SRCS test_inference_nlp.cc
+  DEPS paddle_fluid
+  ARGS
+  --modelpath=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index c4d7b0bbf0..5ece6084df 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -37,10 +37,22 @@ inline double GetCurrentMs() {
   return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
 }
 
+// This function just gives dummy data for the recognize_digits model.
+size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
+  paddle::framework::LoDTensor input;
+  SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
+  out->emplace_back(input);
+  return 1;
+}
+
 // Load the input word index data from file and save into LoDTensor.
 // Return the size of words.
 size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
                 const std::string& filename) {
+  if (filename.empty()) {
+    return DummyData(out);
+  }
+
   size_t sz = 0;
   std::fstream fin(filename);
   std::string line;
@@ -130,9 +142,12 @@ void ThreadRunInfer(
 }
 
 TEST(inference, nlp) {
-  if (FLAGS_modelpath.empty() || FLAGS_datafile.empty()) {
-    LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model "
-               << "--datafile=path/to/your/data";
+  if (FLAGS_modelpath.empty()) {
+    LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model";
+  }
+  if (FLAGS_datafile.empty()) {
+    LOG(WARNING) << " Not data file provided, will use dummy data!"
+                 << " Note: if you use an nlp model, please provide a data file.";
   }
   LOG(INFO) << "Model Path: " << FLAGS_modelpath;
   LOG(INFO) << "Data File: " << FLAGS_datafile;

From 99d00cce9330dac56aac52788d7fba76d0137430 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 1 Jun 2018 21:04:51 +0800
Subject: [PATCH 18/19] follow comment: refine where time started

---
 paddle/fluid/inference/tests/book/test_inference_nlp.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 5ece6084df..c3bec27925 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -185,12 +185,12 @@ TEST(inference, nlp) {
     std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
     SplitData(datasets, &jobs, FLAGS_num_threads);
     std::vector<std::unique_ptr<std::thread>> threads;
+    start_ms = GetCurrentMs();
     for (int i = 0; i < FLAGS_num_threads; ++i) {
       threads.emplace_back(
           new std::thread(ThreadRunInfer, i, &executor, scope.get(),
                           std::ref(inference_program), std::ref(jobs)));
     }
-    start_ms = GetCurrentMs();
     for (int i = 0; i < FLAGS_num_threads; ++i) {
       threads[i]->join();
     }

From 6ae7cbe252178e7bd3e5c3b7cde21581948b478f Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 4 Jun 2018 11:08:08 +0800
Subject: [PATCH 19/19] follow comments

---
 .../fluid/inference/tests/book/CMakeLists.txt |  3 ++-
 .../tests/book/test_inference_nlp.cc          | 21 ++++++++++---------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index b33df2942a..2fa5a9540b 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -40,8 +40,9 @@ inference_test(recommender_system)
 inference_test(word2vec)
 
 # This is an ugly workaround to make this test run
+# TODO(TJ): clean me up
 cc_test(test_inference_nlp
   SRCS test_inference_nlp.cc
   DEPS paddle_fluid
   ARGS
-  --modelpath=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
+  --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index c3bec27925..70aa42ac41 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -24,8 +24,8 @@ limitations under the License. */
 #include <omp.h>
 #endif
 
-DEFINE_string(modelpath, "", "Directory of the inference model.");
-DEFINE_string(datafile, "", "File of input index data.");
+DEFINE_string(model_path, "", "Directory of the inference model.");
+DEFINE_string(data_file, "", "File of input index data.");
 DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
@@ -65,6 +65,7 @@ size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
       ids.push_back(stoi(field));
     }
     if (ids.size() >= 1024) {
+      // Synced with the NLP team: they ignore inputs larger than 1024.
       continue;
     }
 
@@ -142,18 +143,18 @@ void ThreadRunInfer(
 }
 
 TEST(inference, nlp) {
-  if (FLAGS_modelpath.empty()) {
-    LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model";
+  if (FLAGS_model_path.empty()) {
+    LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
   }
-  if (FLAGS_datafile.empty()) {
-    LOG(WARNING) << " Not data file provided, will use dummy data!"
+  if (FLAGS_data_file.empty()) {
+    LOG(WARNING) << "No data file provided, will use dummy data!"
                  << " Note: if you use an nlp model, please provide a data file.";
   }
-  LOG(INFO) << "Model Path: " << FLAGS_modelpath;
-  LOG(INFO) << "Data File: " << FLAGS_datafile;
+  LOG(INFO) << "Model Path: " << FLAGS_model_path;
+  LOG(INFO) << "Data File: " << FLAGS_data_file;
 
   std::vector<paddle::framework::LoDTensor> datasets;
-  size_t num_total_words = LoadData(&datasets, FLAGS_datafile);
+  size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
   LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
   LOG(INFO) << "Total number of words: " << num_total_words;
 
@@ -168,7 +169,7 @@ TEST(inference, nlp) {
   // 2. Initialize the inference_program and load parameters
   std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
   inference_program =
-      InitProgram(&executor, scope.get(), FLAGS_modelpath, model_combined);
+      InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined);
   if (FLAGS_use_mkldnn) {
     EnableMKLDNN(inference_program);
   }
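
A note on the scope fix in PATCH 15: sharing one Scope across worker threads means all threads create variables in the same container, so each thread instead takes its own sub-scope from the root (scope->NewScope()) and hands it back (scope->DeleteScope(&sub_scope)) once its job list is done. Below is a minimal plain-C++11 sketch of that ownership pattern; the Scope class here is an illustrative stand-in for paddle::framework::Scope, not Paddle code.

#include <cstdio>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

// Illustrative stand-in: a scope that owns its children, so a child handed
// to a worker stays valid until the root is destroyed.
class Scope {
 public:
  Scope* NewChild() {
    std::lock_guard<std::mutex> lock(mu_);
    children_.emplace_back(new Scope());
    return children_.back().get();
  }

 private:
  std::mutex mu_;
  std::vector<std::unique_ptr<Scope>> children_;
};

// Each worker creates one child scope up front and runs all of its jobs in
// it, mirroring the sub_scope usage in ThreadRunInfer above.
void Worker(int tid, Scope* root, const std::vector<int>& jobs) {
  Scope* sub = root->NewChild();
  for (int job : jobs) {
    (void)sub;  // run `job` against the thread-local scope here
    std::printf("tid %d ran job %d\n", tid, job);
  }
}

int main() {
  Scope root;
  std::vector<std::vector<int>> jobs = {{0, 1}, {2, 3}};
  std::vector<std::thread> threads;
  for (int i = 0; i < 2; ++i) {
    threads.emplace_back([&, i] { Worker(i, &root, jobs[i]); });
  }
  for (auto& t : threads) {
    t.join();
  }
  return 0;
}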
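
PATCH 16 documents that SplitData distributes the input samples across threads as evenly as possible. One simple way to get that guarantee is round-robin assignment, where any two job lists differ in size by at most one; the sketch below is standalone C++ with an illustrative name (SplitEvenly), not the SplitData implementation itself.

#include <cstdio>
#include <vector>

// Round-robin split: with n samples and t threads, each job receives either
// n/t or n/t + 1 samples.
std::vector<std::vector<int>> SplitEvenly(const std::vector<int>& samples,
                                          size_t num_threads) {
  std::vector<std::vector<int>> jobs(num_threads);
  for (size_t i = 0; i < samples.size(); ++i) {
    jobs[i % num_threads].push_back(samples[i]);
  }
  return jobs;
}

int main() {
  std::vector<int> samples;
  for (int i = 0; i < 10; ++i) samples.push_back(i);
  const auto jobs = SplitEvenly(samples, 4);  // job sizes: 3, 3, 2, 2
  for (size_t t = 0; t < jobs.size(); ++t) {
    std::printf("thread %zu gets %zu samples\n", t, jobs[t].size());
  }
  return 0;
}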
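
On the timing convention used throughout the series: GetCurrentMs() returns wall-clock time in milliseconds built from gettimeofday, and the QPS log divides the sample count by the elapsed time converted to seconds. A self-contained sketch of that arithmetic follows; num_samples is a placeholder for datasets.size(), and the zero-elapsed guard is an addition not present in the test itself.

#include <sys/time.h>
#include <cstdio>

// Same convention as the series: wall-clock time in milliseconds.
inline double GetCurrentMs() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
}

int main() {
  const int num_samples = 100;  // placeholder for datasets.size()
  double start_ms = GetCurrentMs();
  // ... run inference over the num_samples inputs here ...
  double stop_ms = GetCurrentMs();
  double sec = (stop_ms - start_ms) / 1000.0;
  if (sec > 0) {  // avoid dividing by a zero elapsed time
    std::printf("QPS: %f\n", num_samples / sec);
  }
  return 0;
}

Because GetCurrentMs() returns double, the "/ 1000" in the patched QPS line is already floating-point division; writing 1000.0 would only make that explicit.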