From 98fb8e58fd4fb91423d414d67f2a2684b6841020 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 11:57:44 +0800 Subject: [PATCH 01/68] test infer nlp --- paddle/fluid/inference/io.cc | 2 +- .../fluid/inference/tests/book/CMakeLists.txt | 1 + .../tests/book/test_inference_nlp.cc | 85 +++++++++++++++++++ paddle/fluid/inference/tests/test_helper.h | 3 + 4 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/inference/tests/book/test_inference_nlp.cc diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 65db7c7b50..98780b6881 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -117,7 +117,7 @@ std::unique_ptr Load(framework::Executor* executor, std::string program_desc_str; VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, &program_desc_str); - + // LOG(INFO) << program_desc_str; std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index dbb81462b8..90357f99d1 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -35,6 +35,7 @@ inference_test(image_classification ARGS vgg resnet) inference_test(label_semantic_roles) inference_test(recognize_digits ARGS mlp conv) inference_test(recommender_system) +inference_test(nlp) #inference_test(rnn_encoder_decoder) #inference_test(understand_sentiment ARGS conv) inference_test(word2vec) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc new file mode 100644 index 0000000000..0d6d0adfb2 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor words; + + paddle::framework::LoD lod{{0, 83}}; + int64_t word_dict_len = 198392; + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + /* + std::vector srcdata{ + 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, + 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, + 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, + 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, + 198391, + 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, + 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, + 198391, + 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, + 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, + 42468, + 4376, 10251, 10760}; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = + words.mutable_data({static_cast(srcdata.size()), 1}, + paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); + */ + LOG(INFO) << "number of input size:" << words.numel(); + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + int repeat = 100; + // Run inference on CPU + TestInference(dirname, cpu_feeds, + cpu_fetchs1, repeat); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be6..1f5551567c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -182,6 +182,9 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); + // std::string binary_str; + // inference_program->Proto()->SerializeToString(&binary_str); + // LOG(INFO) << binary_str; if (use_mkldnn) { EnableMKLDNN(inference_program); } From 602e28bf1c30cd72e7378d6dc1071423086bdc73 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 14:38:01 +0800 Subject: [PATCH 02/68] use the actual data --- .../tests/book/test_inference_nlp.cc | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 0d6d0adfb2..27bdd5528e 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -29,32 +31,28 @@ TEST(inference, understand_sentiment) { // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc paddle::framework::LoDTensor words; - - paddle::framework::LoD lod{{0, 83}}; - int64_t word_dict_len = 198392; - SetupLoDTensor(&words, lod, static_cast(0), - static_cast(word_dict_len - 1)); /* - std::vector srcdata{ - 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, - 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, - 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, - 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, - 198391, - 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, - 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, - 198391, - 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, - 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, - 42468, - 4376, 10251, 10760}; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = - words.mutable_data({static_cast(srcdata.size()), 1}, - paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - */ + paddle::framework::LoD lod{{0, 83}}; + int64_t word_dict_len = 198392; + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + */ + std::vector srcdata{ + 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, + 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, + 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, + 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, 198391, + 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, + 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, 198391, + 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, + 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, + 4376, 10251, 10760}; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(srcdata.size()), 1}, paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); + LOG(INFO) << "number of input size:" << words.numel(); std::vector cpu_feeds; cpu_feeds.push_back(&words); From ce20dfa236a0bf874d8580a7861b7a85dffdf74c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 15:17:06 +0800 Subject: [PATCH 03/68] enable more choices --- .../tests/book/test_inference_nlp.cc | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 27bdd5528e..c942b43f17 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,6 +19,10 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(repeat, 100, "Running the inference program repeat times"); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); +DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); +DEFINE_bool(prepare_context, true, "Prepare Context before executor"); TEST(inference, understand_sentiment) { if (FLAGS_dirname.empty()) { @@ -61,10 +65,29 @@ TEST(inference, understand_sentiment) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - int repeat = 100; // Run inference on CPU - TestInference(dirname, cpu_feeds, - cpu_fetchs1, repeat); + const bool model_combined = false; + if (FLAGS_prepare_vars) { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } else { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } LOG(INFO) << output1.lod(); LOG(INFO) << output1.dims(); From 400f5e7c3ce21ba63bee62a599a82c4a0bbc299d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 25 May 2018 21:08:49 +0800 Subject: [PATCH 04/68] add threads test --- .../tests/book/test_inference_nlp.cc | 135 +++++++++--------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index c942b43f17..ca02e38ede 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include // NOLINT #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -31,76 +32,74 @@ TEST(inference, understand_sentiment) { LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; std::string dirname = FLAGS_dirname; - - // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - paddle::framework::LoDTensor words; - /* - paddle::framework::LoD lod{{0, 83}}; - int64_t word_dict_len = 198392; - SetupLoDTensor(&words, lod, static_cast(0), - static_cast(word_dict_len - 1)); - */ - std::vector srcdata{ - 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, 1550, - 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, 396, 6463, - 6463, 1550, 198391, 6463, 22564, 1612, 291, 68, 164, 784, - 784, 1550, 198391, 6463, 13659, 3362, 42468, 6189, 2209, 198391, - 6463, 2209, 2209, 198391, 6463, 2209, 1062, 3029, 1831, 3029, - 1065, 2281, 100, 11216, 1110, 56, 10869, 9811, 100, 198391, - 6463, 100, 9280, 100, 288, 40031, 1680, 1335, 100, 1550, - 9280, 7265, 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, - 4376, 10251, 10760}; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = words.mutable_data( - {static_cast(srcdata.size()), 1}, paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - - LOG(INFO) << "number of input size:" << words.numel(); - std::vector cpu_feeds; - cpu_feeds.push_back(&words); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU const bool model_combined = false; - if (FLAGS_prepare_vars) { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } else { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } - LOG(INFO) << output1.lod(); - LOG(INFO) << output1.dims(); + int total_work = 100; + int num_threads = 10; + int work_per_thread = total_work / num_threads; + std::vector> infer_threads; + for (int i = 0; i < num_threads; ++i) { + infer_threads.emplace_back(new std::thread([&, i]() { + for (int j = 0; j < work_per_thread; ++j) { + // 0. 
Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor words; + /* + paddle::framework::LoD lod{{0, 83}}; + int64_t word_dict_len = 198392; + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + */ + std::vector srcdata{ + 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, + 1550, 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, + 396, 6463, 6463, 1550, 198391, 6463, 22564, 1612, 291, + 68, 164, 784, 784, 1550, 198391, 6463, 13659, 3362, + 42468, 6189, 2209, 198391, 6463, 2209, 2209, 198391, 6463, + 2209, 1062, 3029, 1831, 3029, 1065, 2281, 100, 11216, + 1110, 56, 10869, 9811, 100, 198391, 6463, 100, 9280, + 100, 288, 40031, 1680, 1335, 100, 1550, 9280, 7265, + 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, 4376, + 10251, 10760}; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(srcdata.size()), 1}, + paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); -#ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); + LOG(INFO) << "number of input size:" << words.numel(); + std::vector cpu_feeds; + cpu_feeds.push_back(&words); - // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.lod(); - LOG(INFO) << output2.dims(); + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); - CheckError(output1, output2); -#endif + // Run inference on CPU + if (FLAGS_prepare_vars) { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } else { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + } + })); + } } From c00843f4e8860d7abff0077168942fa99ef37154 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 29 May 2018 17:43:01 +0800 Subject: [PATCH 05/68] enable multi-threads --- .../tests/book/test_inference_nlp.cc | 12 ++++++++ paddle/fluid/inference/tests/test_helper.h | 29 ------------------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index ca02e38ede..6ff8a18cdb 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -25,6 +25,12 @@ DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_bool(prepare_context, true, "Prepare Context before executor"); +inline double get_current_ms() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; +} + TEST(inference, understand_sentiment) { if (FLAGS_dirname.empty()) { LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; @@ -102,4 +108,10 @@ TEST(inference, understand_sentiment) { } })); } + auto start_ms = get_current_ms(); + for 
(int i = 0; i < num_threads; ++i) { + infer_threads[i]->join(); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "total: " << stop_ms - start_ms << " ms"; } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 1f5551567c..dd3a7a584a 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -156,27 +156,10 @@ void TestInference(const std::string& dirname, auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); - // Profile the performance - paddle::platform::ProfilerState state; - if (paddle::platform::is_cpu_place(place)) { - state = paddle::platform::ProfilerState::kCPU; - } else { -#ifdef PADDLE_WITH_CUDA - state = paddle::platform::ProfilerState::kAll; - // The default device_id of paddle::platform::CUDAPlace is 0. - // Users can get the device_id using: - // int device_id = place.GetDeviceId(); - paddle::platform::SetDeviceId(0); -#else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); -#endif - } - // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; // Enable the profiler - paddle::platform::EnableProfiler(state); { paddle::platform::RecordEvent record_event( "init_program", @@ -189,10 +172,6 @@ void TestInference(const std::string& dirname, EnableMKLDNN(inference_program); } } - // Disable the profiler and print the timing information - paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, - "load_program_profiler"); - paddle::platform::ResetProfiler(); // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = @@ -233,9 +212,6 @@ void TestInference(const std::string& dirname, true, CreateVars); } - // Enable the profiler - paddle::platform::EnableProfiler(state); - // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { paddle::platform::RecordEvent record_event( @@ -252,11 +228,6 @@ void TestInference(const std::string& dirname, CreateVars); } } - - // Disable the profiler and print the timing information - paddle::platform::DisableProfiler( - paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); - paddle::platform::ResetProfiler(); } delete scope; From 77599415ba1b93715fa0626e147865c088970ee6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 30 May 2018 12:15:10 +0800 Subject: [PATCH 06/68] enable read dataset --- .../tests/book/test_inference_nlp.cc | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 6ff8a18cdb..95cdeb4ad1 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -14,7 +14,12 @@ limitations under the License. 
*/ #include #include +#include +#include +#include +#include #include // NOLINT +#include #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -31,16 +36,37 @@ inline double get_current_ms() { return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } +void read_data( + std::vector>* out, + const std::string& filename = "/home/tangjian/paddle-tj/out.ids.txt") { + using namespace std; // NOLINT + fstream fin(filename); + string line; + out->clear(); + while (getline(fin, line)) { + istringstream iss(line); + vector ids; + string field; + while (getline(iss, field, ' ')) { + ids.push_back(stoi(field)); + } + out->push_back(ids); + } +} + TEST(inference, understand_sentiment) { if (FLAGS_dirname.empty()) { LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - + std::vector> inputdatas; + read_data(&inputdatas); + LOG(INFO) << "---------- dataset size: " << inputdatas.size(); LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; std::string dirname = FLAGS_dirname; + const bool model_combined = false; - int total_work = 100; - int num_threads = 10; + int total_work = 10; + int num_threads = 2; int work_per_thread = total_work / num_threads; std::vector> infer_threads; for (int i = 0; i < num_threads; ++i) { From a2c017da9b77ac76359dcb3dc24fccde6d71f32b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 31 May 2018 01:58:00 +0800 Subject: [PATCH 07/68] 1. merge simple_dist_transpiler to distribute_transpiler 2. add align_var_to_block argument to func transpile 3. remove concat and spilt if align_var_to_block is False 4. unittests for simple_dist_transpiler --- .../unittests/test_simple_dist_transpiler.py | 120 +++++++++ .../fluid/transpiler/distribute_transpiler.py | 45 +++- .../distribute_transpiler_simple.py | 254 ------------------ 3 files changed, 156 insertions(+), 263 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py delete mode 100644 python/paddle/fluid/transpiler/distribute_transpiler_simple.py diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py new file mode 100644 index 0000000000..d51e356a53 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -0,0 +1,120 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.layers as layers +from paddle.fluid.transpiler.distribute_transpiler import delete_ops +import numpy as np + + +class TestSimpleDistTranspiler(unittest.TestCase): + def setUp(self): + self.trainer_id = 0 + self.trainers = 2 + self.pservers = 2 + self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" + self.current_pserver_ep = "127.0.0.1:6175" + + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w')) + + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + return optimize_ops, params_grads + + def test_simple_transpiler(self): + np.random.seed(1) + + trainer = self.get_trainer() + pserver, startup = self.get_pserver(self.current_pserver_ep) + self.assertEqual([op.type for op in trainer.global_block().ops], + self.get_expect_trainer_ops()) + + self.assertEqual(len(pserver.blocks), 2) + # block0: listen_and_serv + self.assertEqual([op.type for op in pserver.blocks[0].ops], + ["listen_and_serv"]) + # block1: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "sgd"]) + + print("xxx", [op.output_arg_names for op in startup.global_block().ops]) + # confirm startup program + self.assertEqual([op.type for op in startup.global_block().ops], + ["fill_constant", "uniform_random", "uniform_random"]) + + # the variable #fc_w will NOT be splited + fc_w_var = startup.global_block().var("fc_w@GRAD") + self.assertEqual(fc_w_var.shape, (1000, 1000)) + + fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0") + self.assertEqual(fc_w_var.shape, (1000, 1000)) + + def get_main_program(self): + main = fluid.Program() + + with fluid.program_guard(main): + self.net_conf() + + return main + + def get_expect_trainer_ops(self): + trainer = fluid.Program() + + with fluid.program_guard(trainer): + optimize_ops, params_grads = self.net_conf() + + delete_ops(trainer.global_block(), optimize_ops) + ops = [op.type for op in trainer.global_block().ops] + [ + "send_vars", "send_barrier", "recv", "recv", "fetch_barrier" + ] + ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + return ops + + def get_trainer(self): + return self._transpiler_instance().get_trainer_program() + + def get_pserver(self, ep): + t = self._transpiler_instance() + pserver = t.get_pserver_program(ep) + startup = t.get_startup_program(ep, pserver) + return pserver, startup + + def _transpiler_instance(self): + main = self.get_main_program() + t = fluid.DistributeTranspiler() + t.transpile( + self.trainer_id, + program=main, + pservers=self.pserver_eps, + trainers=self.trainers, + align_var_to_block=False) + return t + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index e9b7d9e9d2..eb7daeffb9 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -15,6 +15,7 @@ from __future__ import print_function import math +import numpy as np from ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. 
import core, framework @@ -103,7 +104,7 @@ def split_dense_variable(var_list, service_count, min_block_size=8192): We need to have a minimal block size so that the calculations in the parameter server side can gain better performance. By default - minimum block size 8K elements (maybe 16bit or 32bit or 64bit). + minimum block size 8K elements (maybe 16bit or 32bit or 64bit). Args: var_list (list): List of variables. @@ -111,7 +112,7 @@ def split_dense_variable(var_list, service_count, min_block_size=8192): or more listening ports. min_block_size (int): Minimum splitted block size. Returns: - blocks (list[(varname, block_id, current_block_size)]): A list + blocks (list[(varname, block_id, current_block_size)]): A list of VarBlocks. Each VarBlock specifies a shard of the var. """ blocks = [] @@ -171,6 +172,7 @@ class DistributeTranspiler: program=None, pservers="127.0.0.1:6174", trainers=1, + align_var_to_block=True, split_method=RoundRobin, sync_mode=True): """ @@ -183,7 +185,8 @@ class DistributeTranspiler: parameter servers. Steps to transpile trainer: - 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). + 1. split variable to multiple blocks, aligned by product(dim[1:]) (width) + if align_var_to_block is True 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. 4. append send_op to send splited variables to server and fetch @@ -293,9 +296,18 @@ class DistributeTranspiler: for index in range(len(self.pserver_endpoints)) ] - grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) - param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) + if align_var_to_block: + grad_blocks = split_dense_variable(grad_list, + len(pserver_endpoints)) + param_blocks = split_dense_variable(param_list, + len(pserver_endpoints)) + else: + # when we do NOT align var to block, we will always split params + # grads into one block. + grad_blocks = split_dense_variable(grad_list, 1) + param_blocks = split_dense_variable(param_list, 1) assert (len(grad_blocks) == len(param_blocks)) + # step2: Create new vars for the parameters and gradients blocks and # add ops to do the split. param_var_mapping = self._create_vars_from_blocklist(program, @@ -325,8 +337,22 @@ class DistributeTranspiler: # step 3.1: insert send op to send gradient vars to parameter servers ps_dispatcher.reset() send_vars = [] - for orig_varname, splited_vars in grad_var_mapping.items(): + + # in general cases, the number of pservers is times of 2, and this + # will lead to uneven distribution among weights and bias: + # fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1 + # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 + # shuffle the map will avoid the uneven distribution above + grad_var_mapping_items = grad_var_mapping.items() + if not align_var_to_block: + np.random.shuffle(grad_var_mapping_items) + + for orig_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) + + if not align_var_to_block: + assert (len(splited_vars) == 1) + if len(splited_vars) == 1: orig_varname = splited_vars[0].name index = find_op_by_output_arg(program.global_block(), @@ -374,7 +400,7 @@ class DistributeTranspiler: for i, ep in enumerate(eplist): self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) - # step4: Concat the parameters splits together after recv. 
+ for varname, splited_var in param_var_mapping.iteritems(): eps = [] for var in splited_var: @@ -399,6 +425,7 @@ class DistributeTranspiler: RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) + # step4: Concat the parameters splits together after recv. for varname, splited_var in param_var_mapping.iteritems(): if len(splited_var) <= 1: continue @@ -849,8 +876,8 @@ class DistributeTranspiler: program (ProgramDesc): ProgramDesc which gradients blong. block_list (list[(varname, block_id, block_size)]): List of gradient blocks. add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True. - Returns: - var_mapping (dict(varname->[new_varname_variable])):A dict mapping + Returns: + var_mapping (dict(varname->[new_varname_variable])):A dict mapping from original var name to each var split. """ diff --git a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py deleted file mode 100644 index ea8c27cdca..0000000000 --- a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ..framework import Program, default_main_program, Parameter, Variable -from ..layer_helper import LayerHelper - - -def hash_name_to_server(params_grads, pserver_endpoints): - """ - :param param_grads: - :return: a map of pserver endpoint -> - params -> [param list] - grads -> [grad list] - """ - - def _hash_param(param_name, total): - return hash(param_name) % total - - param_grad_map = dict() - for param, grad in params_grads: - if param.trainable is True and grad is not None: - server_id = _hash_param(param.name, len(pserver_endpoints)) - server_for_param = pserver_endpoints[server_id] - if not param_grad_map.has_key(server_for_param): - param_grad_map[server_for_param] = {"params": [], "grads": []} - param_grad_map[server_for_param]["params"].append(param) - param_grad_map[server_for_param]["grads"].append(grad) - - return param_grad_map - - -def round_robin(params_grads, pserver_endpoints): - assert (len(params_grads) > len(pserver_endpoints)) - - param_grad_map = dict() - pserver_idx = 0 - for param, grad in params_grads: - if param.trainable is True: - server_for_param = pserver_endpoints[pserver_idx] - if not param_grad_map.has_key(server_for_param): - param_grad_map[server_for_param] = {"params": [], "grads": []} - - param_grad_map[server_for_param]["params"].append(param) - param_grad_map[server_for_param]["grads"].append(grad) - - pserver_idx += 1 - if pserver_idx >= len(pserver_endpoints): - pserver_idx = 0 - return param_grad_map - - -class SimpleDistributeTranspiler: - def transpile(self, - optimize_ops, - params_grads, - program=None, - pservers="127.0.0.1:6174", - trainers=1, - split_method=round_robin): - """ - Transpile the program to a distributed data-parallelism programs. 
- - The main_program will be transform to use a remote parameter server - to do parameter optimization. And the optimization graph will be put - in to a parameter server program. - - Use different methods to split trainable varialbles to different - parameter servers. - - Example to run: - - exe = fluid.Executor(place) - t = fluid.DistributeTranspiler() - t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1) - - pserver_endpoint = os.getenv("PSERVER") - if pserver_endpoint: - pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops) - exe.run(fluid.default_startup_program()) - exe.run(pserver_prog) - else: - feeder = fluid.DataFeeder(feed_list=[images, label], place=place) - exe.run(fluid.default_startup_program()) - - for pass_id in range(PASS_NUM): - ... - - :param optimize_ops: op list of optimization, should be the - return value of Optimizer.minimize - :type optimize_ops: list - :param program: program to optimize, default default_main_program - :param pservers: parameter server endpoints like "m1:6174,m2:6174" - :type pservers: string - - :return: return a list of programs - """ - if program is None: - program = default_main_program() - self.program = program - self.trainers = trainers - self.optimize_ops = optimize_ops - self._optimize_distributed( - optimize_ops, - program, - params_grads, - pservers=pservers, - trainers=trainers, - split_method=split_method) - - def _clone_param(self, block, v): - assert isinstance(v, Parameter) - new_p = Parameter( - block=block, - shape=v.shape, - dtype=v.dtype, - type=v.type, - lod_level=v.lod_level, - stop_gradient=v.stop_gradient, - trainable=v.trainable, - optimize_attr=v.optimize_attr, - regularizer=v.regularizer, - name=v.name) - block.vars[new_p.name] = new_p - - def _clone_var(self, block, var): - assert isinstance(var, Variable) - return block.create_var( - name=var.name, - shape=var.shape, - dtype=var.dtype, - type=var.type, - lod_level=var.lod_level, - persistable=var.persistable) - - def _optimize_distributed(self, optimize_ops, program, params_and_grads, - **kwargs): - if kwargs.has_key("split_method"): - split_method = kwargs["split_method"] - else: - split_method = round_robin - - assert (callable(split_method)) - pserver_endpoints = kwargs["pservers"].split(",") - self.param_grad_map = split_method(params_and_grads, pserver_endpoints) - - send_op_ordered_inputs = [] - send_op_ordered_outputs = [] - epmap = [] - for ep, v in self.param_grad_map.iteritems(): - send_op_ordered_inputs.extend(v["grads"]) - send_op_ordered_outputs.extend(v["params"]) - for i in v["grads"]: - epmap.append(ep) - send_op = program.global_block().append_op( - type="send", - inputs={"X": send_op_ordered_inputs - }, # inputs is a list of tensors to be send - outputs={"Out": send_op_ordered_outputs}, - attrs={"endpoints": pserver_endpoints, - "epmap": epmap}) - - def get_trainer_program(self): - # remove optimize ops and add a send op to main_program - self.program.global_block().delete_ops(self.optimize_ops) - return self.program - - def _create_var_for_trainers(self, block, var, trainers): - var_list = [] - for i in xrange(trainers): - var_each = block.create_var( - name="%s.trainer_%d" % (var.name, i), - psersistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - var_list.append(var_each) - return var_list - - def get_pserver_program(self, endpoint, optimize_ops): - pserver_program = Program() - for v in self.param_grad_map[endpoint]["params"]: - self._clone_param(pserver_program.global_block(), v) - - 
optimize_sub_program = Program() - grad_var_names = [ - var.name for var in self.param_grad_map[endpoint]["grads"] - ] - for opt_op in optimize_ops: - for _, var in opt_op.inputs.iteritems(): - # NOTE: append operators to merge gradients from multiple - # trainers. If trainers == 1, this is not needed. - if self.trainers > 1 and var.name in grad_var_names: - vars2merge = self._create_var_for_trainers( - optimize_sub_program.global_block(), var, self.trainers) - merged_var = optimize_sub_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - optimize_sub_program.global_block().append_op( - type="sum", - inputs={"X": vars2merge}, - outputs={"Out": merged_var}) - optimize_sub_program.global_block().append_op( - type="scale", - inputs={"X": merged_var}, - outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainers)}) - else: - optimize_sub_program.global_block().create_var( - name=var.name, - persistable=var.persistable, - dtype=var.dtype, - shape=var.shape) - - if opt_op.inputs.has_key("Grad"): - if opt_op.inputs["Grad"].name in grad_var_names: - optimize_sub_program.global_block().append_op( - type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, - attrs=opt_op.attrs) - else: - optimize_sub_program.global_block().append_op( - type=opt_op.type, - inputs=opt_op.inputs, - outputs=opt_op.outputs, - attrs=opt_op.attrs) - pserver_program.global_block().append_op( - type="recv", - inputs={"RX": - self.param_grad_map[endpoint]["grads"]}, # grads to recv - outputs={}, - attrs={ - "OptimizeBlock": optimize_sub_program.global_block(), - "endpoint": endpoint, - "ParamList": - [p.name for p in self.param_grad_map[endpoint]["params"]], - "GradList": - [p.name for p in self.param_grad_map[endpoint]["grads"]], - "Trainers": self.trainers - }) - pserver_program.sync_with_cpp() - return pserver_program From 8248333ad2adcb3b3cdcb353cff4025092f8dcdb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 31 May 2018 10:53:45 +0800 Subject: [PATCH 08/68] remove simple distranspiler in transpiler/ repo --- python/paddle/fluid/transpiler/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 045ca537b2..cf18090f71 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -15,10 +15,9 @@ from distribute_transpiler import DistributeTranspiler from inference_transpiler import InferenceTranspiler from memory_optimization_transpiler import memory_optimize, release_memory -from distribute_transpiler_simple import SimpleDistributeTranspiler from ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler", - "memory_optimize", "release_memory", "HashName", "RoundRobin" + "DistributeTranspiler", "InferenceTranspiler", "memory_optimize", + "release_memory", "HashName", "RoundRobin" ] From 164692da9a75744db770836111ae4c63ac6ed7c3 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 31 May 2018 11:00:40 +0800 Subject: [PATCH 09/68] drop the last batch, if the size of last batch is not equal to batch_size --- python/paddle/batch.py | 6 ++++-- python/paddle/v2/minibatch.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/batch.py b/python/paddle/batch.py index 317cf037c6..d48c54fcbb 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 
@@ __all__ = ['batch'] -def batch(reader, batch_size): +def batch(reader, batch_size, drop_last=False): """ Create a batched reader. @@ -23,6 +23,8 @@ def batch(reader, batch_size): :type reader: callable :param batch_size: size of each mini-batch :type batch_size: int + :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size. + :type drop_last: bool :return: the batched reader. :rtype: callable """ @@ -35,7 +37,7 @@ def batch(reader, batch_size): if len(b) == batch_size: yield b b = [] - if b: + if drop_last == False and len(b) != 0: yield b return batch_reader diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py index 317cf037c6..d48c54fcbb 100644 --- a/python/paddle/v2/minibatch.py +++ b/python/paddle/v2/minibatch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size): +def batch(reader, batch_size, drop_last=False): """ Create a batched reader. @@ -23,6 +23,8 @@ def batch(reader, batch_size): :type reader: callable :param batch_size: size of each mini-batch :type batch_size: int + :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size. + :type drop_last: bool :return: the batched reader. :rtype: callable """ @@ -35,7 +37,7 @@ def batch(reader, batch_size): if len(b) == batch_size: yield b b = [] - if b: + if drop_last == False and len(b) != 0: yield b return batch_reader From a4d88fb6420cba41c701378b1ed56fe4f9ec45f1 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 31 May 2018 11:41:08 +0800 Subject: [PATCH 10/68] 1. remove simple distranspiler from transpiler/ repo 2. remvoe comment line --- python/paddle/fluid/__init__.py | 4 ++-- .../fluid/tests/unittests/test_simple_dist_transpiler.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 859605d005..d53a96a7a7 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -44,8 +44,8 @@ import transpiler from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace -from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \ - InferenceTranspiler, memory_optimize, release_memory +from transpiler import DistributeTranspiler, InferenceTranspiler, \ + memory_optimize, release_memory from concurrency import (Go, make_channel, channel_send, channel_recv, channel_close, Select) from lod_tensor import create_lod_tensor, create_random_int_lodtensor diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py index d51e356a53..60f99f412c 100644 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -62,7 +62,6 @@ class TestSimpleDistTranspiler(unittest.TestCase): self.assertEqual([op.type for op in pserver.blocks[1].ops], ["sum", "scale", "sgd"]) - print("xxx", [op.output_arg_names for op in startup.global_block().ops]) # confirm startup program self.assertEqual([op.type for op in startup.global_block().ops], ["fill_constant", "uniform_random", "uniform_random"]) From e05abab60923c819c16980bdd53bdd684518f75b Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 31 May 2018 11:56:59 +0800 Subject: [PATCH 11/68] use recordio in dist train --- doc/v2/howto/recordio/README.md | 122 ++++++++++++++++++ .../details/threaded_ssa_graph_executor.cc 
| 6 +-
 .../reader/create_recordio_file_reader_op.cc  | 12 +-
 python/paddle/fluid/layers/io.py              | 20 +--
 python/paddle/fluid/recordio_writer.py        | 35 ++++-
 tools/codestyle/docstring_checker.pyc         | Bin 11769 -> 11769 bytes
 6 files changed, 178 insertions(+), 17 deletions(-)
 create mode 100644 doc/v2/howto/recordio/README.md

diff --git a/doc/v2/howto/recordio/README.md b/doc/v2/howto/recordio/README.md
new file mode 100644
index 0000000000..3f81d54b8e
--- /dev/null
+++ b/doc/v2/howto/recordio/README.md
@@ -0,0 +1,122 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert your training
+data into RecordIO files and read them back during training. PaddlePaddle Fluid provides
+several interfaces to deal with RecordIO files.
+
+## Generate a RecordIO File
+
+Before starting a training job with RecordIO files, convert your training data to the
+RecordIO format with `fluid.recordio_writer.convert_reader_to_recordio_file`. A sample is
+shown below:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+                name='image', shape=[784]),
+            fluid.layers.data(
+                name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code generates a RecordIO file `./mnist.recordio` on your host.
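+
+A note on verifying the output: `convert_reader_to_recordio_file` returns the number of
+records it has written, which makes a quick sanity check possible. The sketch below assumes
+the standard MNIST training set and the `batch_size=1` reader from above:
+
+```python
+    # convert_reader_to_recordio_file returns the number of records written
+    counter = fluid.recordio_writer.convert_reader_to_recordio_file(
+        './mnist.recordio', reader, feeder)
+    # With batch_size=1 every MNIST training sample becomes one record,
+    # so counter is expected to be 60000.
+    print("wrote %d records" % counter)
+```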
+
+## Use the RecordIO File in a Local Training Job
+
+PaddlePaddle Fluid provides the interface `fluid.layers.io.open_recordio_files` to load your
+RecordIO file, and you can then use the result as a layer in your network configuration. A
+sample is shown below:
+
+```python
+    data_file = fluid.layers.io.open_recordio_files(
+        filenames="./mnist.recordio",
+        shapes=[(-1, 784),(-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+    place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+
+## Use RecordIO Files in Distributed Training
+
+1. Generate multiple RecordIO files.
+
+For a distributed training job, you may have multiple trainer nodes and one or more RecordIO
+files for each trainer node. You can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data into
+multiple RecordIO files, as shown below:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+                name='image', shape=[784]),
+            fluid.layers.data(
+                name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+        filename_suffix='./mnist.recordio', batch_per_file=100,
+        reader_creator=reader, feeder=feeder)
+```
+
+The above code generates multiple RecordIO files on your host like:
+
+```bash
+.
+ |-mnist.recordio-00000
+ |-mnist.recordio-00001
+ |-mnist.recordio-00002
+ |-mnist.recordio-00003
+ |-mnist.recordio-00004
+```
+
+2. Read these RecordIO files with `fluid.layers.io.open_recordio_files`.
+
+For a distributed training job, the distributed operator system will schedule trainer
+processes on multiple nodes, and each trainer process reads a part of the whole training
+data. We usually take the following approach to make the data allocated to each trainer
+process as uniform as possible (see the worked example below):
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+    file_list = glob.glob(file_pattern)
+    ret_list = []
+    for idx, f in enumerate(file_list):
+        if (idx + trainers) % trainers == trainer_id:
+            ret_list.append(f)
+    return ret_list
+
+trainers = int(os.getenv("TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+data_file = fluid.layers.io.open_recordio_files(
+    filenames=gen_train_list("./mnist.recordio*", trainers, trainer_id),
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+data_file = fluid.layers.io.batch(data_file, batch_size=4)
+```
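+
+As a worked example of the modulo rule above, assume the five files generated earlier and
+two trainers, and assume `glob.glob` returns file names in sorted order (in practice you may
+want to sort the list explicitly). `(idx + trainers) % trainers == trainer_id` then reduces
+to `idx % trainers == trainer_id`, so every file is read by exactly one trainer:
+
+```python
+for tid in range(2):
+    print(gen_train_list("./mnist.recordio*", 2, tid))
+# ['./mnist.recordio-00000', './mnist.recordio-00002', './mnist.recordio-00004']
+# ['./mnist.recordio-00001', './mnist.recordio-00003']
+```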
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 815f739371..335c067dd7 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -189,9 +189,11 @@ void ThreadedSSAGraphExecutor::RunOp(
     BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      VLOG(10) << "PE start "
+               << " " << op->Name() << " : " << op->DebugString();
       op->Run(strategy_.use_event_);
-      VLOG(10) << op << " " << op->Name() << " Done ";
+      VLOG(10) << "PE end "
+               << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index 282ec3f36b..6b6d447026 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -65,20 +65,22 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
                       static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
-    std::string filename = Attr<std::string>("filename");
+    auto filenames = Attr<std::vector<std::string>>("filenames");
 
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-
-    out->Reset(new RecordIOFileReader<true>(
-        filename, RestoreShapes(shape_concat, ranks)));
+    for (auto& fn : filenames) {
+      out->Reset(
+          new RecordIOFileReader<true>(fn, RestoreShapes(shape_concat, ranks)));
+    }
   }
 };
 
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
  protected:
   void Apply() override {
-    AddAttr<std::string>("filename", "The filename of record io reader");
+    AddAttr<std::vector<std::string>>("filenames",
+                                      "The filenames of record io reader");
     AddComment(R"DOC(
       CreateRecordIOReader Operator
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 8758ac9f94..b9d5582730 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -21,7 +21,7 @@ from ..layer_helper import LayerHelper
 from ..executor import global_scope
 
 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
+    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_files',
    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
    'random_data_generator', 'Preprocessor'
 ]
@@ -291,12 +291,12 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
-def open_recordio_file(filename,
-                       shapes,
-                       lod_levels,
-                       dtypes,
-                       pass_num=1,
-                       for_parallel=True):
+def open_recordio_files(filenames,
+                        shapes,
+                        lod_levels,
+                        dtypes,
+                        pass_num=1,
+                        for_parallel=True):
     """
     Open a RecordIO file
@@ -304,7 +304,7 @@ def open_recordio_file(filename,
     Via the Reader Variable, we can get data from the given RecordIO file.
 
     Args:
-        filename(str): The RecordIO file's name.
+        filenames(str|list(str)): The name, or a list of names, of the RecordIO file(s).
         shapes(list): List of tuples which declaring data shapes.
         lod_levels(list): List of ints which declaring data lod_level.
         dtypes(list): List of strs which declaring data type.
@@ -336,6 +336,8 @@ def open_recordio_file(filename,
         ranks.append(len(shape))
 
     var_name = unique_name('open_recordio_file')
+    if isinstance(filenames, str):
+        filenames = [filenames]
 
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=var_name)
@@ -345,7 +347,7 @@
         attrs={
             'shape_concat': shape_concat,
             'lod_levels': lod_levels,
-            'filename': filename,
+            'filenames': filenames,
             'ranks': ranks
         })
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 5accaacd53..7c966cba74 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -14,7 +14,7 @@
 
 import core
 import contextlib
-
+from ..batch import batch
 __all__ = ['convert_reader_to_recordio_file']
@@ -46,3 +46,36 @@ def convert_reader_to_recordio_file(
         writer.complete_append_tensor()
         counter += 1
     return counter
+
+
+import paddle
+
+
+def convert_reader_to_recordio_files(
+        filename_suffix,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d" % (filename_suffix, f_idx)
+            with create_recordio_writer(filename, compressor,
+                                        max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+            lines = []
+            f_idx += 1
+    return counter
diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc
index 1ce612ca2318ccb9b9f28d51cb93ce8e5e1d0680..07e875aec6c9bae8002bde4223348c6a29647b03 100644
GIT binary patch
delta 939
[base64 payload of the binary delta omitted: an accidentally committed .pyc file, deleted again in PATCH 13]

delta 783
[base64 payload of the binary delta omitted]

From: Yancey1989
Date: Thu, 31 May 2018 12:21:13 +0800
Subject: [PATCH 12/68] clean up code
---
 python/paddle/fluid/recordio_writer.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 7c966cba74..9557f91bb3 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -14,7 +14,6 @@
 
 import core
 import contextlib
-from ..batch import batch
 __all__ = ['convert_reader_to_recordio_file']
@@ -48,9 +47,6 @@ def convert_reader_to_recordio_file(
         writer.complete_append_tensor()
         counter += 1
     return counter
 
 
-import paddle
-
-
         filename_suffix,
         batch_per_file,

From a6a7b6f180ec7fe08a3bd41902e88f3fd5caa17a Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Thu, 31 May 2018 12:23:57 +0800
Subject: [PATCH 13/68] code cleanup

---
 .../details/threaded_ssa_graph_executor.cc    |   6 ++----
 tools/codestyle/docstring_checker.pyc         | Bin 11769 -> 0 bytes
 2 files changed, 2 insertions(+), 4 deletions(-)
 delete mode 100644 tools/codestyle/docstring_checker.pyc

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 335c067dd7..815f739371 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -189,11 +189,9 @@ void ThreadedSSAGraphExecutor::RunOp(
     BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      VLOG(10) << "PE start "
-               << " " << op->Name() << " : " << op->DebugString();
+      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
       op->Run(strategy_.use_event_);
-      VLOG(10) << "PE end "
-               << " " << op->Name() << " Done ";
+      VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc
deleted file mode 100644
index 07e875aec6c9bae8002bde4223348c6a29647b03..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11769
[binary .pyc payload omitted]

From 75ea577fd31a7ead1cbec6aa6b21338040ea585d Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 31 May 2018 16:01:16 +0800
Subject: [PATCH 14/68] allow profiler and timeline to work when dev_ctx is
 nullptr.

Sometimes dev_ctx is not available when RecordEvent.

---
 paddle/fluid/platform/profiler.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 3d8d64e4c2..01de9d7041 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -127,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const {
 
 double Event::CudaElapsedMs(const Event& e) const {
 #ifdef PADDLE_WITH_CUDA
+  if (!has_cuda_) return 0.0;
   PADDLE_ENFORCE(e.has_cuda() && has_cuda());
   PADDLE_ENFORCE(e.device() == device());
   PADDLE_ENFORCE(cudaEventSynchronize(event_));

From 4d11c8e9c64f65b6701edb1ba44cefdff0423acb Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Thu, 31 May 2018 15:11:46 +0800
Subject: [PATCH 15/68] retest single thread

---
 .../tests/book/test_inference_nlp.cc          | 224 +++++++++++-------
 1 file changed, 143 insertions(+), 81 deletions(-)

diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 95cdeb4ad1..e216e9dbe6 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -30,16 +30,19 @@ DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_bool(prepare_context, true, "Prepare Context before executor");
 
+DEFINE_int32(num_threads, 1, "Number of threads to be used");
+
 inline double get_current_ms() {
   struct timeval time;
   gettimeofday(&time, NULL);
   return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
 }
 
-void read_data(
-    std::vector<std::vector<int64_t>>* out,
-    const std::string& filename = "/home/tangjian/paddle-tj/out.ids.txt") {
+// return size of total words
+size_t read_datasets(std::vector<paddle::framework::LoDTensor>* out,
+                     const std::string& filename) {
   using namespace std;  // NOLINT
+  size_t sz 
= 0; fstream fin(filename); string line; out->clear(); @@ -50,94 +53,153 @@ void read_data( while (getline(iss, field, ' ')) { ids.push_back(stoi(field)); } - out->push_back(ids); + if (ids.size() >= 1024 || out->size() >= 100) { + continue; + } + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, ids.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(ids.size()), 1}, paddle::platform::CPUPlace()); + memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t)); + out->emplace_back(words); + sz += ids.size(); } + return sz; +} + +void test_multi_threads() { + /* + size_t jobs_per_thread = std::min(inputdatas.size() / FLAGS_num_threads, + inputdatas.size()); + std::vector workers(FLAGS_num_threads, jobs_per_thread); + workers[FLAGS_num_threads - 1] += inputdatas.size() % FLAGS_num_threads; + + std::vector> infer_threads; + + for (size_t i = 0; i < workers.size(); ++i) { + infer_threads.emplace_back(new std::thread([&, i]() { + size_t start = i * jobs_per_thread; + for (size_t j = start; j < start + workers[i]; ++j ) { + // 0. Call `paddle::framework::InitDevices()` initialize all the + devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + paddle::framework::LoDTensor words; + auto& srcdata = inputdatas[j]; + paddle::framework::LoD lod{{0, srcdata.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(srcdata.size()), 1}, + paddle::platform::CPUPlace()); + memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); + + LOG(INFO) << "thread id: " << i << ", words size:" << words.numel(); + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + if (FLAGS_prepare_vars) { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } else { + if (FLAGS_prepare_context) { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } else { + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, + FLAGS_use_mkldnn); + } + } + //LOG(INFO) << output1.lod(); + //LOG(INFO) << output1.dims(); + } + })); + } + auto start_ms = get_current_ms(); + for (int i = 0; i < FLAGS_num_threads; ++i) { + infer_threads[i]->join(); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "total: " << stop_ms - start_ms << " ms";*/ } -TEST(inference, understand_sentiment) { +TEST(inference, nlp) { if (FLAGS_dirname.empty()) { LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - std::vector> inputdatas; - read_data(&inputdatas); - LOG(INFO) << "---------- dataset size: " << inputdatas.size(); LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; std::string dirname = FLAGS_dirname; + std::vector datasets; + size_t num_total_words = + read_datasets(&datasets, "/home/tangjian/paddle-tj/out.ids.txt"); + LOG(INFO) << "Number of dataset samples(seq len<1024): " << datasets.size(); + LOG(INFO) << "Total number of words: " << num_total_words; + const bool model_combined = false; - int total_work = 10; - int num_threads = 2; - int work_per_thread = total_work / num_threads; - std::vector> infer_threads; - for (int i = 0; i < num_threads; ++i) { - infer_threads.emplace_back(new std::thread([&, i]() { - for 
(int j = 0; j < work_per_thread; ++j) { - // 0. Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - paddle::framework::LoDTensor words; - /* - paddle::framework::LoD lod{{0, 83}}; - int64_t word_dict_len = 198392; - SetupLoDTensor(&words, lod, static_cast(0), - static_cast(word_dict_len - 1)); - */ - std::vector srcdata{ - 784, 784, 1550, 6463, 56, 75693, 6189, 784, 784, - 1550, 198391, 6463, 42468, 4376, 10251, 10760, 6189, 297, - 396, 6463, 6463, 1550, 198391, 6463, 22564, 1612, 291, - 68, 164, 784, 784, 1550, 198391, 6463, 13659, 3362, - 42468, 6189, 2209, 198391, 6463, 2209, 2209, 198391, 6463, - 2209, 1062, 3029, 1831, 3029, 1065, 2281, 100, 11216, - 1110, 56, 10869, 9811, 100, 198391, 6463, 100, 9280, - 100, 288, 40031, 1680, 1335, 100, 1550, 9280, 7265, - 244, 1550, 198391, 6463, 1550, 198391, 6463, 42468, 4376, - 10251, 10760}; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = words.mutable_data( - {static_cast(srcdata.size()), 1}, - paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - - LOG(INFO) << "number of input size:" << words.numel(); - std::vector cpu_feeds; - cpu_feeds.push_back(&words); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - if (FLAGS_prepare_vars) { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } else { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } - LOG(INFO) << output1.lod(); - LOG(INFO) << output1.dims(); - } - })); + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // 1. Define place, executor, scope + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. 
Initialize the inference_program and load parameters + std::unique_ptr inference_program; + inference_program = InitProgram(&executor, scope, dirname, model_combined); + if (FLAGS_use_mkldnn) { + EnableMKLDNN(inference_program); } - auto start_ms = get_current_ms(); - for (int i = 0; i < num_threads; ++i) { - infer_threads[i]->join(); + + if (FLAGS_num_threads > 1) { + test_multi_threads(); + } else { + if (FLAGS_prepare_vars) { + executor.CreateVariables(*inference_program, scope, 0); + } + // always prepare context and burning first time + std::unique_ptr ctx; + ctx = executor.Prepare(*inference_program, 0); + + // preapre fetch + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + // prepare feed + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + std::map feed_targets; + + // for data and run + auto start_ms = get_current_ms(); + for (size_t i = 0; i < datasets.size(); ++i) { + feed_targets[feed_target_names[0]] = &(datasets[i]); + executor.RunPreparedContext(ctx.get(), scope, &feed_targets, + &fetch_targets, !FLAGS_prepare_vars); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 + << " min, avg time per seq: " + << (stop_ms - start_ms) / datasets.size() << " ms"; } - auto stop_ms = get_current_ms(); - LOG(INFO) << "total: " << stop_ms - start_ms << " ms"; + delete scope; } From d13dd3b6a7ee81d4c106035ec0bad2c581ea795c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 16:04:47 +0800 Subject: [PATCH 16/68] revert profiling --- paddle/fluid/inference/tests/test_helper.h | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index dd3a7a584a..1f5551567c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -156,10 +156,27 @@ void TestInference(const std::string& dirname, auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kAll; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; // Enable the profiler + paddle::platform::EnableProfiler(state); { paddle::platform::RecordEvent record_event( "init_program", @@ -172,6 +189,10 @@ void TestInference(const std::string& dirname, EnableMKLDNN(inference_program); } } + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "load_program_profiler"); + paddle::platform::ResetProfiler(); // 3. 
Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = @@ -212,6 +233,9 @@ void TestInference(const std::string& dirname, true, CreateVars); } + // Enable the profiler + paddle::platform::EnableProfiler(state); + // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { paddle::platform::RecordEvent record_event( @@ -228,6 +252,11 @@ void TestInference(const std::string& dirname, CreateVars); } } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); + paddle::platform::ResetProfiler(); } delete scope; From 708bec2e56c6a856f628ad8b650b0bf04a3df975 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 16:33:54 +0800 Subject: [PATCH 17/68] add test --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index e216e9dbe6..990d45964e 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -200,6 +200,14 @@ TEST(inference, nlp) { LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 << " min, avg time per seq: " << (stop_ms - start_ms) / datasets.size() << " ms"; + { // just for test + auto* scope = new paddle::framework::Scope(); + paddle::framework::LoDTensor outtensor; + TestInference( + dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, + false); + delete scope; + } } delete scope; } From e90bfd562b4738ead125a126124bbd8d526cb7a6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 31 May 2018 18:07:06 +0800 Subject: [PATCH 18/68] 1. Make a base unittest class for dist transpiler unittest 2. Merge the develop repo --- .../tests/unittests/test_dist_transpiler.py | 58 +-------------- .../unittests/test_simple_dist_transpiler.py | 47 +----------- .../fluid/tests/unittests/transpiler_test.py | 73 +++++++++++++++++++ .../fluid/transpiler/distribute_transpiler.py | 15 ++-- 4 files changed, 87 insertions(+), 106 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/transpiler_test.py diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index fa49bd41a5..a24f2aeecd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,40 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest - -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import numpy + +from transpiler_test import TranspilerTest -class TestDistTranspiler(unittest.TestCase): +class TestDistTranspiler(TranspilerTest): def setUp(self): - self.trainer_id = 0 - self.trainers = 2 - self.pservers = 2 - self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" self.current_pserver_ep = "127.0.0.1:6174" - def net_conf(self): - x = fluid.layers.data(name='x', shape=[1000], dtype='float32') - - y_predict = fluid.layers.fc(input=x, - size=1000, - act=None, - param_attr=fluid.ParamAttr(name='fc_w')) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) - - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - return optimize_ops, params_grads - def test_transpiler(self): trainer = self.get_trainer() pserver, startup = self.get_pserver(self.current_pserver_ep) @@ -70,14 +45,6 @@ class TestDistTranspiler(unittest.TestCase): fc_w_var = startup.global_block().var("fc_w.block1") self.assertEqual(fc_w_var.shape, (500, 1000)) - def get_main_program(self): - main = fluid.Program() - - with fluid.program_guard(main): - self.net_conf() - - return main - def get_expect_trainer_ops(self): trainer = fluid.Program() @@ -92,25 +59,6 @@ class TestDistTranspiler(unittest.TestCase): ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") return ops - def get_trainer(self): - return self._transpiler_instance().get_trainer_program() - - def get_pserver(self, ep): - t = self._transpiler_instance() - pserver = t.get_pserver_program(ep) - startup = t.get_startup_program(ep, pserver) - return pserver, startup - - def _transpiler_instance(self): - main = self.get_main_program() - t = fluid.DistributeTranspiler() - t.transpile( - self.trainer_id, - program=main, - pservers=self.pserver_eps, - trainers=self.trainers) - return t - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py index 60f99f412c..25d79b51f9 100644 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -12,40 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest +import numpy as np import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import numpy as np +from transpiler_test import TranspilerTest -class TestSimpleDistTranspiler(unittest.TestCase): + +class TestSimpleDistTranspiler(TranspilerTest): def setUp(self): - self.trainer_id = 0 - self.trainers = 2 - self.pservers = 2 - self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" self.current_pserver_ep = "127.0.0.1:6175" - def net_conf(self): - x = fluid.layers.data(name='x', shape=[1000], dtype='float32') - - y_predict = fluid.layers.fc(input=x, - size=1000, - act=None, - param_attr=fluid.ParamAttr(name='fc_w')) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) - - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - return optimize_ops, params_grads - def test_simple_transpiler(self): np.random.seed(1) @@ -73,14 +51,6 @@ class TestSimpleDistTranspiler(unittest.TestCase): fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0") self.assertEqual(fc_w_var.shape, (1000, 1000)) - def get_main_program(self): - main = fluid.Program() - - with fluid.program_guard(main): - self.net_conf() - - return main - def get_expect_trainer_ops(self): trainer = fluid.Program() @@ -94,15 +64,6 @@ class TestSimpleDistTranspiler(unittest.TestCase): ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") return ops - def get_trainer(self): - return self._transpiler_instance().get_trainer_program() - - def get_pserver(self, ep): - t = self._transpiler_instance() - pserver = t.get_pserver_program(ep) - startup = t.get_startup_program(ep, pserver) - return pserver, startup - def _transpiler_instance(self): main = self.get_main_program() t = fluid.DistributeTranspiler() diff --git a/python/paddle/fluid/tests/unittests/transpiler_test.py b/python/paddle/fluid/tests/unittests/transpiler_test.py new file mode 100644 index 0000000000..d84c5d9c41 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/transpiler_test.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.layers as layers + + +class TranspilerTest(unittest.TestCase): + @classmethod + def setUpClass(self): + self.trainer_id = 0 + self.trainers = 2 + self.pservers = 2 + self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" + + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w')) + + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + return optimize_ops, params_grads + + def get_main_program(self): + main = fluid.Program() + + with fluid.program_guard(main): + self.net_conf() + + return main + + def get_trainer(self): + return self._transpiler_instance().get_trainer_program() + + def get_pserver(self, ep): + t = self._transpiler_instance() + pserver = t.get_pserver_program(ep) + startup = t.get_startup_program(ep, pserver) + return pserver, startup + + def _transpiler_instance(self): + main = self.get_main_program() + t = fluid.DistributeTranspiler() + t.transpile( + self.trainer_id, + program=main, + pservers=self.pserver_eps, + trainers=self.trainers) + return t diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 04d47fb82e..a116671c1b 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -178,7 +178,7 @@ class DistributeTranspiler: for index in range(len(self.pserver_endpoints)) ] - def _init_splited_vars(self, split_method): + def _init_splited_vars(self, split_method, align_var_to_block=True): # update these mappings for further transpile: # 1. param_var_mapping: param var name -> [splited params vars] # 2. grad_var_mapping: grad var name -> [splited grads vars] @@ -198,15 +198,14 @@ class DistributeTranspiler: self.params_grads) if align_var_to_block: - grad_blocks = split_dense_variable(grad_list, - len(pserver_endpoints)) - param_blocks = split_dense_variable(param_list, - len(pserver_endpoints)) + grad_blocks = split_variable(grad_list, len(self.pserver_endpoints)) + param_blocks = split_variable(param_list, + len(self.pserver_endpoints)) else: # when we do NOT align var to block, we will always split params # grads into one block. - grad_blocks = split_dense_variable(grad_list, 1) - param_blocks = split_dense_variable(param_list, 1) + grad_blocks = split_variable(grad_list, 1) + param_blocks = split_variable(param_list, 1) assert (len(grad_blocks) == len(param_blocks)) # origin_varname -> [splited_var] self.param_var_mapping = self._create_vars_from_blocklist( @@ -272,7 +271,7 @@ class DistributeTranspiler: self.has_distributed_lookup_table = self._has_distributed_lookup_table() # split and create vars, then put splited vars in dicts for later use. 
- self._init_splited_vars(split_method) + self._init_splited_vars(split_method, align_var_to_block) # step 3.1: insert send op to send gradient vars to parameter servers ps_dispatcher.reset() From 733718c3e724fdd84355010e76ddd17e5b60ef2c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 31 May 2018 19:12:23 +0800 Subject: [PATCH 19/68] remove the ugly test --- .../inference/tests/book/test_inference_nlp.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 990d45964e..5241661fb3 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -53,7 +53,7 @@ size_t read_datasets(std::vector* out, while (getline(iss, field, ' ')) { ids.push_back(stoi(field)); } - if (ids.size() >= 1024 || out->size() >= 100) { + if (ids.size() >= 1024 ) { continue; } @@ -200,14 +200,14 @@ TEST(inference, nlp) { LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 << " min, avg time per seq: " << (stop_ms - start_ms) / datasets.size() << " ms"; - { // just for test - auto* scope = new paddle::framework::Scope(); - paddle::framework::LoDTensor outtensor; - TestInference( - dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, - false); - delete scope; - } +// { // just for test +// auto* scope = new paddle::framework::Scope(); +// paddle::framework::LoDTensor outtensor; +// TestInference( +// dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, +// false); +// delete scope; +// } } delete scope; } From 9d92dcea0a41093ad782561a0ade4095afbefcfa Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 31 May 2018 20:34:23 +0800 Subject: [PATCH 20/68] 1. import fluid to dist transpiler test --- python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index a24f2aeecd..32647f9aa8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle.fluid as fluid
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
 
 from transpiler_test import TranspilerTest

From f9556dca51484c270284c181337fc041964f2db0 Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Fri, 1 Jun 2018 10:59:42 +0800
Subject: [PATCH 21/68] use open_files reader to read multiple files

---
 .../howto/cluster/fluid_recordio.md}          |  26 ++++++++++--------
 .../reader/create_recordio_file_reader_op.cc  |  12 ++++----
 python/paddle/fluid/layers/io.py              |  20 ++++++--------
 python/paddle/fluid/recordio_writer.py        |  12 ++++++--
 tools/codestyle/docstring_checker.pyc         | Bin 0 -> 11769 bytes
 5 files changed, 37 insertions(+), 33 deletions(-)
 rename doc/{v2/howto/recordio/README.md => fluid/howto/cluster/fluid_recordio.md} (88%)
 create mode 100644 tools/codestyle/docstring_checker.pyc

diff --git a/doc/v2/howto/recordio/README.md b/doc/fluid/howto/cluster/fluid_recordio.md
similarity index 88%
rename from doc/v2/howto/recordio/README.md
rename to doc/fluid/howto/cluster/fluid_recordio.md
index 3f81d54b8e..0e8b98542d 100644
--- a/doc/v2/howto/recordio/README.md
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -89,14 +89,14 @@
 The above codes would generate multiple RecordIO files on your host like:
 
 ```bash
 .
- \_mnist.recordio-00000
- |-mnist.recordio-00001
- |-mnist.recordio-00002
- |-mnist.recordio-00003
- |-mnist.recordio-00004
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
 ```
 
-1. read these RecordIO files with `fluid.layers.io.open_recordio_file`
+1. open multiple RecordIO files with `fluid.layers.io.open_files`
 
 For a distributed training job, the distributed operator system will schedule trainer processes on multiple nodes, and each trainer process reads a part of the whole training data. We usually take the following approach to make the training data evenly distributed across the trainer processes:
 
 ```python
 def gen_train_list(file_pattern, trainers, trainer_id):
     file_list = glob.glob(file_pattern)
     ret_list = []
     for idx, f in enumerate(file_list):
         if idx % trainers == trainer_id:
             ret_list.append(f)
     return ret_list
 
 trainers = int(os.getenv("TRAINERS"))
 trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-data_file = fluid.layers.io.open_recordio_file(
-    filename=gen_train_list("./mnist.recordio*", trainers, trainer_id),
-    shapes=[(-1, 784),(-1, 1)],
-    lod_levels=[0, 0],
-    dtypes=["float32", "int32"])
-data_file = fluid.layers.io.batch(data_file, batch_size=4)
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", trainers, trainer_id),
+    thread_num=1,
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_file)
+...
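+
+# A minimal sketch of how the reader could be driven (illustrative only,
+# assuming the standard fluid Executor flow on CPU; not part of the
+# original example):
+exe = fluid.Executor(fluid.CPUPlace())
+exe.run(fluid.default_startup_program())
+img_v, label_v = exe.run(fluid.default_main_program(),
+                         fetch_list=[img, label])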
``` diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 6b6d447026..282ec3f36b 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -65,22 +65,20 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { static_cast(shape_concat.size()), "The accumulate of all ranks should be equal to the " "shape concat's length."); - auto filenames = Attr>("filenames"); + std::string filename = Attr("filename"); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - for (auto& fn : filenames) { - out->Reset( - new RecordIOFileReader(fn, RestoreShapes(shape_concat, ranks))); - } + + out->Reset(new RecordIOFileReader( + filename, RestoreShapes(shape_concat, ranks))); } }; class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { protected: void Apply() override { - AddAttr>("filenames", - "The filenames of record io reader"); + AddAttr("filename", "The filename of record io reader"); AddComment(R"DOC( CreateRecordIOReader Operator diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index b9d5582730..8758ac9f94 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -21,7 +21,7 @@ from ..layer_helper import LayerHelper from ..executor import global_scope __all__ = [ - 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_files', + 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', 'random_data_generator', 'Preprocessor' ] @@ -291,12 +291,12 @@ def _copy_reader_create_op_(block, op): return new_op -def open_recordio_files(filenames, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): +def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): """ Open a RecordIO file @@ -304,7 +304,7 @@ def open_recordio_files(filenames, Via the Reader Variable, we can get data from the given RecordIO file. Args: - filename(str) or list(str): The RecordIO file's name. + filename(str): The RecordIO file's name. shapes(list): List of tuples which declaring data shapes. lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. @@ -336,8 +336,6 @@ def open_recordio_files(filenames, ranks.append(len(shape)) var_name = unique_name('open_recordio_file') - if isinstance(filenames, str): - filenames = [filenames] startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=var_name) @@ -347,7 +345,7 @@ def open_recordio_files(filenames, attrs={ 'shape_concat': shape_concat, 'lod_levels': lod_levels, - 'filenames': filenames, + 'filename': filename, 'ranks': ranks }) diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 9557f91bb3..8d48e9abef 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os
 import core
 import contextlib
-__all__ = ['convert_reader_to_recordio_file']
+__all__ = [
+    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
+]
 
 
 @contextlib.contextmanager
@@ -48,7 +51,7 @@ def convert_reader_to_recordio_file(
 
 
 def convert_reader_to_recordio_files(
-        filename_suffix,
+        filename,
         batch_per_file,
         reader_creator,
         feeder,
@@ -57,13 +60,16 @@ def convert_reader_to_recordio_files(
         feed_order=None):
     if feed_order is None:
         feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
     lines = []
     f_idx = 0
     counter = 0
     for idx, batch in enumerate(reader_creator()):
         lines.append(batch)
         if idx >= batch_per_file and idx % batch_per_file == 0:
-            filename = "%s-%05d" % (filename_suffix, f_idx)
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
             with create_recordio_writer(filename, compressor,
                                         max_num_records) as writer:
                 for l in lines:
diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07e875aec6c9bae8002bde4223348c6a29647b03
GIT binary patch
literal 11769
[binary .pyc payload omitted]

literal 0
HcmV?d00001

From b33ea7be2d68e64c4a3844b7fbfa01f2697a7839 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Fri, 1 Jun 2018 13:35:20 +0800
Subject: [PATCH 22/68] 1. change the variable name from align_var_to_block to
 slice_var_up 2. replace split_method with slice_var_up in func
 init_splited_variables

---
 .../unittests/test_simple_dist_transpiler.py  |  2 +-
 .../fluid/transpiler/distribute_transpiler.py | 35 ++++++++++---------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
index 25d79b51f9..5ae2844e29 100644
--- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
@@ -72,7 +72,7 @@ class TestSimpleDistTranspiler(TranspilerTest):
             program=main,
             pservers=self.pserver_eps,
             trainers=self.trainers,
-            align_var_to_block=False)
+            slice_var_up=False)
         return t
 
 
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index a116671c1b..da001add8e 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -71,7 +71,7 @@ def same_or_split_var(p_name, var_name):
     return p_name == var_name or p_name.startswith(var_name + ".block")
 
 
-def split_variable(var_list, service_count, min_block_size=8192):
+def slice_variable(var_list, slice_count, min_block_size=8192):
     """
     We may need to split dense tensor to one or more blocks and put
     them equally onto parameter server. One block is a sub-tensor
@@ -83,8 +83,8 @@ def split_variable(var_list, service_count, min_block_size=8192):
 
     Args:
         var_list (list): List of variables.
-        service_count (int): Numel of pserver services. A pserver may have two
-        or more listening ports.
+        slice_count (int): Number of slices that the variables will be sliced
+        into, which could be the number of pserver services.
         min_block_size (int): Minimum split block size.
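 
         For example, with slice_count = 10 and min_block_size = 8192, a
         variable of shape (1000, 1000) (numel = 1000000) is sliced into
         10 blocks of 100000 elements each; a variable with fewer than
         slice_count * min_block_size elements is sliced into fewer,
         larger blocks of at least min_block_size elements.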
Returns: blocks (list[(varname, block_id, current_block_size)]): A list @@ -92,12 +92,12 @@ def split_variable(var_list, service_count, min_block_size=8192): """ blocks = [] for var in var_list: - split_count = service_count + split_count = slice_count var_numel = reduce(lambda x, y: x * y, var.shape) max_pserver_count = int(math.floor(var_numel / float(min_block_size))) if max_pserver_count == 0: max_pserver_count = 1 - if max_pserver_count < service_count: + if max_pserver_count < slice_count: split_count = max_pserver_count block_size = int(math.ceil(var_numel / float(split_count))) @@ -178,7 +178,7 @@ class DistributeTranspiler: for index in range(len(self.pserver_endpoints)) ] - def _init_splited_vars(self, split_method, align_var_to_block=True): + def _init_splited_vars(self, slice_var_up): # update these mappings for further transpile: # 1. param_var_mapping: param var name -> [splited params vars] # 2. grad_var_mapping: grad var name -> [splited grads vars] @@ -197,16 +197,19 @@ class DistributeTranspiler: self._update_dist_lookup_table_vars(param_list, grad_list, self.params_grads) - if align_var_to_block: - grad_blocks = split_variable(grad_list, len(self.pserver_endpoints)) - param_blocks = split_variable(param_list, + if slice_var_up: + # when we slice var up into blocks, we will slice the var according to + # pserver services' count. A pserver may have two or more listening ports. + grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + param_blocks = slice_variable(param_list, len(self.pserver_endpoints)) else: - # when we do NOT align var to block, we will always split params + # when we do NOT slice var up into blocks, we will always slice params # grads into one block. - grad_blocks = split_variable(grad_list, 1) - param_blocks = split_variable(param_list, 1) + grad_blocks = slice_variable(grad_list, 1) + param_blocks = slice_variable(param_list, 1) assert (len(grad_blocks) == len(param_blocks)) + # origin_varname -> [splited_var] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) @@ -237,7 +240,7 @@ class DistributeTranspiler: program=None, pservers="127.0.0.1:6174", trainers=1, - align_var_to_block=True, + slice_var_up=True, split_method=RoundRobin, sync_mode=True): """ @@ -271,7 +274,7 @@ class DistributeTranspiler: self.has_distributed_lookup_table = self._has_distributed_lookup_table() # split and create vars, then put splited vars in dicts for later use. 
- self._init_splited_vars(split_method, align_var_to_block) + self._init_splited_vars(slice_var_up) # step 3.1: insert send op to send gradient vars to parameter servers ps_dispatcher.reset() @@ -283,13 +286,13 @@ class DistributeTranspiler: # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 # shuffle the map will avoid the uneven distribution above grad_var_mapping_items = self.grad_var_mapping.items() - if not align_var_to_block: + if not slice_var_up: np.random.shuffle(grad_var_mapping_items) for orig_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) - if not align_var_to_block: + if not slice_var_up: assert (len(splited_vars) == 1) if len(splited_vars) == 1: From 5387562576de020a35f864a07f14802b68ee398d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 14:07:41 +0800 Subject: [PATCH 23/68] add multi-thread test --- .../tests/book/test_inference_nlp.cc | 157 ++++++++---------- 1 file changed, 72 insertions(+), 85 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5241661fb3..4e92d6a17b 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -15,11 +15,7 @@ limitations under the License. */ #include #include #include -#include -#include -#include #include // NOLINT -#include #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -41,19 +37,18 @@ inline double get_current_ms() { // return size of total words size_t read_datasets(std::vector* out, const std::string& filename) { - using namespace std; // NOLINT size_t sz = 0; - fstream fin(filename); - string line; + std::fstream fin(filename); + std::string line; out->clear(); while (getline(fin, line)) { - istringstream iss(line); - vector ids; - string field; + std::istringstream iss(line); + std::vector ids; + std::string field; while (getline(iss, field, ' ')) { ids.push_back(stoi(field)); } - if (ids.size() >= 1024 ) { + if (ids.size() >= 1024) { continue; } @@ -69,72 +64,61 @@ size_t read_datasets(std::vector* out, return sz; } -void test_multi_threads() { - /* - size_t jobs_per_thread = std::min(inputdatas.size() / FLAGS_num_threads, - inputdatas.size()); - std::vector workers(FLAGS_num_threads, jobs_per_thread); - workers[FLAGS_num_threads - 1] += inputdatas.size() % FLAGS_num_threads; - - std::vector> infer_threads; - - for (size_t i = 0; i < workers.size(); ++i) { - infer_threads.emplace_back(new std::thread([&, i]() { - size_t start = i * jobs_per_thread; - for (size_t j = start; j < start + workers[i]; ++j ) { - // 0. 
Call `paddle::framework::InitDevices()` initialize all the - devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - paddle::framework::LoDTensor words; - auto& srcdata = inputdatas[j]; - paddle::framework::LoD lod{{0, srcdata.size()}}; - words.set_lod(lod); - int64_t* pdata = words.mutable_data( - {static_cast(srcdata.size()), 1}, - paddle::platform::CPUPlace()); - memcpy(pdata, srcdata.data(), words.numel() * sizeof(int64_t)); - - LOG(INFO) << "thread id: " << i << ", words size:" << words.numel(); - std::vector cpu_feeds; - cpu_feeds.push_back(&words); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - if (FLAGS_prepare_vars) { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } else { - if (FLAGS_prepare_context) { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } else { - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, model_combined, - FLAGS_use_mkldnn); - } - } - //LOG(INFO) << output1.lod(); - //LOG(INFO) << output1.dims(); - } - })); - } - auto start_ms = get_current_ms(); - for (int i = 0; i < FLAGS_num_threads; ++i) { - infer_threads[i]->join(); +void ThreadRunInfer( + const int tid, paddle::framework::Executor* executor, + paddle::framework::Scope* scope, + const std::unique_ptr& inference_program, + const std::vector>& jobs) { + auto copy_program = std::unique_ptr( + new paddle::framework::ProgramDesc(*inference_program)); + std::string feed_holder_name = "feed_" + paddle::string::to_string(tid); + std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid); + copy_program->SetFeedHolderName(feed_holder_name); + copy_program->SetFetchHolderName(fetch_holder_name); + + // 3. 
Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + copy_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + copy_program->GetFetchTargetNames(); + + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + std::map feed_targets; + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + + auto& inputs = jobs[tid]; + auto start_ms = get_current_ms(); + for (size_t i = 0; i < inputs.size(); ++i) { + feed_targets[feed_target_names[0]] = inputs[i]; + executor->Run(*copy_program, scope, &feed_targets, &fetch_targets, true, + true, feed_holder_name, fetch_holder_name); + } + auto stop_ms = get_current_ms(); + LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() + << " samples, avg time per sample: " + + << (stop_ms - start_ms) / inputs.size() << " ms"; +} + +void bcast_datasets( + const std::vector& datasets, + std::vector>* jobs, + const int num_threads) { + size_t s = 0; + jobs->resize(num_threads); + while (s < datasets.size()) { + for (auto it = jobs->begin(); it != jobs->end(); it++) { + it->emplace_back(&datasets[s]); + s++; + if (s >= datasets.size()) { + break; + } } - auto stop_ms = get_current_ms(); - LOG(INFO) << "total: " << stop_ms - start_ms << " ms";*/ + } } TEST(inference, nlp) { @@ -166,7 +150,18 @@ TEST(inference, nlp) { } if (FLAGS_num_threads > 1) { - test_multi_threads(); + std::vector> jobs; + bcast_datasets(datasets, &jobs, FLAGS_num_threads); + std::vector> threads; + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads.emplace_back(new std::thread(ThreadRunInfer, i, &executor, scope, + std::ref(inference_program), + std::ref(jobs))); + } + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads[i]->join(); + } + } else { if (FLAGS_prepare_vars) { executor.CreateVariables(*inference_program, scope, 0); @@ -200,14 +195,6 @@ TEST(inference, nlp) { LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 << " min, avg time per seq: " << (stop_ms - start_ms) / datasets.size() << " ms"; -// { // just for test -// auto* scope = new paddle::framework::Scope(); -// paddle::framework::LoDTensor outtensor; -// TestInference( -// dirname, {&(datasets[0])}, {&outtensor}, FLAGS_repeat, model_combined, -// false); -// delete scope; -// } } delete scope; } From a4822ed897cebe6a27bd61d82c5a1b43022d3760 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 14:37:35 +0800 Subject: [PATCH 24/68] add thread setting --- .../tests/book/test_inference_nlp.cc | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 4e92d6a17b..fba64efece 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,6 +19,10 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#ifdef PADDLE_WITH_MKLML +#include +#include +#endif DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); @@ -149,6 +153,14 @@ TEST(inference, nlp) { EnableMKLDNN(inference_program); } +#ifdef PADDLE_WITH_MKLML + // only use 1 core per thread + omp_set_dynamic(0); + omp_set_num_threads(1); + mkl_set_num_threads(1); +#endif + + double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { std::vector> jobs; bcast_datasets(datasets, &jobs, FLAGS_num_threads); @@ -158,9 +170,11 @@ TEST(inference, nlp) { std::ref(inference_program), std::ref(jobs))); } + start_ms = get_current_ms(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads[i]->join(); } + stop_ms = get_current_ms(); } else { if (FLAGS_prepare_vars) { @@ -185,16 +199,18 @@ TEST(inference, nlp) { std::map feed_targets; // for data and run - auto start_ms = get_current_ms(); + start_ms = get_current_ms(); for (size_t i = 0; i < datasets.size(); ++i) { feed_targets[feed_target_names[0]] = &(datasets[i]); executor.RunPreparedContext(ctx.get(), scope, &feed_targets, &fetch_targets, !FLAGS_prepare_vars); } - auto stop_ms = get_current_ms(); - LOG(INFO) << "Total infer time: " << (stop_ms - start_ms) / 1000.0 / 60 - << " min, avg time per seq: " - << (stop_ms - start_ms) / datasets.size() << " ms"; + stop_ms = get_current_ms(); } + + LOG(INFO) << "Total inference time with " << FLAGS_num_threads + << " threads : " << (stop_ms - start_ms) / 1000.0 + << " sec, avg time per seq: " + << (stop_ms - start_ms) / datasets.size() << " ms"; delete scope; } From 15193c9e936270de7f39b1d738bbb3571adcd737 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 1 Jun 2018 14:48:24 +0800 Subject: [PATCH 25/68] Faster RecordIO Scanner --- paddle/fluid/recordio/chunk.cc | 58 ++++++++++++++++++++------------ paddle/fluid/recordio/chunk.h | 16 +++++++-- paddle/fluid/recordio/scanner.cc | 26 +++++++------- paddle/fluid/recordio/scanner.h | 6 +--- 4 files changed, 64 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc index 82d9aa601c..6c65d9160c 100644 --- a/paddle/fluid/recordio/chunk.cc +++ b/paddle/fluid/recordio/chunk.cc @@ -119,40 +119,56 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const { } bool Chunk::Parse(std::istream& sin) { - Header hdr; - bool ok = hdr.Parse(sin); + ChunkParser parser(sin); + if (!parser.Init()) { + return false; + } + Clear(); + while (parser.HasNext()) { + Add(parser.Next()); + } + return true; +} + +ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {} +bool ChunkParser::Init() { + pos_ = 0; + bool ok = header_.Parse(in_); if (!ok) { return ok; } - auto beg_pos = sin.tellg(); - uint32_t crc = Crc32Stream(sin, hdr.CompressSize()); - PADDLE_ENFORCE_EQ(hdr.Checksum(), crc); - Clear(); - sin.seekg(beg_pos, sin.beg); - std::unique_ptr compressed_stream; - switch (hdr.CompressType()) { + auto beg_pos = in_.tellg(); + uint32_t crc = Crc32Stream(in_, header_.CompressSize()); + PADDLE_ENFORCE_EQ(header_.Checksum(), crc); + in_.seekg(beg_pos, in_.beg); + + switch (header_.CompressType()) { case Compressor::kNoCompress: break; case Compressor::kSnappy: - compressed_stream.reset(new snappy::iSnappyStream(sin)); + compressed_stream_.reset(new snappy::iSnappyStream(in_)); break; default: PADDLE_THROW("Not implemented"); } + return true; +} - std::istream& stream = compressed_stream ? 
*compressed_stream : sin; +bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); } - for (uint32_t i = 0; i < hdr.NumRecords(); ++i) { - uint32_t rec_len; - stream.read(reinterpret_cast(&rec_len), sizeof(uint32_t)); - std::string buf; - buf.resize(rec_len); - stream.read(&buf[0], rec_len); - PADDLE_ENFORCE_EQ(rec_len, stream.gcount()); - Add(buf); +std::string ChunkParser::Next() { + if (!HasNext()) { + return ""; } - return true; + ++pos_; + std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_; + uint32_t rec_len; + stream.read(reinterpret_cast(&rec_len), sizeof(uint32_t)); + std::string buf; + buf.resize(rec_len); + stream.read(&buf[0], rec_len); + PADDLE_ENFORCE_EQ(rec_len, stream.gcount()); + return buf; } - } // namespace recordio } // namespace paddle diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h index 71a1556a33..cfb954a591 100644 --- a/paddle/fluid/recordio/chunk.h +++ b/paddle/fluid/recordio/chunk.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include @@ -53,9 +54,20 @@ class Chunk { DISABLE_COPY_AND_ASSIGN(Chunk); }; -size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out); +class ChunkParser { + public: + explicit ChunkParser(std::istream& sin); + + bool Init(); + std::string Next(); + bool HasNext() const; -void DeflateData(const char* in, size_t in_length, Compressor ct, char* out); + private: + Header header_; + uint32_t pos_{0}; + std::istream& in_; + std::unique_ptr compressed_stream_; +}; } // namespace recordio } // namespace paddle diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc index 88b4d4001b..06a13e6c5b 100644 --- a/paddle/fluid/recordio/scanner.cc +++ b/paddle/fluid/recordio/scanner.cc @@ -22,35 +22,33 @@ namespace paddle { namespace recordio { Scanner::Scanner(std::unique_ptr &&stream) - : stream_(std::move(stream)) { + : stream_(std::move(stream)), parser_(*stream_) { Reset(); } -Scanner::Scanner(const std::string &filename) { - stream_.reset(new std::ifstream(filename)); +Scanner::Scanner(const std::string &filename) + : stream_(new std::ifstream(filename)), parser_(*stream_) { Reset(); } void Scanner::Reset() { stream_->clear(); stream_->seekg(0, std::ios::beg); - ParseNextChunk(); + parser_.Init(); } std::string Scanner::Next() { - PADDLE_ENFORCE(!eof_, "StopIteration"); - auto rec = cur_chunk_.Record(offset_++); - if (offset_ == cur_chunk_.NumRecords()) { - ParseNextChunk(); + if (stream_->eof()) { + return ""; } - return rec; -} -void Scanner::ParseNextChunk() { - eof_ = !cur_chunk_.Parse(*stream_); - offset_ = 0; + auto res = parser_.Next(); + if (!parser_.HasNext() && HasNext()) { + parser_.Init(); + } + return res; } -bool Scanner::HasNext() const { return !eof_; } +bool Scanner::HasNext() const { return !stream_->eof(); } } // namespace recordio } // namespace paddle diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/recordio/scanner.h index 34f1b0c78d..0d885dd87a 100644 --- a/paddle/fluid/recordio/scanner.h +++ b/paddle/fluid/recordio/scanner.h @@ -37,11 +37,7 @@ class Scanner { private: std::unique_ptr stream_; - Chunk cur_chunk_; - size_t offset_; - bool eof_; - - void ParseNextChunk(); + ChunkParser parser_; }; } // namespace recordio } // namespace paddle From eacac49bcdd281f68d9fac2cba9dee2b245d0d17 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 1 Jun 2018 15:15:26 +0800 Subject: [PATCH 26/68] 1. 
update test_split_var: replace split with slice --- .../{test_split_var.py => test_slice_var.py} | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) rename python/paddle/fluid/tests/unittests/{test_split_var.py => test_slice_var.py} (85%) diff --git a/python/paddle/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py similarity index 85% rename from python/paddle/fluid/tests/unittests/test_split_var.py rename to python/paddle/fluid/tests/unittests/test_slice_var.py index 157def9b56..82305b23a1 100644 --- a/python/paddle/fluid/tests/unittests/test_split_var.py +++ b/python/paddle/fluid/tests/unittests/test_slice_var.py @@ -14,14 +14,14 @@ import math import unittest -from paddle.fluid.transpiler.distribute_transpiler import split_variable +from paddle.fluid.transpiler.distribute_transpiler import slice_variable import paddle.fluid as fluid import paddle.fluid.core as core import random -class TestSplitVar(unittest.TestCase): - def check_split_output(self, shapes, expected_sizes, min_size): +class TestSliceVar(unittest.TestCase): + def check_slice_output(self, shapes, expected_sizes, min_size): var_list = [] program = fluid.Program() for shape in shapes: @@ -31,7 +31,7 @@ class TestSplitVar(unittest.TestCase): # dtype=core.VarDesc.VarType.LOD_TENSOR, shape=shape) var_list.append(var) - blocks = split_variable(var_list, 10, min_size) + blocks = slice_variable(var_list, 10, min_size) all_sizes = [] for s in expected_sizes: for s2 in s: @@ -49,7 +49,7 @@ class TestSplitVar(unittest.TestCase): [1150, 1150, 1150, 1150, 1150, 1150, 1100] ] - self.check_split_output(shapes, expected_sizes, 1024) + self.check_slice_output(shapes, expected_sizes, 1024) def test_check_output_8k(self): shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10], @@ -57,7 +57,7 @@ class TestSplitVar(unittest.TestCase): expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000], [35937, 35937, 35937, 35937, 35937, 35937]] - self.check_split_output(shapes, expected_sizes, 8192) + self.check_slice_output(shapes, expected_sizes, 8192) if __name__ == '__main__': From 18d640255efb6807a360c29d6e1c672aa679818a Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 1 Jun 2018 15:38:45 +0800 Subject: [PATCH 27/68] simplify inference api (#11104) --- .../contrib/inference/paddle_inference_api.h | 40 +++++++++++-------- .../inference/paddle_inference_api_impl.cc | 22 +++++----- .../test_paddle_inference_api_impl.cc | 1 - 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index b4c7f9bef4..5fe8399762 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -40,14 +40,23 @@ struct PaddleBuf { struct PaddleTensor { std::string name; // variable name. std::vector shape; + // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; }; +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + // TODO(Superjomn) support following engines latter. + // kAnakin, // Use Anakin for inference. + // kTensorRT, // Use TensorRT for inference. + // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. + // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. +}; + /* * A simple Inference API for Paddle. Currently this API can be used by * non-sequence scenerios. - * TODO(Superjomn) Support another API for NLP-related usages. 
*/
 class PaddlePredictor {
  public:
@@ -69,15 +78,6 @@ class PaddlePredictor {
   // Destroy the Predictor.
   virtual ~PaddlePredictor() {}
 
-  enum class EngineKind {
-    kNative = -1,  // Use the native Fluid facility.
-    // TODO(Superjomn) support latter.
-    // kAnakin,           // Use Anakin for inference.
-    // kTensorRT,         // Use TensorRT for inference.
-    // kAutoMixedAnakin,  // Automatically mix Fluid with Anakin.
-    // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  };
-
   // The common configs for all the predictors.
   struct Config {
     std::string model_dir;  // path to the model directory.
@@ -86,18 +86,24 @@ class PaddlePredictor {
 };
 
 struct NativeConfig : public PaddlePredictor::Config {
+  // GPU related fields.
   bool use_gpu{false};
-  int device;
-  float fraction_of_gpu_memory;
+  int device{0};
+  float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+
   std::string prog_file;
   std::string param_file;
-  bool share_variables;
 };
 
-// A factory to help create difference predictor.
-template <
-    typename ConfigT,
-    PaddlePredictor::EngineKind engine = PaddlePredictor::EngineKind::kNative>
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template 
 std::unique_ptr CreatePaddlePredictor(const ConfigT& config);
 
 }  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
index 989252f69e..99a64662d4 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -57,8 +57,7 @@ std::string num2str(T a) {
 
 bool NativePaddlePredictor::Init() {
   VLOG(3) << "Predictor::init()";
 
-  // TODO(panyx0718): Should CPU vs GPU device be decided by id?
-  if (config_.device >= 0) {
+  if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
   } else {
     place_ = paddle::platform::CPUPlace();
@@ -85,11 +84,13 @@ bool NativePaddlePredictor::Init() {
   }
   ctx_ = executor_->Prepare(*inference_program_, 0);
 
-  // Create variables
-  // TODO(panyx0718): Why need to test share_variables here?
-  if (config_.share_variables) {
-    executor_->CreateVariables(*inference_program_, scope_.get(), 0);
-  }
+  // Create temporary variables first, so that the first batch does not need
+  // to create variables at runtime. This is the logic of the old inference
+  // API.
+  // TODO(Superjomn) this should be modified when `Clone` is valid for
+  // multi-thread application.
+  executor_->CreateVariables(*inference_program_, scope_.get(), 0);
+
   // Get the feed_target_names and fetch_target_names
   feed_target_names_ = inference_program_->GetFeedTargetNames();
   fetch_target_names_ = inference_program_->GetFetchTargetNames();
@@ -124,7 +125,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs,
                  scope_.get(),
                  &feed_targets,
                  &fetch_targets,
-                 !config_.share_variables);
+                 false /* don't create variable each time */);
   if (!GetFetch(fetchs, output_data)) {
     LOG(ERROR) << "fail to get fetchs";
     return false;
@@ -242,11 +243,14 @@ bool NativePaddlePredictor::GetFetch(
 
 template <>
 std::unique_ptr
-CreatePaddlePredictor(
+CreatePaddlePredictor(
     const NativeConfig &config) {
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
     // 1.
GPU memeroy + PADDLE_ENFORCE( + config.fraction_of_gpu_memory > 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); std::vector flags; if (config.fraction_of_gpu_memory >= 0.0f || config.fraction_of_gpu_memory <= 0.95f) { diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 5240fc2f20..07b17acd48 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -47,7 +47,6 @@ NativeConfig GetConfig() { config.fraction_of_gpu_memory = 0.15; config.use_gpu = true; config.device = 0; - config.share_variables = true; return config; } From 0c0c5df4cbed8a9c947fd2819640e9d402555ed1 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 1 Jun 2018 15:39:30 +0800 Subject: [PATCH 28/68] feature/add TRT fc converter (#11043) --- .../inference/tensorrt/convert/CMakeLists.txt | 2 + .../inference/tensorrt/convert/conv2d_op.cc | 3 +- .../fluid/inference/tensorrt/convert/fc_op.cc | 119 ++++++++++++++++++ .../inference/tensorrt/convert/mul_op.cc | 5 +- .../inference/tensorrt/convert/op_converter.h | 41 ++++-- .../inference/tensorrt/convert/test_fc_op.cc | 46 +++++++ .../inference/tensorrt/convert/test_mul_op.cc | 4 +- .../tensorrt/convert/test_op_converter.cc | 7 +- .../inference/tensorrt/convert/ut_helper.h | 40 +++--- paddle/fluid/inference/tensorrt/engine.cc | 1 + paddle/fluid/inference/tensorrt/engine.h | 4 +- paddle/fluid/operators/tensorrt_engine_op.cc | 3 +- 12 files changed, 240 insertions(+), 35 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/fc_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_fc_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 5ada1d6312..23ca8bfac8 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 209936c3ba..668d344f1b 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -21,7 +21,8 @@ namespace tensorrt { class Conv2dOpConverter : public OpConverter { public: Conv2dOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; } diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc new file mode 100644 index 0000000000..bd05608d76 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Reorder the elements from istrides to ostrides, borrowed from TRT convert in +// tensorflow. +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318 +template +void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, + T* odata, nvinfer1::DimsHW ostrides) { + for (int h = 0; h < shape.h(); ++h) { + for (int w = 0; w < shape.w(); ++w) { + odata[h * ostrides.h() + w * ostrides.w()] = + idata[h * ostrides.h() + w * ostrides.w()]; + } + } +} + +// Reorder the data layout from CK to KC. +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, + TensorRTEngine::Weight* oweights) { + int c = iweights.dims[0]; + int k = iweights.dims[1]; + oweights->dims.assign({k, c}); + nvinfer1::DimsHW istrides = {1, k}; + nvinfer1::DimsHW ostrides = {c, 1}; + Reorder2({k, c}, static_cast(iweights.get().values), istrides, + static_cast(const_cast(oweights->get().values)), + ostrides); +} + +/* + * FC converter convert a MUL op in Fluid to a FC layer in TRT. + */ +class FcOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope) override { + VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; + + framework::OpDesc op_desc(op, nullptr, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + // Declare inputs + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + // This may trigger a GPU->CPU copy, because TRT's weight can only be + // assigned from CPU memory, that can't be avoided. + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix + size_t n_output = Y_t->dims()[1]; + + framework::LoDTensor tmp; + tmp.Resize(Y_t->dims()); + memcpy(tmp.mutable_data(platform::CPUPlace()), Y_t->data(), + Y_t->dims()[0] * Y_t->dims()[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, + static_cast(tmp.data()), + Y_t->memory_size() / sizeof(float)); + weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); + tmp_weight.dims = weight.dims; + + // The data layout of TRT FC layer's weight is different from fluid's FC, + // need to reorder the elements. 
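+  // Fluid keeps the mul weight as a C x K (input_dim x output_dim) matrix,
+  // while TRT's FullyConnected layer expects the transposed K x C layout,
+  // so the buffer is rewritten by ReorderCKtoKC below before being wrapped
+  // into a TRT weight.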
+ ReorderCKtoKC(tmp_weight, &weight);
+
+  // Currently, the framework can only handle one fluid op -> one TRT layer,
+  // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+  // handle `mul`, leave `add` as another layer.
+  // DEBUG
+  TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+  auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                     *const_cast(X),
+                                     n_output, weight.get(), bias.get());
+
+    auto output_name = op_desc.Output("Out").front();
+    engine_->DeclareOutput(layer, 0, output_name);
+  }
+};
+
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index aa8e66490f..6bb07709c7 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -24,8 +24,9 @@ namespace tensorrt {
 class MulOpConverter : public OpConverter {
  public:
   MulOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias";
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope) override {
+    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1cd3ed9a00..4d21e241c0 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -31,27 +31,42 @@ namespace tensorrt {
 class OpConverter {
  public:
   OpConverter() {}
-  virtual void operator()(const framework::proto::OpDesc& op) {}
 
-  void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
-    std::string type = op.type();
-    auto* it = Registry::Lookup(type);
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
-    it->SetEngine(engine);
-    (*it)(op);
-  }
+  // Converter logic for an op.
+  virtual void operator()(const framework::proto::OpDesc& op,
+                          const framework::Scope& scope) {}
+
+  // Convert a single fluid operator and add the corresponding layer to TRT.
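+  // A `mul` op whose Y input is a persistable parameter is dispatched to the
+  // `fc` converter; any other op is dispatched to the converter registered
+  // under its own type.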
+ void ConvertOp(const framework::proto::OpDesc& op, + const std::unordered_set& parameters, + const framework::Scope& scope, TensorRTEngine* engine) { + framework::OpDesc op_desc(op, nullptr, nullptr); + + OpConverter* it{nullptr}; - // convert fluid op to tensorrt layer - void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) { - OpConverter::Run(op, engine); + if (op_desc.Type() == "mul") { + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + it = Registry::Lookup("fc"); + } + } + if (!it) { + it = Registry::Lookup(op_desc.Type()); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + it->SetEngine(engine); + (*it)(op, scope); } // convert fluid block to tensorrt network void ConvertBlock(const framework::proto::BlockDesc& block, - TensorRTEngine* engine) { + const std::unordered_set& parameters, + const framework::Scope& scope, TensorRTEngine* engine) { for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); - OpConverter::Run(op, engine); + ConvertOp(op, parameters, scope, engine); } } diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc new file mode 100644 index 0000000000..a30253072a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include 
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+  std::unordered_set parameters({"mul-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(20, parameters, scope, 1000);
+
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index d8b61d5f08..1ce1130e5d 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -21,7 +21,9 @@ namespace inference {
 namespace tensorrt {
 
 TEST(MulOpConverter, main) {
-  TRTConvertValidation validator(10, 1000);
+  framework::Scope scope;
+  std::unordered_set parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
   validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
   validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9ae7de9cbf..1d3f5eabb2 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
 #include 
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
 namespace inference {
@@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) {
   conv2d_op->SetType("conv2d");
 
   OpConverter converter;
-  converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
+  framework::Scope scope;
+  converter.ConvertBlock(*block->Proto(), {}, scope,
+                         nullptr /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 684bbc208f..d7e05dd5b5 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -61,7 +61,10 @@ class TRTConvertValidation {
  public:
   TRTConvertValidation() = delete;
 
-  explicit TRTConvertValidation(int batch_size, int workspace_size = 1024) {
+  TRTConvertValidation(int batch_size,
+                       const std::unordered_set& parameters,
+                       framework::Scope& scope, int workspace_size = 1 << 10)
+      : parameters_(parameters), scope_(scope) {
     // create engine.
     engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
     engine_->InitNetwork();
@@ -76,19 +79,22 @@ class TRTConvertValidation {
     engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
   }
 
+  // Declare a parameter variable in the scope.
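+  // Names declared here should also appear in the `parameters` set passed to
+  // the validator, so that converters read them from the scope as weights
+  // instead of treating them as TRT input tensors.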
+ void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { + DeclVar(name, dims); + } + void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims); } + // Declare a variable in a fluid Scope. void DeclVar(const std::string& name, const nvinfer1::Dims& dims) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); // Init Fluid tensor. - std::vector dim_vec(dims.nbDims); - for (int i = 0; i < dims.nbDims; i++) { - dim_vec[i] = dims.d[i]; - } + std::vector dim_vec(dims.d, dims.d + dims.nbDims); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); @@ -99,7 +105,7 @@ class TRTConvertValidation { op_ = framework::OpRegistry::CreateOp(desc); OpConverter op_converter; - op_converter.ConvertOp(desc, engine_.get()); + op_converter.ConvertOp(desc, parameters_, scope_, engine_.get()); engine_->FreezeNetwork(); @@ -108,11 +114,13 @@ class TRTConvertValidation { // Set Inputs. for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; auto* var = scope_.FindVar(input); PADDLE_ENFORCE(var); auto tensor = var->GetMutable(); + engine_->SetInputFromCPU( - input, static_cast(tensor->data()), + input, static_cast(tensor->data()), sizeof(float) * analysis::AccuDims(tensor->dims(), tensor->dims().size())); } @@ -120,18 +128,21 @@ class TRTConvertValidation { void Execute(int batch_size) { // Execute Fluid Op - // Execute TRT platform::CPUPlace place; platform::CPUDeviceContext ctx(place); - engine_->Execute(batch_size); - op_->Run(scope_, place); + // Execute TRT. + engine_->Execute(batch_size); + cudaStreamSynchronize(*engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); + const size_t output_space_size = 200; for (const auto& output : op_desc_->OutputArgumentNames()) { std::vector fluid_out; - std::vector trt_out(200); - engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float)); + std::vector trt_out(output_space_size); + engine_->GetOutputInCPU(output, &trt_out[0], + output_space_size * sizeof(float)); + cudaStreamSynchronize(*engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); @@ -139,7 +150,7 @@ class TRTConvertValidation { // Compare two output ASSERT_FALSE(fluid_out.empty()); for (size_t i = 0; i < fluid_out.size(); i++) { - EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001); + EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6); } } } @@ -149,9 +160,10 @@ class TRTConvertValidation { private: std::unique_ptr engine_; cudaStream_t stream_; - framework::Scope scope_; std::unique_ptr op_; std::unique_ptr op_desc_; + const std::unordered_set& parameters_; + framework::Scope& scope_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index a88236ae98..3d75fefc1a 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, name); auto* output = layer->getOutput(offset); + SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); infer_network_->markOutput(*output); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index d9d3163b66..fabcfd9e80 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -37,13 +37,15 @@ class TensorRTEngine : public 
EngineBase {
   // Weight is model parameter.
   class Weight {
    public:
-    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
       w_.type = dtype;
       w_.values = value;
       w_.count = num_elem;
     }
     const nvinfer1::Weights& get() { return w_; }
 
+    std::vector dims;
+
    private:
     nvinfer1::Weights w_;
   };
 
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 83e768b4dc..855157e7c4 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -31,8 +31,9 @@ void paddle::operators::TensorRTEngineKernel::Prepare(
   auto max_workspace = context.Attr("max_workspace");
   engine_.reset(new inference::tensorrt::TensorRTEngine(
       max_batch_, max_workspace, nullptr));
+  // TODO(Superjomn) parameters should be passed after analyzed from outside.
   inference::Singleton::Global().ConvertBlock(
-      block, engine_.get());
+      block, {}, context.scope(), engine_.get());
   engine_->FreezeNetwork();
 }
 

From 4a24c238c15212dd921bd0199beca6fc145cd62a Mon Sep 17 00:00:00 2001
From: tensor-tang 
Date: Fri, 1 Jun 2018 15:43:34 +0800
Subject: [PATCH 29/68] refine code

---
 paddle/fluid/inference/io.cc                  |  2 +-
 .../tests/book/test_inference_nlp.cc          | 86 +++++++++----------
 paddle/fluid/inference/tests/test_helper.h    |  3 -
 3 files changed, 42 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 98780b6881..65db7c7b50 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -117,7 +117,7 @@ std::unique_ptr Load(framework::Executor* executor,
   std::string program_desc_str;
   VLOG(3) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, &program_desc_str);
-  // LOG(INFO) << program_desc_str;
+
   std::unique_ptr main_program(
       new framework::ProgramDesc(program_desc_str));
 
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index fba64efece..962358d761 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -24,23 +24,22 @@ limitations under the License.
*/ #include #endif -DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(modelpath, "", "Directory of the inference model."); +DEFINE_string(datafile, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); -DEFINE_bool(prepare_context, true, "Prepare Context before executor"); - DEFINE_int32(num_threads, 1, "Number of threads should be used"); -inline double get_current_ms() { +inline double GetCurrentMs() { struct timeval time; gettimeofday(&time, NULL); return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } // return size of total words -size_t read_datasets(std::vector* out, - const std::string& filename) { +size_t LoadData(std::vector* out, + const std::string& filename) { size_t sz = 0; std::fstream fin(filename); std::string line; @@ -68,6 +67,23 @@ size_t read_datasets(std::vector* out, return sz; } +void SplitData( + const std::vector& datasets, + std::vector>* jobs, + const int num_threads) { + size_t s = 0; + jobs->resize(num_threads); + while (s < datasets.size()) { + for (auto it = jobs->begin(); it != jobs->end(); it++) { + it->emplace_back(&datasets[s]); + s++; + if (s >= datasets.size()) { + break; + } + } + } +} + void ThreadRunInfer( const int tid, paddle::framework::Executor* executor, paddle::framework::Scope* scope, @@ -80,7 +96,6 @@ void ThreadRunInfer( copy_program->SetFeedHolderName(feed_holder_name); copy_program->SetFetchHolderName(fetch_holder_name); - // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = copy_program->GetFeedTargetNames(); const std::vector& fetch_target_names = @@ -95,51 +110,32 @@ void ThreadRunInfer( PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); auto& inputs = jobs[tid]; - auto start_ms = get_current_ms(); + auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; executor->Run(*copy_program, scope, &feed_targets, &fetch_targets, true, true, feed_holder_name, fetch_holder_name); } - auto stop_ms = get_current_ms(); + auto stop_ms = GetCurrentMs(); LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() << " samples, avg time per sample: " - << (stop_ms - start_ms) / inputs.size() << " ms"; } -void bcast_datasets( - const std::vector& datasets, - std::vector>* jobs, - const int num_threads) { - size_t s = 0; - jobs->resize(num_threads); - while (s < datasets.size()) { - for (auto it = jobs->begin(); it != jobs->end(); it++) { - it->emplace_back(&datasets[s]); - s++; - if (s >= datasets.size()) { - break; - } - } - } -} - TEST(inference, nlp) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_modelpath.empty() || FLAGS_datafile.empty()) { + LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model " + << "--datafile=path/to/your/data"; } - LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; + LOG(INFO) << "Model Path: " << FLAGS_modelpath; + LOG(INFO) << "Data File: " << FLAGS_datafile; std::vector datasets; - size_t num_total_words = - read_datasets(&datasets, "/home/tangjian/paddle-tj/out.ids.txt"); - LOG(INFO) << "Number of dataset samples(seq len<1024): " << datasets.size(); + size_t num_total_words = LoadData(&datasets, FLAGS_datafile); + LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size(); 
LOG(INFO) << "Total number of words: " << num_total_words; const bool model_combined = false; - // 0. Call `paddle::framework::InitDevices()` initialize all the devices // 1. Define place, executor, scope auto place = paddle::platform::CPUPlace(); @@ -148,13 +144,14 @@ TEST(inference, nlp) { // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; - inference_program = InitProgram(&executor, scope, dirname, model_combined); + inference_program = + InitProgram(&executor, scope, FLAGS_modelpath, model_combined); if (FLAGS_use_mkldnn) { EnableMKLDNN(inference_program); } #ifdef PADDLE_WITH_MKLML - // only use 1 core per thread + // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); mkl_set_num_threads(1); @@ -163,24 +160,23 @@ TEST(inference, nlp) { double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { std::vector> jobs; - bcast_datasets(datasets, &jobs, FLAGS_num_threads); + SplitData(datasets, &jobs, FLAGS_num_threads); std::vector> threads; for (int i = 0; i < FLAGS_num_threads; ++i) { threads.emplace_back(new std::thread(ThreadRunInfer, i, &executor, scope, std::ref(inference_program), std::ref(jobs))); } - start_ms = get_current_ms(); + start_ms = GetCurrentMs(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads[i]->join(); } - stop_ms = get_current_ms(); - + stop_ms = GetCurrentMs(); } else { if (FLAGS_prepare_vars) { executor.CreateVariables(*inference_program, scope, 0); } - // always prepare context and burning first time + // always prepare context std::unique_ptr ctx; ctx = executor.Prepare(*inference_program, 0); @@ -198,14 +194,14 @@ TEST(inference, nlp) { PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); std::map feed_targets; - // for data and run - start_ms = get_current_ms(); + // feed data and run + start_ms = GetCurrentMs(); for (size_t i = 0; i < datasets.size(); ++i) { feed_targets[feed_target_names[0]] = &(datasets[i]); executor.RunPreparedContext(ctx.get(), scope, &feed_targets, &fetch_targets, !FLAGS_prepare_vars); } - stop_ms = get_current_ms(); + stop_ms = GetCurrentMs(); } LOG(INFO) << "Total inference time with " << FLAGS_num_threads diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 1f5551567c..01b8dc0be6 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -182,9 +182,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - // std::string binary_str; - // inference_program->Proto()->SerializeToString(&binary_str); - // LOG(INFO) << binary_str; if (use_mkldnn) { EnableMKLDNN(inference_program); } From 3206bcd9291833518289e73e37513cdbc29e96c7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 16:24:55 +0800 Subject: [PATCH 30/68] refine log and add QPS --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 962358d761..378e1620a0 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -202,11 +202,13 @@ TEST(inference, nlp) { &fetch_targets, !FLAGS_prepare_vars); } stop_ms = GetCurrentMs(); + LOG(INFO) << "Tid: 0, process " << datasets.size() + << 
" samples, avg time per sample: " + << (stop_ms - start_ms) / datasets.size() << " ms"; } LOG(INFO) << "Total inference time with " << FLAGS_num_threads << " threads : " << (stop_ms - start_ms) / 1000.0 - << " sec, avg time per seq: " - << (stop_ms - start_ms) / datasets.size() << " ms"; + << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000); delete scope; } From 9503dbb173f76f7b68d4a6edc18ce31cf7865c30 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 1 Jun 2018 17:14:18 +0800 Subject: [PATCH 31/68] fix compile error (#11119) --- paddle/fluid/inference/tensorrt/convert/fc_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/op_converter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index bd05608d76..45b0795597 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -59,7 +59,7 @@ class FcOpConverter : public OpConverter { const framework::Scope& scope) override { VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr, nullptr); + framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 4d21e241c0..3beafeefd0 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -40,7 +40,7 @@ class OpConverter { void ConvertOp(const framework::proto::OpDesc& op, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { - framework::OpDesc op_desc(op, nullptr, nullptr); + framework::OpDesc op_desc(op, nullptr); OpConverter* it{nullptr}; From 7e9f0790e0366ef8db3f48f83635400d4742ad71 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 17:24:54 +0800 Subject: [PATCH 32/68] fix scope in thread --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 378e1620a0..f7788ccbf4 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -91,6 +91,8 @@ void ThreadRunInfer( const std::vector>& jobs) { auto copy_program = std::unique_ptr( new paddle::framework::ProgramDesc(*inference_program)); + auto& sub_scope = scope->NewScope(); + std::string feed_holder_name = "feed_" + paddle::string::to_string(tid); std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid); copy_program->SetFeedHolderName(feed_holder_name); @@ -113,10 +115,11 @@ void ThreadRunInfer( auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; - executor->Run(*copy_program, scope, &feed_targets, &fetch_targets, true, - true, feed_holder_name, fetch_holder_name); + executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets, + true, true, feed_holder_name, fetch_holder_name); } auto stop_ms = GetCurrentMs(); + scope->DeleteScope(&sub_scope); LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() << " samples, avg time per 
sample: " << (stop_ms - start_ms) / inputs.size() << " ms"; From 663f4e6168ef9852991dc1ccfea462307d19a5d0 Mon Sep 17 00:00:00 2001 From: baiyf Date: Fri, 1 Jun 2018 19:30:59 +0800 Subject: [PATCH 33/68] Fix bilinear_op Python API (#11117) * fix conflict * code clean --- doc/fluid/api/layers.rst | 4 ++-- python/paddle/fluid/layers/nn.py | 21 +++++++++---------- .../fluid/tests/unittests/test_layers.py | 6 +++--- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index dbb99d3c03..5329adaa18 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1003,10 +1003,10 @@ dice_loss .. autofunction:: paddle.fluid.layers.dice_loss :noindex: -upsampling_bilinear2d +resize_bilinear ____ -.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d +.. autofunction:: paddle.fluid.layers.resize_bilinear :noindex: gather diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 56f5c6b4be..bd6ed0f30e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -81,7 +81,7 @@ __all__ = [ 'label_smooth', 'roi_pool', 'dice_loss', - 'upsampling_bilinear2d', + 'resize_bilinear', 'gather', 'random_crop', ] @@ -3929,9 +3929,9 @@ def dice_loss(input, label, epsilon=0.00001): return reduce_mean(dice_score) -def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): +def resize_bilinear(input, out_shape=None, scale=None, name=None): """ - The mathematical meaning of upsampling_bilinear2d is also called + The mathematical meaning of resize bilinear layer is Bilinear interpolation. Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and @@ -3941,13 +3941,13 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): https://en.wikipedia.org/wiki/Bilinear_interpolation Args: - input (Variable): The input tensor of bilinear interpolation, + input (Variable): The input tensor of resize bilinear layer, This is a 4-D tensor of the shape (num_batches, channels, in_h, in_w). - out_shape(list|tuple|Variable|None): Output shape of bilinear interpolation + out_shape(list|tuple|Variable|None): Output shape of resize bilinear layer, the shape is (out_h, out_w). Default: None - scale(int|None): The multiplier for the input height or width. + scale(float|None): The multiplier for the input height or width. At least one of out_shape or scale must be set. And out_shape has a higher priority than scale. Default: None @@ -3961,7 +3961,7 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): Examples: .. 
code-block:: python - out = fluid.layers.bilinear_interp(input, out_shape=[12, 12]) + out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ if out_shape is None and scale is None: raise ValueError("One of out_shape and scale must not be None") @@ -3975,10 +3975,9 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): out_w = 0 inputs = {"X": input} if out_shape is not None: - if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2) and ( - out_shape is not Variable): - raise ValueError('out_shape should be a list or tuple ', - 'with length 2, (out_h, out_w).') + if not (_is_list_or_turple_(out_shape) and + len(out_shape) == 2) and not isinstance(out_shape, Variable): + raise ValueError('out_shape should be a list or tuple or variable') if _is_list_or_turple_(out_shape): out_shape = list(map(int, out_shape)) out_h = out_shape[0] diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 60dc1f83fc..ca08fd7fc8 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -369,13 +369,13 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) - def test_upsampling_bilinear2d(self): + def test_resize_bilinear(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 9, 6], dtype="float32") - output = layers.upsampling_bilinear2d(x, out_shape=[12, 12]) + output = layers.resize_bilinear(x, out_shape=[12, 12]) self.assertIsNotNone(output) - output = layers.upsampling_bilinear2d(x, scale=3) + output = layers.resize_bilinear(x, scale=3) self.assertIsNotNone(output) print(str(program)) From eaeb76c419fbad9b7d3dd083666f80d84f89f55f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 19:35:49 +0800 Subject: [PATCH 34/68] add some comments --- .../tests/book/test_inference_nlp.cc | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index f7788ccbf4..c4d7b0bbf0 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -37,7 +37,8 @@ inline double GetCurrentMs() { return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; } -// return size of total words +// Load the input word index data from file and save into LodTensor. +// Return the size of words. size_t LoadData(std::vector* out, const std::string& filename) { size_t sz = 0; @@ -67,6 +68,8 @@ size_t LoadData(std::vector* out, return sz; } +// Split input data samples into small pieces jobs as balanced as possible, +// according to the number of threads. void SplitData( const std::vector& datasets, std::vector>* jobs, @@ -116,7 +119,8 @@ void ThreadRunInfer( for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets, - true, true, feed_holder_name, fetch_holder_name); + true /*create_local_scope*/, true /*create_vars*/, + feed_holder_name, fetch_holder_name); } auto stop_ms = GetCurrentMs(); scope->DeleteScope(&sub_scope); @@ -143,12 +147,13 @@ TEST(inference, nlp) { // 1. 
Define place, executor, scope
   auto place = paddle::platform::CPUPlace();
   auto executor = paddle::framework::Executor(place);
-  auto* scope = new paddle::framework::Scope();
+  std::unique_ptr scope(
+      new paddle::framework::Scope());
 
   // 2. Initialize the inference_program and load parameters
   std::unique_ptr inference_program;
   inference_program =
-      InitProgram(&executor, scope, FLAGS_modelpath, model_combined);
+      InitProgram(&executor, scope.get(), FLAGS_modelpath, model_combined);
   if (FLAGS_use_mkldnn) {
     EnableMKLDNN(inference_program);
   }
@@ -166,9 +171,9 @@ TEST(inference, nlp) {
     SplitData(datasets, &jobs, FLAGS_num_threads);
     std::vector> threads;
     for (int i = 0; i < FLAGS_num_threads; ++i) {
-      threads.emplace_back(new std::thread(ThreadRunInfer, i, &executor, scope,
-                                           std::ref(inference_program),
-                                           std::ref(jobs)));
+      threads.emplace_back(
+          new std::thread(ThreadRunInfer, i, &executor, scope.get(),
+                          std::ref(inference_program), std::ref(jobs)));
     }
     start_ms = GetCurrentMs();
     for (int i = 0; i < FLAGS_num_threads; ++i) {
@@ -177,7 +182,7 @@ TEST(inference, nlp) {
     stop_ms = GetCurrentMs();
   } else {
     if (FLAGS_prepare_vars) {
-      executor.CreateVariables(*inference_program, scope, 0);
+      executor.CreateVariables(*inference_program, scope.get(), 0);
     }
     // always prepare context
     std::unique_ptr ctx;
@@ -201,7 +206,7 @@ TEST(inference, nlp) {
     start_ms = GetCurrentMs();
     for (size_t i = 0; i < datasets.size(); ++i) {
       feed_targets[feed_target_names[0]] = &(datasets[i]);
-      executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
+      executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
                                   &fetch_targets, !FLAGS_prepare_vars);
     }
     stop_ms = GetCurrentMs();
@@ -209,9 +214,7 @@ TEST(inference, nlp) {
                 << " samples, avg time per sample: "
                 << (stop_ms - start_ms) / datasets.size() << " ms";
   }
   LOG(INFO) << "Total inference time with " << FLAGS_num_threads
             << " threads : " << (stop_ms - start_ms) / 1000.0
             << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
-  delete scope;
 }

From 8939f17d304dd407efef3466a7bc96c6bb8747fe Mon Sep 17 00:00:00 2001
From: Yancey1989 
Date: Fri, 1 Jun 2018 19:48:57 +0800
Subject: [PATCH 35/68] speedup test_listen_and_serv_op

---
 .../fluid/tests/unittests/CMakeLists.txt      |  6 +-
 .../unittests/test_listen_and_serv_op.py      | 59 ++++++++-----------
 2 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index fead95ffda..c33539f6b5 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -48,5 +48,7 @@ foreach(TEST_OP ${TEST_OPS})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
-# tests that need to be done in fixed timeout
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+# FIXME(Yancey1989): this test would cost much more time on CUDAPlace
+# since it loads cudnn libraries, so we use a longer timeout to keep this
+# unit test stable.
+set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30)
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index cf89f9d0eb..ad479657cc 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -23,7 +23,7 @@ from multiprocessing import Process
 from op_test import OpTest
 
 
-def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
+def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     x = fluid.layers.data(name='x', shape=[1], dtype='float32')
     y_predict = fluid.layers.fc(input=x, size=1, act=None)
     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
@@ -39,15 +39,8 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
 
-    port = os.getenv("PADDLE_INIT_PORT", port)
-    pserver_ips = os.getenv("PADDLE_INIT_PSERVERS", ip)  # ip,ip...
-    eplist = []
-    for ip in pserver_ips.split(","):
-        eplist.append(':'.join([ip, port]))
-    pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-    trainers = int(os.getenv("TRAINERS", trainer_count))
-    current_endpoint = os.getenv("POD_IP", ip) + ":" + port
-    trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", trainer_id))
+    pserver_endpoints = ip + ":" + port
+    current_endpoint = ip + ":" + port
     t = fluid.DistributeTranspiler()
     t.transpile(
         trainer_id,
@@ -62,47 +55,47 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
 
 class TestListenAndServOp(OpTest):
     def setUp(self):
-        self.sleep_time = 5
+        self.ps_timeout = 5
         self.ip = "127.0.0.1"
         self.port = "6173"
-        self.trainer_count = 1
+        self.trainers = 1
         self.trainer_id = 1
 
-    def _raise_signal(self, parent_pid, raised_signal):
-        time.sleep(self.sleep_time)
-        ps_command = subprocess.Popen(
-            "ps -o pid --ppid %d --noheaders" % parent_pid,
-            shell=True,
-            stdout=subprocess.PIPE)
-        ps_output = ps_command.stdout.read()
-        retcode = ps_command.wait()
-        assert retcode == 0, "ps command returned %d" % retcode
-
-        for pid_str in ps_output.split("\n")[:-1]:
-            try:
-                os.kill(int(pid_str), raised_signal)
-            except Exception:
-                continue
-
     def _start_pserver(self, use_cuda, sync_mode):
         p = Process(
             target=run_pserver,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainer_count,
+            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                   self.trainer_id))
         p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        retry_times = self.ps_timeout
+        while True:
+            time.sleep(1)
+            assert retry_times >= 0, "wait ps ready failed"
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # in the /tmp directory until it is ready to process all the RPC calls.
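+                # e.g. /tmp/paddle.<pid>.port; os.stat raises OSError until
+                # the server creates that file, which drives this retry loop.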
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 def test_handle_signal_in_serv_op(self): # run pserver on CPU in sync mode - self._start_pserver(False, True) + pid = self._start_pserver(False, True) + self._wait_ps_ready(pid) # raise SIGINT to pserver - self._raise_signal(os.getpid(), signal.SIGINT) + os.kill(pid, signal.SIGINT) # run pserver on CPU in async mode - self._start_pserver(False, False) + pid = self._start_pserver(False, False) + self._wait_ps_ready(pid) # raise SIGTERM to pserver - self._raise_signal(os.getpid(), signal.SIGTERM) + os.kill(pid, signal.SIGINT) if __name__ == '__main__': From b5b666185720c241aad2421ffb2497dd92d159e5 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 1 Jun 2018 20:36:34 +0800 Subject: [PATCH 36/68] feature/simple inference demo (#11105) --- paddle/contrib/inference/CMakeLists.txt | 43 +++++------ paddle/contrib/inference/demo/CMakeLists.txt | 16 +++++ .../inference/demo/simple_on_word2vec.cc | 71 +++++++++++++++++++ .../inference/paddle_inference_api_impl.cc | 6 +- .../test_paddle_inference_api_impl.cc | 4 +- 5 files changed, 116 insertions(+), 24 deletions(-) create mode 100644 paddle/contrib/inference/demo/CMakeLists.txt create mode 100644 paddle/contrib/inference/demo/simple_on_word2vec.cc diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 6847f7db7f..8ca3446539 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -18,32 +18,35 @@ if(APPLE) endif(APPLE) function(inference_api_test TARGET_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs ARGS) - cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if (WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) - cc_test(test_paddle_inference_${TARGET_NAME} - SRCS test_paddle_inference_${TARGET_NAME}.cc - DEPS paddle_fluid_api paddle_inference_api - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - if(inference_test_ARGS) - set_tests_properties(test_paddle_inference_${TARGET_NAME} - PROPERTIES DEPENDS "${inference_test_ARGS}") - endif() + set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) + cc_test(${TARGET_NAME} + SRCS ${TARGET_NAME}.cc + DEPS paddle_fluid paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + if(inference_test_ARGS) + set_tests_properties(${TARGET_NAME} + PROPERTIES DEPENDS "${inference_test_ARGS}") + endif() + endif(WITH_TESTING) endfunction(inference_api_test) - cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -if(WITH_TESTING) - cc_test(test_paddle_inference_api - SRCS test_paddle_inference_api.cc - DEPS paddle_inference_api) +cc_test(test_paddle_inference_api + SRCS test_paddle_inference_api.cc + DEPS paddle_inference_api) - inference_api_test(api_impl - ARGS test_word2vec test_image_classification) +inference_api_test(test_paddle_inference_api_impl + ARGS test_word2vec test_image_classification) + +if(WITH_TESTING) + add_subdirectory(demo) endif() diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt new file mode 100644 index 0000000000..7b0fa77ad1 --- /dev/null +++ b/paddle/contrib/inference/demo/CMakeLists.txt @@ -0,0 +1,16 
@@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +inference_api_test(simple_on_word2vec ARGS test_word2vec) diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc new file mode 100644 index 0000000000..165d2e196b --- /dev/null +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a simple demo for how to take a model for inference. + */ + +#include +#include +#include +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { +namespace demo { + +DEFINE_string(dirname, "", "Directory of the inference model."); + +void Main(bool use_gpu) { + //# 1. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto predictor = + CreatePaddlePredictor(config); + + for (int batch_id = 0; batch_id < 3; batch_id++) { + //# 2. Prepare input. + int64_t data[4] = {1, 2, 3, 4}; + + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = buf, + .dtype = PaddleDType::INT64}; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + + //# 3. Run + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + //# 4. Get output. + ASSERT_EQ(outputs.size(), 1); + LOG(INFO) << "output buffer size: " << outputs.front().data.length; + const size_t num_elements = outputs.front().data.length / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data)[i]; + } + } +} + +TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } +TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } + +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index 99a64662d4..e7a8fa68b7 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -248,9 +248,11 @@ CreatePaddlePredictor( VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. 
GPU memeroy
-  PADDLE_ENFORCE(
-      config.fraction_of_gpu_memory > 0.f,
+  PADDLE_ENFORCE_GT(
+      config.fraction_of_gpu_memory,
+      0.f,
       "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+  PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
   std::vector flags;
   if (config.fraction_of_gpu_memory >= 0.0f ||
       config.fraction_of_gpu_memory <= 0.95f) {
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
index 07b17acd48..1f96067716 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -74,7 +74,7 @@ TEST(paddle_inference_api_impl, word2vec) {
   ASSERT_EQ(outputs.size(), 1UL);
   size_t len = outputs[0].data.length;
   float* data = static_cast(outputs[0].data.data);
-  for (int j = 0; j < len / sizeof(float); ++j) {
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
     ASSERT_LT(data[j], 1.0);
     ASSERT_GT(data[j], -1.0);
   }
@@ -92,7 +92,7 @@ TEST(paddle_inference_api_impl, word2vec) {
   TestInference(config.model_dir, cpu_feeds, cpu_fetchs1);
 
   float* lod_data = output1.data();
-  for (size_t i = 0; i < output1.numel(); ++i) {
+  for (int i = 0; i < output1.numel(); ++i) {
     EXPECT_LT(lod_data[i] - data[i], 1e-3);
     EXPECT_GT(lod_data[i] - data[i], -1e-3);
   }

From 38f8182df63d33ff619297d95f5a4431bf8d5362 Mon Sep 17 00:00:00 2001
From: tensor-tang 
Date: Fri, 1 Jun 2018 20:41:18 +0800
Subject: [PATCH 37/68] work around with dummy test

---
 .../fluid/inference/tests/book/CMakeLists.txt |  8 ++++++-
 .../tests/book/test_inference_nlp.cc          | 21 ++++++++++++++++---
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 90357f99d1..b33df2942a 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -35,7 +35,13 @@ inference_test(image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
-inference_test(nlp)
 #inference_test(rnn_encoder_decoder)
 #inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
+
+# This is an ugly workaround to make this test run
+cc_test(test_inference_nlp
+        SRCS test_inference_nlp.cc
+        DEPS paddle_fluid
+        ARGS
+        --modelpath=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index c4d7b0bbf0..5ece6084df 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -37,10 +37,22 @@ inline double GetCurrentMs() {
   return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
 }
 
+// This function just gives dummy data for recognize_digits model.
+size_t DummyData(std::vector* out) {
+  paddle::framework::LoDTensor input;
+  SetupTensor(&input, {1, 1, 28, 28}, -1.f, 1.f);
+  out->emplace_back(input);
+  return 1;
+}
+
 // Load the input word index data from file and save into LodTensor.
 // Return the size of words.
size_t LoadData(std::vector* out, const std::string& filename) { + if (filename.empty()) { + return DummyData(out); + } + size_t sz = 0; std::fstream fin(filename); std::string line; @@ -130,9 +142,12 @@ void ThreadRunInfer( } TEST(inference, nlp) { - if (FLAGS_modelpath.empty() || FLAGS_datafile.empty()) { - LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model " - << "--datafile=path/to/your/data"; + if (FLAGS_modelpath.empty()) { + LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model"; + } + if (FLAGS_datafile.empty()) { + LOG(WARNING) << " Not data file provided, will use dummy data!" + << "Note: if you use nlp model, please provide data file."; } LOG(INFO) << "Model Path: " << FLAGS_modelpath; LOG(INFO) << "Data File: " << FLAGS_datafile; From 99d00cce9330dac56aac52788d7fba76d0137430 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Jun 2018 21:04:51 +0800 Subject: [PATCH 38/68] follow comment: refine where time started --- paddle/fluid/inference/tests/book/test_inference_nlp.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5ece6084df..c3bec27925 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -185,12 +185,12 @@ TEST(inference, nlp) { std::vector> jobs; SplitData(datasets, &jobs, FLAGS_num_threads); std::vector> threads; + start_ms = GetCurrentMs(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads.emplace_back( new std::thread(ThreadRunInfer, i, &executor, scope.get(), std::ref(inference_program), std::ref(jobs))); } - start_ms = GetCurrentMs(); for (int i = 0; i < FLAGS_num_threads; ++i) { threads[i]->join(); } From e0a8c584701299017dda12214fed1bb108d8a9e9 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Fri, 1 Jun 2018 15:55:21 -0700 Subject: [PATCH 39/68] Doc: fix typo in var_desc.md. (#11130) --- doc/fluid/design/concepts/var_desc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md index 6750323c01..8db67f6703 100644 --- a/doc/fluid/design/concepts/var_desc.md +++ b/doc/fluid/design/concepts/var_desc.md @@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist ## Definition of VarType -A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following: +A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. 
A `VarDesc` then looks as the following: ```proto message VarDesc { From 906334a6a493b5f47cdb52fc93688ee945f96f27 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 3 Jun 2018 16:12:38 +0800 Subject: [PATCH 40/68] fix build error on mac --- paddle/fluid/operators/detail/grpc_server.h | 4 ++-- paddle/fluid/operators/listen_and_serv_op.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index d1fcbc414f..eae87de420 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -71,8 +71,8 @@ class AsyncGRPCServer final : public RPCServer { std::unique_ptr<::grpc::Server> server_; // condition of the sub program - std::mutex barrier_mutex_; - mutable int barrier_cond_step_; + // std::mutex barrier_mutex_; + // mutable int barrier_cond_step_; std::condition_variable barrier_condition_; std::mutex mutex_ready_; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 71e75c2532..66a0f87b46 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -222,8 +222,8 @@ static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope, h->SetDevCtx(dev_ctx); h->SetExecutor(executor); h->SetProgram(program); - h->SetPrefetchPreparedCtx(std::move( - std::unique_ptr(prefetch_ctx))); + h->SetPrefetchPreparedCtx( + std::unique_ptr(prefetch_ctx)); h->SetRPCServer(rpc_server); } From d3e99aeec33475cdb4a06bb0867f28503e91b3c7 Mon Sep 17 00:00:00 2001 From: Yuan Gao Date: Mon, 4 Jun 2018 09:46:49 +0800 Subject: [PATCH 41/68] add normalize switch to box_coder_op (#11129) --- .../fluid/operators/detection/box_coder_op.cc | 4 ++ .../fluid/operators/detection/box_coder_op.cu | 42 ++++++++++-------- .../fluid/operators/detection/box_coder_op.h | 44 +++++++++++-------- .../tests/unittests/test_box_coder_op.py | 32 ++++++++++---- 4 files changed, 78 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 74848005d0..76ef08cb9a 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -91,6 +91,10 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "the code type used with the target box") .SetDefault("encode_center_size") .InEnum({"encode_center_size", "decode_center_size"}); + AddAttr("box_normalized", + "(bool, default true) " + "whether treat the priorbox as a noramlized box") + .SetDefault(true); AddOutput("OutputBox", "(LoDTensor or Tensor) " "When code_type is 'encode_center_size', the output tensor of " diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 8cef8e0343..fc7eb5d1ed 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -20,15 +20,16 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, const T* prior_box_var_data, const T* target_box_data, const int row, const int col, const int len, - T* output) { + const bool normalized, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < row * col) { const int row_idx = idx / col; const int col_idx = idx % col; - T prior_box_width = - prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; - T prior_box_height = - prior_box_data[col_idx 
* len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_width = prior_box_data[col_idx * len + 2] - + prior_box_data[col_idx * len] + (normalized == false); + T prior_box_height = prior_box_data[col_idx * len + 3] - + prior_box_data[col_idx * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; T prior_box_center_y = (prior_box_data[col_idx * len + 3] + @@ -41,10 +42,11 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data, T target_box_center_y = (target_box_data[row_idx * len + 3] + target_box_data[row_idx * len + 1]) / 2; - T target_box_width = - target_box_data[row_idx * len + 2] - target_box_data[row_idx * len]; - T target_box_height = - target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1]; + T target_box_width = target_box_data[row_idx * len + 2] - + target_box_data[row_idx * len] + (normalized == false); + T target_box_height = target_box_data[row_idx * len + 3] - + target_box_data[row_idx * len + 1] + + (normalized == false); output[idx * len] = (target_box_center_x - prior_box_center_x) / prior_box_width / prior_box_var_data[col_idx * len]; @@ -63,14 +65,15 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data, const T* prior_box_var_data, const T* target_box_data, const int row, const int col, const int len, - T* output) { + const bool normalized, T* output) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < row * col) { const int col_idx = idx % col; - T prior_box_width = - prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len]; - T prior_box_height = - prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1]; + T prior_box_width = prior_box_data[col_idx * len + 2] - + prior_box_data[col_idx * len] + (normalized == false); + T prior_box_height = prior_box_data[col_idx * len + 3] - + prior_box_data[col_idx * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2; T prior_box_center_y = (prior_box_data[col_idx * len + 3] + @@ -93,8 +96,10 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data, output[idx * len] = target_box_center_x - target_box_width / 2; output[idx * len + 1] = target_box_center_y - target_box_height / 2; - output[idx * len + 2] = target_box_center_x + target_box_width / 2; - output[idx * len + 3] = target_box_center_y + target_box_height / 2; + output[idx * len + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[idx * len + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); } } @@ -128,14 +133,15 @@ class BoxCoderCUDAKernel : public framework::OpKernel { T* output = output_box->data(); auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - output); + normalized, output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { DecodeCenterSizeKernel<<>>( prior_box_data, prior_box_var_data, target_box_data, row, col, len, - output); + normalized, output); } } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 77fc6c2b62..3dc68935ac 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -34,7 +34,7 @@ class BoxCoderKernel 
: public framework::OpKernel { void EncodeCenterSize(const framework::Tensor& target_box, const framework::Tensor& prior_box, const framework::Tensor& prior_box_var, - T* output) const { + const bool normalized, T* output) const { int64_t row = target_box.dims()[0]; int64_t col = prior_box.dims()[0]; int64_t len = prior_box.dims()[1]; @@ -44,10 +44,11 @@ class BoxCoderKernel : public framework::OpKernel { for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { - T prior_box_width = - prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_width = prior_box_data[j * len + 2] - + prior_box_data[j * len] + (normalized == false); + T prior_box_height = prior_box_data[j * len + 3] - + prior_box_data[j * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; T prior_box_center_y = @@ -57,10 +58,11 @@ class BoxCoderKernel : public framework::OpKernel { (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; T target_box_center_y = (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; - T target_box_width = - target_box_data[i * len + 2] - target_box_data[i * len]; - T target_box_height = - target_box_data[i * len + 3] - target_box_data[i * len + 1]; + T target_box_width = target_box_data[i * len + 2] - + target_box_data[i * len] + (normalized == false); + T target_box_height = target_box_data[i * len + 3] - + target_box_data[i * len + 1] + + (normalized == false); size_t offset = i * col * len + j * len; output[offset] = (target_box_center_x - prior_box_center_x) / @@ -79,7 +81,7 @@ class BoxCoderKernel : public framework::OpKernel { void DecodeCenterSize(const framework::Tensor& target_box, const framework::Tensor& prior_box, const framework::Tensor& prior_box_var, - T* output) const { + const bool normalized, T* output) const { int64_t row = target_box.dims()[0]; int64_t col = prior_box.dims()[0]; int64_t len = prior_box.dims()[1]; @@ -91,10 +93,11 @@ class BoxCoderKernel : public framework::OpKernel { for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; - T prior_box_width = - prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_width = prior_box_data[j * len + 2] - + prior_box_data[j * len] + (normalized == false); + T prior_box_height = prior_box_data[j * len + 3] - + prior_box_data[j * len + 1] + + (normalized == false); T prior_box_center_x = (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; T prior_box_center_y = @@ -116,8 +119,10 @@ class BoxCoderKernel : public framework::OpKernel { output[offset] = target_box_center_x - target_box_width / 2; output[offset + 1] = target_box_center_y - target_box_height / 2; - output[offset + 2] = target_box_center_x + target_box_width / 2; - output[offset + 3] = target_box_center_y + target_box_height / 2; + output[offset + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[offset + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); } } } @@ -139,11 +144,14 @@ class BoxCoderKernel : public framework::OpKernel { output_box->mutable_data({row, col, len}, context.GetPlace()); auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); T* output = output_box->data(); if (code_type == 
BoxCodeType::kEncodeCenterSize) { - EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + EncodeCenterSize(*target_box, *prior_box, *prior_box_var, normalized, + output); } else if (code_type == BoxCodeType::kDecodeCenterSize) { - DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output); + DecodeCenterSize(*target_box, *prior_box, *prior_box_var, normalized, + output); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 56f5af91d8..a31b7ea322 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -19,7 +19,8 @@ import math from op_test import OpTest -def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): +def box_coder(target_box, prior_box, prior_box_var, output_box, code_type, + box_normalized): prior_box_x = ( (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0]) prior_box_y = ( @@ -30,6 +31,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0]) prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0], prior_box_var.shape[1]) + if not box_normalized: + prior_box_height = prior_box_height + 1 + prior_box_width = prior_box_width + 1 if (code_type == "EncodeCenterSize"): target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape( @@ -40,6 +44,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): target_box.shape[0], 1) target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape( target_box.shape[0], 1) + if not box_normalized: + target_box_height = target_box_height + 1 + target_box_width = target_box_width + 1 output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \ prior_box_var[:,:,0] @@ -64,9 +71,13 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type): output_box[:, :, 1] = target_box_y - target_box_height / 2 output_box[:, :, 2] = target_box_x + target_box_width / 2 output_box[:, :, 3] = target_box_y + target_box_height / 2 + if not box_normalized: + output_box[:, :, 2] = output_box[:, :, 2] - 1 + output_box[:, :, 3] = output_box[:, :, 3] - 1 -def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): +def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type, + box_normalized): n = target_box.shape[0] m = prior_box.shape[0] output_box = np.zeros((n, m, 4), dtype=np.float32) @@ -74,11 +85,11 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type): if (code_type == "EncodeCenterSize"): box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var, output_box[lod[i]:lod[i + 1], :, :], - code_type) + code_type, box_normalized) elif (code_type == "DecodeCenterSize"): box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box, prior_box_var, output_box[lod[i]:lod[i + 1], :, :], - code_type) + code_type, box_normalized) return output_box @@ -93,15 +104,19 @@ class TestBoxCoderOp(OpTest): prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((5, 10, 4)).astype('float32') code_type = "DecodeCenterSize" + box_normalized = False output_box = batch_box_coder(prior_box, prior_box_var, target_box, - lod[0], code_type) + lod[0], code_type, box_normalized) self.inputs = { 'PriorBox': prior_box, 'PriorBoxVar': prior_box_var, 'TargetBox': target_box, } - self.attrs = {'code_type': 
'decode_center_size'} + self.attrs = { + 'code_type': 'decode_center_size', + 'box_normalized': False + } self.outputs = {'OutputBox': output_box} @@ -116,15 +131,16 @@ class TestBoxCoderOpWithLoD(OpTest): prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((20, 4)).astype('float32') code_type = "EncodeCenterSize" + box_normalized = True output_box = batch_box_coder(prior_box, prior_box_var, target_box, - lod[0], code_type) + lod[0], code_type, box_normalized) self.inputs = { 'PriorBox': prior_box, 'PriorBoxVar': prior_box_var, 'TargetBox': (target_box, lod), } - self.attrs = {'code_type': 'encode_center_size'} + self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True} self.outputs = {'OutputBox': output_box} From 7218f4feb28e295b55e0eedd1d0e437e190c4de8 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 4 Jun 2018 10:42:26 +0800 Subject: [PATCH 42/68] update by comment --- .../fluid/tests/unittests/test_listen_and_serv_op.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index ad479657cc..6be24b0944 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -72,8 +72,8 @@ class TestListenAndServOp(OpTest): def _wait_ps_ready(self, pid): retry_times = self.ps_timeout while True: - time.sleep(1) assert retry_times >= 0, "wait ps ready failed" + time.sleep(0.5) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. @@ -87,15 +87,15 @@ class TestListenAndServOp(OpTest): pid = self._start_pserver(False, True) self._wait_ps_ready(pid) - # raise SIGINT to pserver - os.kill(pid, signal.SIGINT) + # raise SIGTERM to pserver + os.kill(pid, signal.SIGTERM) # run pserver on CPU in async mode pid = self._start_pserver(False, False) self._wait_ps_ready(pid) # raise SIGTERM to pserver - os.kill(pid, signal.SIGINT) + os.kill(pid, signal.SIGTERM) if __name__ == '__main__': From ee4e567dea8074245c89be79b0267a515a84c6b6 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 4 Jun 2018 10:45:59 +0800 Subject: [PATCH 43/68] Creating readers before training begining --- .../fluid/operators/reader/open_files_op.cc | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 8c0dac65dd..c0526e45f4 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -26,7 +26,11 @@ class MultiFileReader : public framework::ReaderBase { MultiFileReader(const std::vector& file_names, const std::vector& dims, size_t thread_num, size_t buffer_size) - : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) { + : buffer_size_(buffer_size) { + readers_.resize(file_names.size()); + for (const std::string& f_name : file_names) { + readers_.emplace_back(CreateReaderByFileName(f_name, dims)); + } prefetchers_.resize(thread_num); StartNewScheduler(); } @@ -40,14 +44,13 @@ class MultiFileReader : public framework::ReaderBase { void StartNewScheduler(); void EndScheduler(); void ScheduleThreadFunc(); - void PrefetchThreadFunc(std::string file_name, size_t thread_idx); + void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx); - std::vector file_names_; 
- std::vector dims_; + std::vector> readers_; std::thread scheduler_; std::vector prefetchers_; size_t buffer_size_; - reader::BlockingQueue* waiting_file_idx_; + reader::BlockingQueue* waiting_reader_idx_; reader::BlockingQueue* available_thread_idx_; reader::BlockingQueue>* buffer_; }; @@ -60,20 +63,23 @@ void MultiFileReader::ReadNext(std::vector* out) { void MultiFileReader::ReInit() { EndScheduler(); + for (auto& reader : readers_) { + reader->ReInit(); + } StartNewScheduler(); } void MultiFileReader::StartNewScheduler() { size_t thread_num = prefetchers_.size(); - waiting_file_idx_ = new reader::BlockingQueue(file_names_.size()); + waiting_reader_idx_ = new reader::BlockingQueue(readers_.size()); available_thread_idx_ = new reader::BlockingQueue(thread_num); buffer_ = new reader::BlockingQueue>( buffer_size_); - for (size_t i = 0; i < file_names_.size(); ++i) { - waiting_file_idx_->Send(i); + for (size_t i = 0; i < readers_.size(); ++i) { + waiting_reader_idx_->Send(i); } - waiting_file_idx_->Close(); + waiting_reader_idx_->Close(); for (size_t i = 0; i < thread_num; ++i) { available_thread_idx_->Send(i); } @@ -84,13 +90,13 @@ void MultiFileReader::StartNewScheduler() { void MultiFileReader::EndScheduler() { available_thread_idx_->Close(); buffer_->Close(); - waiting_file_idx_->Close(); + waiting_reader_idx_->Close(); if (scheduler_.joinable()) { scheduler_.join(); } delete buffer_; delete available_thread_idx_; - delete waiting_file_idx_; + delete waiting_reader_idx_; } void MultiFileReader::ScheduleThreadFunc() { @@ -102,12 +108,11 @@ void MultiFileReader::ScheduleThreadFunc() { if (prefetcher.joinable()) { prefetcher.join(); } - size_t file_idx; - if (waiting_file_idx_->Receive(&file_idx)) { + size_t reader_idx; + if (waiting_reader_idx_->Receive(&reader_idx)) { // Still have files to read. Start a new prefetch thread. - std::string file_name = file_names_[file_idx]; - prefetcher = std::thread([this, file_name, thread_idx] { - PrefetchThreadFunc(file_name, thread_idx); + prefetcher = std::thread([this, reader_idx, thread_idx] { + PrefetchThreadFunc(reader_idx, thread_idx); }); } else { // No more file to read. @@ -129,11 +134,9 @@ void MultiFileReader::ScheduleThreadFunc() { VLOG(5) << "MultiFileReader schedule thread terminates."; } -void MultiFileReader::PrefetchThreadFunc(std::string file_name, - size_t thread_idx) { - VLOG(5) << "The prefetch thread of file '" << file_name << "' starts."; - std::unique_ptr reader = - CreateReaderByFileName(file_name, dims_); +void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) { + VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts."; + std::unique_ptr& reader = readers_[reader_idx]; while (true) { std::vector ins; reader->ReadNext(&ins); @@ -144,8 +147,8 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name, buffer_->Send(std::move(ins)); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " - "thread of file '" - << file_name << "' will terminate."; + "thread of file idx '" + << reader_idx << "' will terminate."; break; } } @@ -154,7 +157,8 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name, VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. 
" "Fail to send thread_idx."; } - VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates."; + VLOG(5) << "The prefetch thread of file idx '" << reader_idx + << "' terminates."; } class OpenFilesOp : public framework::OperatorBase { From 7f5eb9f68743093ed14d5c35ebd0d266800ac939 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 4 Jun 2018 10:46:02 +0800 Subject: [PATCH 44/68] update by comment --- .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 6be24b0944..1226027ddc 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -82,6 +82,10 @@ class TestListenAndServOp(OpTest): except os.error: retry_times -= 1 + def test_rpc_interfaces(self): + # TODO(Yancey1989): need to make sure the rpc interface correctly. + pass + def test_handle_signal_in_serv_op(self): # run pserver on CPU in sync mode pid = self._start_pserver(False, True) From 6ae7cbe252178e7bd3e5c3b7cde21581948b478f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 4 Jun 2018 11:08:08 +0800 Subject: [PATCH 45/68] follow comments --- .../fluid/inference/tests/book/CMakeLists.txt | 3 ++- .../tests/book/test_inference_nlp.cc | 21 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index b33df2942a..2fa5a9540b 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -40,8 +40,9 @@ inference_test(recommender_system) inference_test(word2vec) # This is an unly work around to make this test run +# TODO(TJ): clean me up cc_test(test_inference_nlp SRCS test_inference_nlp.cc DEPS paddle_fluid ARGS - --modelpath=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) + --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index c3bec27925..70aa42ac41 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include #endif -DEFINE_string(modelpath, "", "Directory of the inference model."); -DEFINE_string(datafile, "", "File of input index data."); +DEFINE_string(model_path, "", "Directory of the inference model."); +DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); @@ -65,6 +65,7 @@ size_t LoadData(std::vector* out, ids.push_back(stoi(field)); } if (ids.size() >= 1024) { + // Synced with NLP guys, they will ignore input larger then 1024 continue; } @@ -142,18 +143,18 @@ void ThreadRunInfer( } TEST(inference, nlp) { - if (FLAGS_modelpath.empty()) { - LOG(FATAL) << "Usage: ./example --modelpath=path/to/your/model"; + if (FLAGS_model_path.empty()) { + LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model"; } - if (FLAGS_datafile.empty()) { - LOG(WARNING) << " Not data file provided, will use dummy data!" + if (FLAGS_data_file.empty()) { + LOG(WARNING) << "No data file provided, will use dummy data!" << "Note: if you use nlp model, please provide data file."; } - LOG(INFO) << "Model Path: " << FLAGS_modelpath; - LOG(INFO) << "Data File: " << FLAGS_datafile; + LOG(INFO) << "Model Path: " << FLAGS_model_path; + LOG(INFO) << "Data File: " << FLAGS_data_file; std::vector datasets; - size_t num_total_words = LoadData(&datasets, FLAGS_datafile); + size_t num_total_words = LoadData(&datasets, FLAGS_data_file); LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size(); LOG(INFO) << "Total number of words: " << num_total_words; @@ -168,7 +169,7 @@ TEST(inference, nlp) { // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; inference_program = - InitProgram(&executor, scope.get(), FLAGS_modelpath, model_combined); + InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined); if (FLAGS_use_mkldnn) { EnableMKLDNN(inference_program); } From f1facb7a3c8b61d9231d27408b173e388818f44b Mon Sep 17 00:00:00 2001 From: weixing02 Date: Mon, 4 Jun 2018 11:33:35 +0800 Subject: [PATCH 46/68] Fix links --- doc/v2/build_and_install/build_from_source_cn.rst | 12 ++++++------ doc/v2/build_and_install/build_from_source_en.rst | 12 +++++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index 741c01ce54..de7e9eb75c 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -23,7 +23,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 在 `这里 `__ 找到 paddle_manylinux_devel 镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 -如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。 编译PaddlePaddle,需要执行: @@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 学习 Docker 有多难? - 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + 理解 Docker 并不难,大概花十分钟看一下 `这篇文章 `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - 我可以用 IDE 吗? @@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 可以并行编译吗? 
- 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + 是的。我们的 Docker image 运行一个 `Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - Docker 需要 sudo @@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 在 Windows/MacOS 上编译很慢 - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。 + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue `_ 。 - 磁盘不够 - 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。 + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 `_ 来清理这些内容。 .. _compile_deps: @@ -211,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行, 编译选项的设置 ++++++++++++++ -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 .. code-block:: bash diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst index b06c43e19d..b08b45d43e 100644 --- a/doc/v2/build_and_install/build_from_source_en.rst +++ b/doc/v2/build_and_install/build_from_source_en.rst @@ -11,7 +11,7 @@ To build PaddlePaddle, you need 1. A computer -- Linux, Windows, MacOS. 2. Docker. -Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image. .. _build_step: @@ -26,6 +26,8 @@ you can also find how to build and use paddle_manylinux_devel Docker image from `here `__ Or you can build your own image from source as the optional step below: +If you don't wish to use docker,you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation. + .. code-block:: bash # 1. clone the source code @@ -108,7 +110,7 @@ Frequently Asked Questions - How difficult is it to learn Docker? - It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. + It takes you ten minutes to read `an introductory article `_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. - Can I use my favorite IDE? @@ -125,7 +127,7 @@ Frequently Asked Questions - Does Docker do parallel building? 
- Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. + Our building Docker image runs a `Bash script `_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. - Docker requires sudo @@ -133,11 +135,11 @@ Frequently Asked Questions - Docker on Windows/MacOS builds slowly - On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details. + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to `this issue `_ for details. - Not enough disk space - Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/). + Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to `this article `_ . .. _compile_deps: From 05a2a1a9064cdd1bf02f46603b2ee814bee4031d Mon Sep 17 00:00:00 2001 From: weixing02 Date: Mon, 4 Jun 2018 05:24:31 +0000 Subject: [PATCH 47/68] update apis --- doc/fluid/api/io.rst | 18 ++++++++++ doc/fluid/api/layers.rst | 72 ++++++++++++++++++++++++------------- doc/fluid/api/optimizer.rst | 44 ++++++++--------------- doc/fluid/api/profiler.rst | 12 +++++++ 4 files changed, 92 insertions(+), 54 deletions(-) diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst index dd9d88b669..3e956f8302 100644 --- a/doc/fluid/api/io.rst +++ b/doc/fluid/api/io.rst @@ -59,3 +59,21 @@ get_inference_program .. autofunction:: paddle.fluid.io.get_inference_program :noindex: +save_checkpoint +--------------- + +.. autofunction:: paddle.fluid.io.save_checkpoint + :noindex: + +load_checkpoint +--------------- + +.. autofunction:: paddle.fluid.io.load_checkpoint + :noindex: + +clean_checkpoint +---------------- + +.. autofunction:: paddle.fluid.io.clean_checkpoint + :noindex: + diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index 5329adaa18..f78e6db326 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -181,6 +181,12 @@ Print .. autofunction:: paddle.fluid.layers.Print :noindex: +is_empty +-------- + +.. autofunction:: paddle.fluid.layers.is_empty + :noindex: + device ====== @@ -255,6 +261,19 @@ double_buffer .. autofunction:: paddle.fluid.layers.double_buffer :noindex: +random_data_generator +--------------------- + +.. autofunction:: paddle.fluid.layers.random_data_generator + :noindex: + +Preprocessor +------------ + +.. autoclass:: paddle.fluid.layers.Preprocessor + :members: + :noindex: + nn == @@ -594,6 +613,29 @@ roi_pool .. autofunction:: paddle.fluid.layers.roi_pool :noindex: +dice_loss +--------- + +.. 
autofunction:: paddle.fluid.layers.dice_loss + :noindex: + +resize_bilinear +--------------- + +.. autofunction:: paddle.fluid.layers.resize_bilinear + :noindex: + +gather +------ + +.. autofunction:: paddle.fluid.layers.gather + :noindex: + +random_crop +----------- + +.. autofunction:: paddle.fluid.layers.random_crop + :noindex: ops === @@ -742,6 +784,12 @@ sum .. autofunction:: paddle.fluid.layers.sum :noindex: +shape +----- + +.. autofunction:: paddle.fluid.layers.shape + :noindex: + sigmoid ------- @@ -991,27 +1039,3 @@ zeros .. autofunction:: paddle.fluid.layers.zeros :noindex: -topk ----- - -.. autofunction:: paddle.fluid.layers.topk - :noindex: - -dice_loss ----- - -.. autofunction:: paddle.fluid.layers.dice_loss - :noindex: - -resize_bilinear -____ - -.. autofunction:: paddle.fluid.layers.resize_bilinear - :noindex: - -gather -____ - -.. autofunction:: paddle.fluid.layers.gather - :noindex: - diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index df2bd2eace..6ad44bb690 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -47,28 +47,6 @@ DecayedAdagrad :members: :noindex: -Adadelta ------------------ - -.. autoclass:: paddle.fluid.optimizer.Adadelta - :members: - :noindex: - -RMSProp ------------------ - -.. autoclass:: paddle.fluid.optimizer.RMSProp - :members: - :noindex: - -ModelAverage ------------------ - -.. autoclass:: paddle.fluid.optimizer.ModelAverage - :members: - :noindex: - - SGDOptimizer ------------ @@ -111,25 +89,31 @@ DecayedAdagradOptimizer :members: :noindex: +RMSPropOptimizer +---------------- -AdadeltaOptimizer ------------------ - -.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer :members: :noindex: +Adadelta +-------- -RMSPropOptimizer ------------------ +.. autoclass:: paddle.fluid.optimizer.Adadelta + :members: + :noindex: -.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer +ModelAverage +------------ + +.. autoclass:: paddle.fluid.optimizer.ModelAverage :members: :noindex: - + Optimizer --------- .. autoclass:: paddle.fluid.optimizer.Optimizer :members: :noindex: + diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst index 74d102dcb0..39fda65863 100644 --- a/doc/fluid/api/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -23,3 +23,15 @@ profiler .. autofunction:: paddle.fluid.profiler.profiler :noindex: +start_profiler +-------------- + +.. autofunction:: paddle.fluid.profiler.start_profiler + :noindex: + +stop_profiler +------------- + +.. 
autofunction:: paddle.fluid.profiler.stop_profiler + :noindex: + From 744cc412b779333f2869653d9c6d462fba3521eb Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 4 Jun 2018 14:11:25 +0800 Subject: [PATCH 48/68] fix a bug --- paddle/fluid/operators/reader/open_files_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index c0526e45f4..2cbae8a90a 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -27,7 +27,7 @@ class MultiFileReader : public framework::ReaderBase { const std::vector& dims, size_t thread_num, size_t buffer_size) : buffer_size_(buffer_size) { - readers_.resize(file_names.size()); + readers_.reserve(file_names.size()); for (const std::string& f_name : file_names) { readers_.emplace_back(CreateReaderByFileName(f_name, dims)); } From 6ac47a3d90c64f283fb4b5ab30acac46efb2fe91 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 4 Jun 2018 14:48:02 +0800 Subject: [PATCH 49/68] rename Mkldnn to MKLDNN --- .../fluid/operators/activation_mkldnn_op.cc | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index b892ac77d9..46ed99bcf2 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -222,35 +222,35 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor { }; template -using ReluMkldnnFunctor = +using ReluMKLDNNFunctor = MKLDNNActivationFunc; template -using TanhMkldnnFunctor = +using TanhMKLDNNFunctor = MKLDNNActivationFunc; template -using SqrtMkldnnFunctor = +using SqrtMKLDNNFunctor = MKLDNNActivationFunc; template -using AbsMkldnnFunctor = +using AbsMKLDNNFunctor = MKLDNNActivationFunc; template -using ReluMkldnnGradFunctor = +using ReluMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using TanhMkldnnGradFunctor = +using TanhMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using SqrtMkldnnGradFunctor = +using SqrtMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using AbsMkldnnGradFunctor = +using AbsMKLDNNGradFunctor = MKLDNNActivationGradFunc; } // namespace operators } // namespace paddle @@ -265,9 +265,9 @@ namespace ops = paddle::operators; ops::MKLDNNActivationGradKernel>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \ - __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \ - __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \ - __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); + __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ + __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ + __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ + __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); From 1666cf96bfed426be8debdefa9a8a561f5a8a13f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 4 Jun 2018 14:53:24 +0800 Subject: [PATCH 50/68] update by comment --- doc/fluid/howto/cluster/fluid_recordio.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md index 0e8b98542d..55ce63ec19 100644 --- a/doc/fluid/howto/cluster/fluid_recordio.md +++ b/doc/fluid/howto/cluster/fluid_recordio.md @@ -23,7 +23,10 @@ as 
follows: fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder) ``` -The above codes would generate a RecordIO `./mnist.recordio` on your host. +The above code snippet would generate a RecordIO `./mnist.recordio` on your host. + +**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can +adjust it flexibly while reading it. ## Use the RecordIO file in a Local Training Job @@ -96,7 +99,7 @@ The above codes would generate multiple RecordIO files on your host like: |-mnist-00004.recordio ``` -1. open multiple RecordIO files by `fluid.layers.io.open_files` +2. open multiple RecordIO files by `fluid.layers.io.open_files` For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes, each trainer process reads parts of the whole training data, we usually take the following approach to make the training From 3526ac1136eb51decb567b4cd0230ac9bae5c244 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 4 Jun 2018 15:35:06 +0800 Subject: [PATCH 51/68] refine code --- paddle/fluid/operators/reader/open_files_op.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 2cbae8a90a..31e5d81e55 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -63,9 +63,6 @@ void MultiFileReader::ReadNext(std::vector* out) { void MultiFileReader::ReInit() { EndScheduler(); - for (auto& reader : readers_) { - reader->ReInit(); - } StartNewScheduler(); } @@ -141,6 +138,7 @@ void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) { std::vector ins; reader->ReadNext(&ins); if (ins.empty()) { + reader->ReInit(); break; } try { From 97b2f6f5f961575633b3ba0e8ea7b5c2fd5d4396 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 4 Jun 2018 17:02:37 +0800 Subject: [PATCH 52/68] ignore pyc files --- tools/codestyle/.gitignore | 1 + tools/codestyle/docstring_checker.pyc | Bin 11769 -> 0 bytes 2 files changed, 1 insertion(+) create mode 100644 tools/codestyle/.gitignore delete mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore new file mode 100644 index 0000000000..0d20b6487c --- /dev/null +++ b/tools/codestyle/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc deleted file mode 100644 index 07e875aec6c9bae8002bde4223348c6a29647b03..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11769 zcmdT~%X1XR8SmL$t+b1Tc#4Oy8S`8MD~X3+M8*~fV-j0Xvp5to)Ogt$Nh8hftfpse z$%aa$aOH8#E%^^pm8zUlxutT5Z$2h}Kyu6}hg439EBXDto}FEU>_FK9N7A&qzkYxH zy}R@O7%8{*jJ)4a=}#X2-^8bXiNsTC4ymQQoC+;9hm6VR)Lc&Tc@^eWv!Lb*syU$M z29(7X1r-)mb5P9aQHDS=gNEPIiB>FmX6Ypt^I?AVCV)AExb6bRfzlxi+FjUwP(V;15<<$#sa49H;yjXU1$HB> zYi%z^p{MO+(YM?6nACwirPN^iq2Kgdi5@T8_{k2COO^wC9+^Z=59fvhbaQf^c~Yt6 zDI6oud$v+Q>g3s&xey4&frB`Nlp3L5kIfmeR{P=n(>Nj54O&6sx`#O@Csx*|m2HuW zy~LVnBnw?K1!Ye43c3D5zQ0gVQ-Snx&LT1<^o9aj663o6c8+@k^I)xLd8m`E0HnMHeBxuCCTbCY#g2yg^6*DBOB z`Y%XCQrW;d)Ro^zf~ckK`8aA)c8RDWk=8nmMZ2zTa@;99y6H4(`fiw&v(Lb ztv7&>b%eQMEs9uJLX+WSiOyCcf@-EalS93yMUu$&&O6%)pvLL$^Snd!8NMPxI{R3p znm3}RS0vMm2uMU}w8J2Q;)JB)0*H*Zvy)BsA~8*pdYtHwf@IMMEK0F@yY08Uge!F+ zyj<40$y(bNaYkfRWVvJ(P&K4S)G#dEH&s3HMNx#3Hk>iE+iZU3CgpHmK}M6I)lfM% z41OK~NAIwTxlt&JhkkS0ti6C2tpmAoPlaT%kjW7^Ehi5ZDbEI&2bF4i37 
zRcutwZA@+ql*e8s%!ZT-pQsdM)$w=3f(Z=dKE&j+MsBRV*M;c~)2L}Ydi z3$C#!?7I+0owh^6U9`DT*i`1mqAu$m_t!cf|~IT}UC*^@IV^) zqyiL+%13cf?SaEN56UX3m%XyUwg3iK6tEG%whC+%ux$d{4%iNX?F4L>z;**RCa^t# z?G@NQ!1fF5IlvAG?0LXm5ZFP$Y=OO~UZx*BE>7?v@nnXTcUUrTfR9K9KJQV)B8C)K)XpE}S|rAagZSLt?FqgJ&Y>44@+J*>`0akHLOp&L8mV;x@C zblG2nE9yPcj;w{rs%Exo*DEHu_rRjpwGG&{mTyygbAesA4le%%`?P?y*a%pfUlkpH z8eQNId-g|*e#@@gdNGQV{=Ps1x;Np|2k>6Z7{d5BUz?mft;rxWy_vko*RYck8@7N= z6Tm(+e#mb7aS(ZCW>i#Jq1Vscx86rbsi$LPGoR6~Vdm*Dw1H4)1G+aX~!^XEg%eX;q}v#*(5KCfdYi5ZHy~q z`*9q_rx@(9&)E{put9EEEgdCI=~gxQs)HmF`9L5u`=ixHEu&iF9d|ZzH49SN9M2b7#X^9_% zc$<+=+I}kqWqwLZBPPI?xjEC1I6dOmR8lWzoF(`^z@IHsk(vqXc`GRqZ0zpz92{F5 z8n7k^?X7yK`$h;Ty=EmwB@sLkYNCtHnniIZ5hWD(kd5i>nW97G=-5oCDn(gKOcLBm z&qE{5!=VydVd90PR4JCQ0SF=V*(+3|+@lQJ)AF05;1inDj~wF90o1#YsG*VEm{rJ? zUW1A&3N%M=J&?Lurh$g*W@Quco^C}YK=MfFg1s3sbW*C$cnSB`>>sKguJ&o?- zzttO{PB0sgNGVwB*I%^-qWGaRU;#{<3JDa9?M{h-0wN!#rthdocL^1Iss+}w02G%? zsuvbmtZUb3QXy?4&|Tr~r+d;uT)!b|$bj8&VMe5M5QIqU$Y9S)CbcqIxnCu?w_pt{ zOQn{Fl)j)KEe!+`k}?&(#vMT~%vQKTf;|tu4tf!dIjaTK4R$>o5h%_%OemzDhMik} zg)4MuDmaXV_hg5(m<4_%v>~Da3eIgJHdBY|Mk8Bf)>FppMa=tKd}$z-g5ZdC& z4XOafQ1}Uge};@fFsyl?G2WSB7K#Tb%M-;9kVj~Cj)BFQJC1}Efk=tW0SzWPh(7b+)csGP375g4T+&kQ0PzLr<8>{_X%~vP68<9azu?M478wr0xb{T3y#}+k% zCTvxdi-h+KWkZf4&JCw^h!ZeY4SB%8cFv+O5IUqD(tkwKm7ZN1G{M-^Dyra$!Je|u zO`f2q2Nwse#k=X*IZt3Ii3!y;mwF>S5piv3DDiX~)aXPok$V=OX3zsV<<2e!ihxZc zVu7t6SSun^yBBfnDWYy3#3Hf`C{k*CQvS6<6CJYw;zVjw3?A|OrWX8`zuJy3{FT!f zYSCO>S<(>!l}Cav5TADF8us&&5Y?lNjc~WwxW{8VQIzH5lz+mPa!&~%}>{{eiDs!b@iu2cu@XHp%TGCTzj<3T?Io#_+$;11M* zBfv%=2fZZX8~g&Q5ar3MA+QW@d%!ZdWD%C3M$vyj(z7x8n5N4a5*FzC@AmQW<93hI zACp($q8s-FJhvI_BJ|E=+@U+)Pek-0j3j-SSR9ec&!JYl4(!j|Y`o3(J-NIQ?V!cx zu#7W|KVNNo+QGhsh}w2sgpVpk?^fY4!&TD}o6_iQNi zWguZA3UwS-)R^2rBKJFnDf+1K7^}Y})HsE;j6uf#Z)&7k<0mq52f8d68Hw9A=s@U6 zSeOW3-?^}Xq#Q<&lrJ%0Zo(6S7@ScnWAgL@vJ`Sss-R*mKyQMZAw^MwjAS&hMO{*( zEX9O)#nE34llL=$hy#{(_>3C*3S6-(Z!=bar(C$X5mgQQmy6{C3@Gd^s+QM6`!2r8gno`&E?{*f>r%yGoZGGN$&H~=!8?oaM)5Oh2g*Oc+V9q3PF6|24 z4!H-F=nSua8;e2c#eS=gxoCt;VD||*lZ*eGR^b}u# zI^#@uP2fy2p><{6j<^9|$;rh6E@|ohUcXkmdgJP~+39!OcQ4m!(^oFLS7#(@_WjGZ zZeDX{`oPXD4n(_1uQl&Rp_p`S+&;xYvL@(mYP`pzP>9;{j>vffdFM?g(@YrEb1pKW z<16;NSYr;AphMZ?aOXXhE4j^a9%$o5`1JRX6!S%^1bs`r%ilcSnw9bXq>S94q}!3> zXG(+C2rrNK$hC2?R2m*Djg@wm_Lug{yDGuEag>5pBl$-$i@EX~@7^*}Hz)_EtR!rm z*2=5ec|4ib=HU3?r4fZ;`ZP;kemW=ElZck%FlLvD*mGM?1AlV(QzjUFVJjw0uhfUF QVT9NjS|84r2F9-c7hLv=%K!iX From 9d3114c8c99b46a90b76811b3a2b19b1c1e31919 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Mon, 4 Jun 2018 18:42:44 +0800 Subject: [PATCH 53/68] fix transpiler error --- .../paddle/fluid/transpiler/distribute_transpiler.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index da001add8e..27992df462 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -187,12 +187,17 @@ class DistributeTranspiler: param_list = [] grad_list = [] + param_grad_set = set() for p, g in self.params_grads: # skip parameter marked not trainable if type(p) == Parameter and p.trainable == False: continue - param_list.append(p) - grad_list.append(g) + if p.name not in param_grad_set: + param_list.append(p) + param_grad_set.add(p.name) + if g.name not in param_grad_set: + grad_list.append(g) + param_grad_set.add(g.name) self._update_dist_lookup_table_vars(param_list, grad_list, self.params_grads) @@ -829,6 +834,9 @@ class DistributeTranspiler: if not 
block_map.has_key(varname): block_map[varname] = [] block_map[varname].append((long(offset), long(size))) + # Do not remove this important debug message: + print("block map: %s" % block_map) + for varname, splited in block_map.iteritems(): orig_var = program.global_block().var(varname) if len(splited) == 1: From 1766406f76ebf93f3ff1acc57c15019c4a2dacdd Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 4 Jun 2018 19:47:16 +0800 Subject: [PATCH 54/68] delete unused code --- paddle/fluid/operators/detail/grpc_server.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index eae87de420..e6ffc7066f 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -71,8 +71,6 @@ class AsyncGRPCServer final : public RPCServer { std::unique_ptr<::grpc::Server> server_; // condition of the sub program - // std::mutex barrier_mutex_; - // mutable int barrier_cond_step_; std::condition_variable barrier_condition_; std::mutex mutex_ready_; From dcf40fd0bc5a7ed48e954feb591c63defd689f45 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 4 Jun 2018 20:39:34 +0800 Subject: [PATCH 55/68] refine benchmark --- benchmark/.gitignore | 3 +++ benchmark/fluid/fluid_benchmark.py | 31 ++++++++++++++++-------------- benchmark/fluid/run.sh | 20 ++++++++++--------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/benchmark/.gitignore b/benchmark/.gitignore index 7b66e8a5b5..fb4114356d 100644 --- a/benchmark/.gitignore +++ b/benchmark/.gitignore @@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl caffe/image/logs tensorflow/image/logs tensorflow/rnn/logs +fluid/models/*.pyc +fluid/logs +fluid/nohup.out diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index c1d458970a..cac1b0fad0 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -40,10 +40,7 @@ def parse_args(): parser.add_argument( '--batch_size', type=int, default=32, help='The minibatch size.') parser.add_argument( - '--learning_rate', - type=float, - default=0.001, - help='The minibatch size.') + '--learning_rate', type=float, default=0.001, help='The learning rate.') # TODO(wuyi): add "--use_fake_data" option back. 
parser.add_argument( '--skip_batch_num', @@ -72,6 +69,11 @@ def parse_args(): type=int, default=1, help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') parser.add_argument( '--data_set', type=str, @@ -231,10 +233,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % (pass_id, iters, np.mean(train_losses))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' % - (num_samples, train_elapsed, examples_per_sec)) + print_train_time(start_time, time.time(), num_samples) print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))) # evaluation if not args.no_test and batch_acc != None: @@ -315,10 +314,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) - train_elapsed = time.time() - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) + print_train_time(start_time, time.time(), num_samples) if not args.no_test and batch_acc != None: test_acc = test(startup_exe, infer_prog, test_reader, feeder, batch_acc) @@ -329,12 +325,19 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, def print_arguments(args): vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and vars(args)['device'] == 'GPU') - print('----------- resnet Configuration Arguments -----------') + print('----------- Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) print('------------------------------------------------') +def print_train_time(start_time, end_time, num_samples): + train_elapsed = end_time - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + + def main(): args = parse_args() print_arguments(args) @@ -342,7 +345,7 @@ def main(): # the unique trainer id, starting from 0, needed by trainer # only nccl_id_var, num_trainers, trainer_id = ( - None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1"))) + None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0"))) if args.use_cprof: pr = cProfile.Profile() diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh index afaab5f4de..5d9b2db871 100644 --- a/benchmark/fluid/run.sh +++ b/benchmark/fluid/run.sh @@ -2,6 +2,7 @@ # This script benchmarking the PaddlePaddle Fluid on # single thread single GPU. 
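+# Per-model training logs produced below are collected under ./logs/ via tee;
+# the nvidia-smi memory log (mem.log) still lands in the working directory.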
+mkdir -p logs #export FLAGS_fraction_of_gpu_memory_to_use=0.0 export CUDNN_PATH=/paddle/cudnn_v5 @@ -35,6 +36,7 @@ nohup stdbuf -oL nvidia-smi \ --format=csv \ --filename=mem.log \ -l 1 & + # mnist # mnist gpu mnist 128 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ @@ -43,7 +45,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=128 \ --skip_batch_num=5 \ --iterations=500 \ - 2>&1 | tee -a mnist_gpu_128.log + 2>&1 | tee -a logs/mnist_gpu_128.log # vgg16 # gpu cifar10 128 @@ -53,7 +55,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=128 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a vgg16_gpu_128.log + 2>&1 | tee -a logs/vgg16_gpu_128.log # flowers gpu 128 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ @@ -63,28 +65,28 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --data_set=flowers \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a vgg16_gpu_flowers_32.log + 2>&1 | tee -a logs/vgg16_gpu_flowers_32.log # resnet50 # resnet50 gpu cifar10 128 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=resnet50 \ + --model=resnet \ --device=GPU \ --batch_size=128 \ --data_set=cifar10 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a resnet50_gpu_128.log + 2>&1 | tee -a logs/resnet50_gpu_128.log # resnet50 gpu flowers 64 FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=resnet50 \ + --model=resnet \ --device=GPU \ --batch_size=64 \ --data_set=flowers \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a resnet50_gpu_flowers_64.log + 2>&1 | tee -a logs/resnet50_gpu_flowers_64.log # lstm # lstm gpu imdb 32 # tensorflow only support batch=32 @@ -94,7 +96,7 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=32 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a lstm_gpu_32.log + 2>&1 | tee -a logs/lstm_gpu_32.log # seq2seq # seq2seq gpu wmb 128 @@ -104,4 +106,4 @@ FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ --batch_size=128 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 | tee -a lstm_gpu_128.log + 2>&1 | tee -a logs/lstm_gpu_128.log From d58955997e946f1d2c91849a935917e2643f4374 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Mon, 4 Jun 2018 20:50:22 +0800 Subject: [PATCH 56/68] fix term core only --- cmake/external/grpc.cmake | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 9459f1ddfe..4b6840578f 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -33,10 +33,18 @@ ELSE() SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) ENDIF() +# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them ExternalProject_Add( extern_grpc DEPENDS protobuf zlib - URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz" + # NOTE(wuyi): + # this package is generated by following steps: + # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git + # 2. submodule update --init + # 3. keep only zlib, cares, protobuf, boringssl under "third_party", + # checkout and clean other dirs under third_party + # 4. remove .git, and package the directory. 
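+# Roughly, as shell (illustrative commands; not executed by this build):
+#   git clone -b v1.8.x https://github.com/grpc/grpc.git
+#   (cd grpc && git submodule update --init)
+#   # prune grpc/third_party (keep zlib, cares, protobuf, boringssl) and grpc/.git
+#   tar -czf grpc-v1.8.x.tar.gz grpc/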
+ URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -49,7 +57,6 @@ ExternalProject_Add( INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install ) -# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") From 418c41d8fb4a9dd0f221e7c3e3b94d5433493ded Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 4 Jun 2018 22:01:14 +0800 Subject: [PATCH 57/68] Feature/anakin embed (#11135) --- paddle/contrib/inference/CMakeLists.txt | 43 +++++++++- .../inference/demo/simple_on_word2vec.cc | 2 +- .../contrib/inference/paddle_inference_api.h | 27 +++--- .../paddle_inference_api_anakin_engine.cc | 82 +++++++++++++++++++ .../paddle_inference_api_anakin_engine.h | 51 ++++++++++++ ...ddle_inference_api_anakin_engine_tester.cc | 27 ++++++ .../inference/paddle_inference_api_impl.cc | 18 ++-- 7 files changed, 229 insertions(+), 21 deletions(-) create mode 100644 paddle/contrib/inference/paddle_inference_api_anakin_engine.cc create mode 100644 paddle/contrib/inference/paddle_inference_api_anakin_engine.h create mode 100644 paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 8ca3446539..1e3bb7bf16 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -17,6 +17,42 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) +set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files") +set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library") + + +set(inference_deps paddle_inference_api paddle_fluid_api) + +# if anakin is set enable anakin api implementation +if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) + set(ANAKIN_FOUND ON) +else() + set(ANAKIN_FOUND OFF) +endif() + +if (ANAKIN_FOUND) + # Anakin's code style doesn't follow google c style. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment + -Wno-error=reorder + -Wno-error=format + -Wno-error=switch + -Wno-error=return-type + -Wno-error=non-virtual-dtor + -Wno-error=cpp") + + message(STATUS "Anakin for inference is enabled") + message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") + include_directories("${ANAKIN_INCLUDE}") + # Anakin's source path is a mass, need to set sub-directories trivially. 
+ include_directories("${ANAKIN_INCLUDE}/saber") + link_directories("${ANAKIN_LIBRARY}") + + nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc) + target_link_libraries(inference_anakin_api anakin) + list(APPEND inference_deps inference_anakin_api) +endif() + + function(inference_api_test TARGET_NAME) if (WITH_TESTING) set(options "") @@ -27,7 +63,7 @@ function(inference_api_test TARGET_NAME) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) cc_test(${TARGET_NAME} SRCS ${TARGET_NAME}.cc - DEPS paddle_fluid paddle_inference_api + DEPS "${inference_deps}" ARGS --dirname=${PYTHON_TESTS_DIR}/book/) if(inference_test_ARGS) set_tests_properties(${TARGET_NAME} @@ -47,6 +83,11 @@ cc_test(test_paddle_inference_api inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) +if (ANAKIN_FOUND) + nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + DEPS ${inference_deps} protobuf) +endif() + if(WITH_TESTING) add_subdirectory(demo) endif() diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 165d2e196b..ee865f3790 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -54,7 +54,7 @@ void Main(bool use_gpu) { CHECK(predictor->Run(slots, &outputs)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs.size(), 1UL); LOG(INFO) << "output buffer size: " << outputs.front().data.length; const size_t num_elements = outputs.front().data.length / sizeof(float); // The outputs' buffers are in CPU memory. diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 5fe8399762..b5cd0d603f 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ /* * This file contains the definition of a simple Inference API for Paddle. @@ -47,8 +47,8 @@ struct PaddleTensor { enum class PaddleEngineKind { kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. // TODO(Superjomn) support following engines latter. - // kAnakin, // Use Anakin for inference. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. 
   // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
 };
 
@@ -95,6 +95,13 @@ struct NativeConfig : public PaddlePredictor::Config {
   std::string param_file;
 };
 
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+};
+
 // A factory to help create different predictors.
 //
 // FOR EXTENSION DEVELOPER:
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
new file mode 100644
index 0000000000..865d7ac10d
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+
+#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+
+namespace paddle {
+
+PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  CHECK(Init(config));
+}
+
+bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+  // TODO(Superjomn) Tell anakin to support return code.
+  engine_.Build(config.model_file, config.max_batch_size);
+  return true;
+}
+
+bool PaddleInferenceAnakinPredictor::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data) {
+  for (const auto &input : inputs) {
+    if (input.dtype != PaddleDType::FLOAT32) {
+      LOG(ERROR) << "Only support float type inputs. " << input.name
+                 << "'s type is not float";
+      return false;
+    }
+    engine_.SetInputFromCPU(
+        input.name, static_cast<float *>(input.data.data), input.data.length);
+  }
+
+  // TODO(Superjomn) Tell anakin to support return code.
+  engine_.Execute();
+
+  if (output_data->empty()) {
+    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  for (auto &output : *output_data) {
+    auto *tensor = engine_.GetOutputInGPU(output.name);
+    output.shape = tensor->shape();
+    // Copy data from GPU -> CPU
+    if (cudaMemcpy(output.data.data,
+                   tensor->data(),
+                   tensor->size(),
+                   cudaMemcpyDeviceToHost) != 0) {
+      LOG(ERROR) << "copy data from GPU to CPU error";
+      return false;
+    }
+  }
+  return true;
+}
+
+// TODO(Superjomn) To be implemented later.
+std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+  return nullptr;
+}
+
+// A factory to help create different predictors.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
+    const AnakinConfig &config) {
+  std::unique_ptr<PaddlePredictor> x(
+      new PaddleInferenceAnakinPredictor(config));
+  return x;
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
new file mode 100644
index 0000000000..fe9f562e9d
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the implementation of inference API with Anakin engine
+ * embedded; this API can only support Anakin models.
+ */
+
+#pragma once
+
+// NOTE The included Anakin header does not have a namespace.
+// TODO(Superjomn) Tell Anakin to provide better APIs.
+#include "framework/core/net/net.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+class PaddleInferenceAnakinPredictor : public PaddlePredictor {
+ public:
+  PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+
+  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
+  // should be allocated first.
+  // TODO(Superjomn) should unify all the behaviors of output_data across all
+  // the engines.
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+ private:
+  bool Init(const AnakinConfig& config);
+
+  anakin::AnakinEngine<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      engine_;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
new file mode 100644
index 0000000000..43324bc67c
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(inference, anakin) {
+  AnakinConfig config;
+
+  auto engine =
+      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
index e7a8fa68b7..b52a43a463 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include <sys/time.h>
 #include <algorithm>

From 71b6bdb5d4dc090f8746845bf7bba9f62415e9f3 Mon Sep 17 00:00:00 2001
From: Siddharth Goyal
Date: Mon, 4 Jun 2018 13:15:05 -0700
Subject: [PATCH 58/68] Fix signed-unsigned comparison warning (#11167)

---
 paddle/fluid/operators/sgd_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index f9e0596191..2685ce217e 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -114,7 +114,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
         int64_t id_index = param.Index(grad.rows()[i]);
         PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
                           "id should be in the table");
-        for (size_t j = 0; j < grad_row_width; j++) {
+        for (int64_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
         }

From 95ba67a30cfe7395d2ed2f1e8193a88a4569fb22 Mon Sep 17 00:00:00 2001
From: Lei Wang
Date: Mon, 4 Jun 2018 14:28:47 -0700
Subject: [PATCH 59/68] Build: fix go package installation path. (#11166)
---
 paddle/scripts/paddle_build.sh | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8eeea1805d..113d02ce48 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -145,19 +145,17 @@ function check_style() {
     trap 'abort' 0
     set -e
 
-    # install glide
-    curl https://glide.sh/get | bash
-    eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    if [ -x "$(command -v gimme)" ]; then
+        eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    fi
 
     # set up go environment for running gometalinter
     mkdir -p $GOPATH/src/github.com/PaddlePaddle/
     ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
-    cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
-
-    go get github.com/alecthomas/gometalinter
-    gometalinter --install
+    mkdir -p ./build/go
+    cp go/glide.* build/go
+    cd build/go; glide install; cd -
 
-    cd ${PADDLE_ROOT}
     export PATH=/usr/bin:$PATH
     pre-commit install
     clang-format --version

From a4237171a5f647f5c841e61d5d61f2ff8f8e80f7 Mon Sep 17 00:00:00 2001
From: Siddharth Goyal
Date: Mon, 4 Jun 2018 14:29:19 -0700
Subject: [PATCH 60/68] Modify optimizer in new API to support more usecases
 (#11168)

* Modify optimizer in new API to support more use cases
* Modify CMake to include only modified examples
---
 .../fluid/tests/book/high-level-api/CMakeLists.txt | 12 ++++++------
 .../high-level-api/fit_a_line/test_fit_a_line.py   |  8 +++++---
 .../test_label_semantic_roles_newapi.py            | 10 +++++++---
 python/paddle/fluid/trainer.py                     | 13 +++++++------
 4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
index efa5ee2d06..07da382867 100644
--- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
@@ -7,10 +7,10 @@ foreach(src ${TEST_OPS})
 endforeach()
 
 add_subdirectory(fit_a_line)
-add_subdirectory(recognize_digits)
-add_subdirectory(image_classification)
-add_subdirectory(understand_sentiment)
+#add_subdirectory(recognize_digits)
+#add_subdirectory(image_classification)
+#add_subdirectory(understand_sentiment)
 add_subdirectory(label_semantic_roles)
-add_subdirectory(word2vec)
-add_subdirectory(recommender_system)
-add_subdirectory(machine_translation)
+#add_subdirectory(word2vec)
+#add_subdirectory(recommender_system)
+#add_subdirectory(machine_translation)
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index de3906fc6a..b3117cf2e5 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -48,13 +48,15 @@ def linear():
     return avg_loss
 
 
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer=fluid.optimizer.SGD(learning_rate=0.001))
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
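The fit_a_line change above is the template for the rest of this patch: the
Trainer now receives an optimizer_func callable instead of a pre-built
optimizer instance, so the optimizer (and any learning-rate-decay ops it
creates) is constructed inside the trainer's own program context. A minimal
sketch of the pattern, not part of the patch (train_program is assumed to be
defined as in the test):

    import paddle.fluid as fluid

    def optimizer_func():
        # Built lazily: the Trainer calls this within its program scope, so
        # schedules such as fluid.layers.exponential_decay create their ops
        # in the right program.
        return fluid.optimizer.SGD(learning_rate=0.001)

    trainer = fluid.Trainer(
        train_func=train_program,
        place=fluid.CPUPlace(),
        optimizer_func=optimizer_func)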
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
index 8cce398ff3..0ccb3a39e0 100755
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -141,12 +141,16 @@ def train_program():
     return [avg_cost]
 
 
+def optimize_func():
+    return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay(
+        learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True))
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
 
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer=optimizer)
+        train_func=train_program, place=place, optimizer_func=optimize_func)
 
     feed_order = [
         'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
@@ -245,7 +249,7 @@ def infer(use_cuda, inference_program, params_dirname):
         },
         return_numpy=False)
 
-    print("infer results: ", np.array(results[0]))
+    print("infer results: ", np.array(results[0]).shape)
 
 
 def main(use_cuda):
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 7da123dd92..cdacb41986 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -90,13 +90,13 @@ class Trainer(object):
     Args:
         train_func(callable): A function which will return loss. The loss must be
             a scalar.
-        optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
+        optimizer_func(callable): A function that returns an Optimizer object.
         place: The device place of this trainer.
     """
 
     def __init__(self,
                  train_func,
-                 optimizer,
+                 optimizer_func,
                  param_path=None,
                  place=None,
                  parallel=False):
@@ -105,8 +105,6 @@ class Trainer(object):
         # 1. we need to generate a framework.Program by calling
         # program_func. Reference: fluid.program_guard in
         # test_word2vec.py
-        if not isinstance(optimizer, opt_module.Optimizer):
-            raise TypeError("The optimizer should be an instance of Optimizer")
 
         self.scope = core.Scope()
 
@@ -118,11 +116,14 @@ class Trainer(object):
         self.train_func_outputs = program_func_outs if isinstance(
             program_func_outs, list) else [program_func_outs]
         self.test_program = self.train_program.clone()
+
+        # The first element of program_func_outs is loss.
+        loss = self.train_func_outputs[0]
+
+        optimizer = optimizer_func()
         if not isinstance(optimizer, opt_module.Optimizer):
             raise TypeError(
                 "The optimizer should be an instance of Optimizer")
-        # The fisrt element of program_func_outs is loss.
-        loss = self.train_func_outputs[0]
         optimize_ops, params_grads = optimizer.minimize(loss)
 
         self.place = check_and_get_place(place)

From f3d14c3bed9cc783fa6089819ddb54236e6e67f2 Mon Sep 17 00:00:00 2001
From: Lei Wang
Date: Mon, 4 Jun 2018 15:04:21 -0700
Subject: [PATCH 61/68] Doc: fix android cross_compiling documents.
(#11169) --- doc/mobile/cross_compiling_for_android_cn.md | 6 +++--- doc/mobile/cross_compiling_for_android_en.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md index cdd6917239..0607748b75 100644 --- a/doc/mobile/cross_compiling_for_android_cn.md +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数: - 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 ```bash -$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android ``` - 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 ```bash -$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android ``` -执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 +执行上述`docker run`命令时,容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 ## 基于Linux交叉编译环境的编译方式 本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md index 6af16fc114..572063e801 100644 --- a/doc/mobile/cross_compiling_for_android_en.md +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: ```bash -$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android ``` The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: @@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API. 
-The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. +The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`. From c9067bef99acf7b4903de86470437b39e55d4d53 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Mon, 4 Jun 2018 16:37:27 -0700 Subject: [PATCH 62/68] Doc: remove broken symbol links. (#11170) --- doc/fluid/howto/optimization/benchmark/README.md | 1 - doc/fluid/howto/optimization/benchmark/vgg16/README.md | 1 - 2 files changed, 2 deletions(-) delete mode 120000 doc/fluid/howto/optimization/benchmark/README.md delete mode 120000 doc/fluid/howto/optimization/benchmark/vgg16/README.md diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md deleted file mode 120000 index db30af7f53..0000000000 --- a/doc/fluid/howto/optimization/benchmark/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../../benchmark/cluster/README.md \ No newline at end of file diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md deleted file mode 120000 index ca963ef5f0..0000000000 --- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md +++ /dev/null @@ -1 +0,0 @@ -../../../../../../benchmark/cluster/vgg16/README.md \ No newline at end of file From 99a5a3d8d3f1f2ab08b2e04828b812890bb29a73 Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Mon, 4 Jun 2018 17:17:57 -0700 Subject: [PATCH 63/68] Fix optimizer in remaining chapters with high level API (#11172) --- .../fluid/tests/book/high-level-api/CMakeLists.txt | 12 ++++++------ .../test_image_classification_resnet.py | 8 +++++--- .../test_image_classification_vgg.py | 8 +++++--- .../test_machine_translation.py | 14 +++++++++----- .../recognize_digits/test_recognize_digits_conv.py | 7 +++++-- .../recognize_digits/test_recognize_digits_mlp.py | 7 +++++-- .../test_recommender_system_newapi.py | 7 +++++-- .../test_understand_sentiment_conv.py | 7 +++++-- .../test_understand_sentiment_dynamic_rnn.py | 7 +++++-- .../test_understand_sentiment_stacked_lstm.py | 7 +++++-- .../word2vec/test_word2vec_new_api.py | 8 +++++--- 11 files changed, 60 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index 07da382867..efa5ee2d06 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -7,10 +7,10 @@ foreach(src ${TEST_OPS}) endforeach() add_subdirectory(fit_a_line) -#add_subdirectory(recognize_digits) -#add_subdirectory(image_classification) 
-#add_subdirectory(understand_sentiment)
+add_subdirectory(recognize_digits)
+add_subdirectory(image_classification)
+add_subdirectory(understand_sentiment)
 add_subdirectory(label_semantic_roles)
-#add_subdirectory(word2vec)
-#add_subdirectory(recommender_system)
-#add_subdirectory(machine_translation)
+add_subdirectory(word2vec)
+add_subdirectory(recommender_system)
+add_subdirectory(machine_translation)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 63dc1b6ce3..2df3da9cca 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -85,6 +85,10 @@ def train_network():
     return [avg_cost, accuracy]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
 def train(use_cuda, train_program, params_dirname):
     BATCH_SIZE = 128
     EPOCH_NUM = 1
@@ -111,9 +115,7 @@ def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
-        place=place)
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
 
     trainer.train(
         reader=train_reader,
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index 0bf8f265a1..224cca417e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -64,6 +64,10 @@ def train_network():
     return [avg_cost, accuracy]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
 def train(use_cuda, train_program, params_dirname):
     BATCH_SIZE = 128
     train_reader = paddle.batch(
@@ -88,9 +92,7 @@ def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        place=place,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001))
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     trainer.train(
         reader=train_reader,
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
index d4b723d3e6..c4b37df3a0 100644
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -158,6 +158,13 @@ def train_program(is_sparse):
     return avg_cost
 
 
+def optimizer_func():
+    return fluid.optimizer.Adagrad(
+        learning_rate=1e-4,
+        regularization=fluid.regularizer.L2DecayRegularizer(
+            regularization_coeff=0.1))
+
+
 def train(use_cuda, is_sparse, is_local=True):
     EPOCH_NUM = 1
 
@@ -182,11 +189,8 @@ def train(use_cuda, is_sparse, is_local=True):
 
     trainer = fluid.Trainer(
         train_func=partial(train_program, is_sparse),
-        optimizer=fluid.optimizer.Adagrad(
-            learning_rate=1e-4,
-            regularization=fluid.regularizer.L2DecayRegularizer(
-                regularization_coeff=0.1)),
-        place=place)
+        place=place,
+        optimizer_func=optimizer_func)
 
     trainer.train(
         reader=train_reader,
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 03439cbd37..9a09db25dc 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -57,14 +57,17 @@ def train_program():
     return [avg_cost, acc]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
         train_func=train_program,
         place=place,
-        optimizer=optimizer,
+        optimizer_func=optimizer_func,
         parallel=True)
 
     def event_handler(event):
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index 89bbd21bea..b2b544e791 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -44,12 +44,15 @@ def train_program():
     return [avg_cost, acc]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer=optimizer)
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
index dfc7325acf..090c11ce1e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -155,12 +155,15 @@ def train_program():
     return [avg_cost, scale_infer]
 
 
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.2)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer=optimizer)
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
     feed_order = [
         'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
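machine_translation above and the understand_sentiment variants below bind
extra arguments (is_sparse, word_dict) into train_func with functools.partial,
since the Trainer expects a program builder that takes no data arguments. A
minimal sketch of that binding, not part of the patch (the toy network stands
in for the real models):

    from functools import partial

    import paddle
    import paddle.fluid as fluid

    def train_program(word_dict):
        # Toy stand-in for the real network, just to show the bound argument.
        words = fluid.layers.data(name='words', shape=[1], dtype='int64')
        emb = fluid.layers.embedding(input=words, size=[len(word_dict), 8])
        avg_cost = fluid.layers.mean(fluid.layers.reduce_sum(emb))
        return [avg_cost]

    def optimizer_func():
        return fluid.optimizer.Adagrad(learning_rate=0.002)

    word_dict = paddle.dataset.imdb.word_dict()
    trainer = fluid.Trainer(
        train_func=partial(train_program, word_dict),  # word_dict bound here
        place=fluid.CPUPlace(),
        optimizer_func=optimizer_func)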
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
index 11e9fd1bec..9b61f7a00c 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -64,15 +64,18 @@ def train_program(word_dict):
     return [avg_cost, accuracy]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
     word_dict = paddle.dataset.imdb.word_dict()
 
     trainer = fluid.Trainer(
         train_func=partial(train_program, word_dict),
         place=place,
-        optimizer=optimizer)
+        optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
index 90757d54f9..aa7c567b4d 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -79,15 +79,18 @@ def train_program(word_dict):
     return [avg_cost, accuracy]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
     word_dict = paddle.dataset.imdb.word_dict()
 
     trainer = fluid.Trainer(
         train_func=partial(train_program, word_dict),
         place=place,
-        optimizer=optimizer)
+        optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 52b7d4a837..113dda88ca 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -71,15 +71,18 @@ def train_program(word_dict):
     return [avg_cost, accuracy]
 
 
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
 def train(use_cuda, train_program, params_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
     word_dict = paddle.dataset.imdb.word_dict()
 
     trainer = fluid.Trainer(
         train_func=partial(train_program, word_dict),
         place=place,
-        optimizer=optimizer)
+        optimizer_func=optimizer_func)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
index eeb8e67087..ba44f72d9b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
@@ -80,6 +80,10 @@ def train_program(is_sparse):
     return avg_cost
 
 
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
 def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
@@ -104,9 +108,7 @@ def train(use_cuda, train_program, params_dirname):
             sys.exit("got NaN loss, training failed.")
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        optimizer=fluid.optimizer.SGD(learning_rate=0.001),
-        place=place)
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
 
     trainer.train(
         reader=train_reader,

From 5713a0bf5eef095ee2e1d6006add1574fca8de63 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Mon, 4 Jun 2018 17:46:12 -0700
Subject: [PATCH 64/68] fix lodtensor api in memory opt machine trans (#11171)

---
 .../test_memopt_machine_translation.py             | 34 ++++++-------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index a1ca6d981f..fa696acdfa 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -80,21 +80,6 @@ def encoder_decoder():
     return rnn()
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def main():
     rnn_out = encoder_decoder()
     label = layers.data(
@@ -122,18 +107,21 @@ def main():
 
     exe.run(framework.default_startup_program())
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    feed_list = [
+        fluid.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
     batch_id = 0
     for pass_id in xrange(10):
         for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
             outs = exe.run(fluid.default_main_program(),
-                           feed={
-                               'src_word_id': word_data,
-                               'target_language_word': trg_word,
-                               'target_language_next_word': trg_word_next
-                           },
+                           feed=feeder.feed(data),
                            fetch_list=[avg_cost])
             avg_cost_val = np.array(outs[0])
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
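PATCH 64 above replaces the hand-rolled to_lodtensor() helper with
fluid.DataFeeder, which assembles each mini-batch into LoDTensors (data plus
LoD boundaries) automatically. A minimal self-contained sketch of the
replacement pattern, not part of the patch (the variable names follow the
diff; the batch values are made up):

    import paddle.fluid as fluid

    # Two toy sequence inputs; lod_level=1 marks them as variable-length.
    src = fluid.layers.data(
        name='src_word_id', shape=[1], dtype='int64', lod_level=1)
    trg = fluid.layers.data(
        name='target_language_word', shape=[1], dtype='int64', lod_level=1)

    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[src, trg], place=place)

    # One batch of two samples; DataFeeder builds the LoDTensors that
    # to_lodtensor used to assemble by hand.
    batch = [([1, 2, 3], [4, 5]), ([6], [7, 8, 9])]
    feed_dict = feeder.feed(batch)  # {'src_word_id': LoDTensor, ...}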
From c8d6c1d09c46457bbb68665108b3468de5a064fa Mon Sep 17 00:00:00 2001
From: whs
Date: Tue, 5 Jun 2018 10:09:22 +0800
Subject: [PATCH 65/68] Auto generate python api for polygon_box_transform op.
 (#11074)

* Auto generate python api for polygon_box_transform op.

* Add unittest.
---
 python/paddle/fluid/layers/ops.py                  | 1 +
 python/paddle/fluid/tests/unittests/test_layers.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 60f8cbbfa7..69cfde852d 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -71,6 +71,7 @@ __all__ = [
     'cumsum',
     'scatter',
     'sum',
+    'polygon_box_transform',
     'shape',
 ] + __activations__
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index ca08fd7fc8..621a450fa6 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -379,6 +379,14 @@ class TestBook(unittest.TestCase):
         self.assertIsNotNone(output)
         print(str(program))
 
+    def test_polygon_box_transform(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8, 4, 4], dtype="float32")
+            output = layers.polygon_box_transform(input=x)
+        self.assertIsNotNone(output)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
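With PATCH 65, polygon_box_transform is exposed like any other auto-generated
layer. A minimal usage sketch, not part of the patch, mirroring the new unit
test (the 8-channel 4x4 input layout is taken from that test; the random data
is illustrative):

    import numpy as np
    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[8, 4, 4], dtype='float32')
    out = fluid.layers.polygon_box_transform(input=x)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    result, = exe.run(
        fluid.default_main_program(),
        feed={'x': np.random.random((1, 8, 4, 4)).astype('float32')},
        fetch_list=[out])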
From 9d1dae3987abcef8a77f07e9e86e96de27b1cd46 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Tue, 5 Jun 2018 10:20:40 +0800
Subject: [PATCH 66/68] remove argument cpus, and will add in next PR

---
 benchmark/fluid/fluid_benchmark.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index cac1b0fad0..9d33a841cd 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -69,11 +69,6 @@ def parse_args():
         type=int,
         default=1,
         help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
-    parser.add_argument(
-        '--cpus',
-        type=int,
-        default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
     parser.add_argument(
         '--data_set',
         type=str,

From 02cc80b30d5b5d6152171427ed8be8c8f91616e6 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Tue, 5 Jun 2018 10:33:20 +0800
Subject: [PATCH 67/68] Fix dangling pointer bug

---
 paddle/fluid/framework/executor.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 863053c32b..3d68c5fb87 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -220,8 +220,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
       has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
 
   ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
   if (!has_feed_ops || !has_fetch_ops) {
-    copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
+    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
+    copy_program = unique_ptr_of_copy_program.get();
  }
   auto* global_block = copy_program->MutableBlock(0);
 
From 238124909e978b3e581c214859cf7aea5df57b31 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 5 Jun 2018 13:01:21 +0800
Subject: [PATCH 68/68] fix protobuf memory leak (#11177)

fix protobuf memory leak
---
 paddle/fluid/framework/block_desc.cc | 28 ++++------------------------
 paddle/fluid/framework/block_desc.h  |  9 ---------
 2 files changed, 4 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index e7842e9b81..f537e4b9e5 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -169,17 +169,13 @@ void BlockDesc::Flush() {
   }
 
   if (need_update_) {
-    auto &op_field = *this->desc_->mutable_ops();
-    this->ClearPBOps();
-    op_field.Reserve(static_cast<int>(ops_.size()));
+    this->desc_->mutable_ops()->Clear();
     for (auto &op_desc : ops_) {
-      op_field.AddAllocated(op_desc->Proto());
+      this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
     }
-    auto &var_field = *this->desc_->mutable_vars();
-    this->ClearPBVars();
-    var_field.Reserve(static_cast<int>(vars_.size()));
+    this->desc_->mutable_vars()->Clear();
     for (auto &var_desc : vars_) {
-      var_field.AddAllocated(var_desc.second->Proto());
+      this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
     }
     need_update_ = false;
   }
@@ -217,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
   }
 }
 
-void BlockDesc::ClearPBOps() {
-  auto ops = this->desc_->mutable_ops();
-  while (!ops->empty()) {
-    // we do not own the OpDesc, so release the ownership.
-    ops->ReleaseLast();
-  }
-}
-
-void BlockDesc::ClearPBVars() {
-  auto vars = this->desc_->mutable_vars();
-  while (!vars->empty()) {
-    // we do not own the VarDesc, so release the ownership.
-    vars->ReleaseLast();
-  }
-}
-
 void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
   PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
                  "Parent block ID has been set to %d. Cannot set to %d",
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 189dd6c52f..ce48548418 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -41,11 +41,6 @@ class BlockDesc {
 
   BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
 
-  ~BlockDesc() {
-    this->ClearPBVars();
-    this->ClearPBOps();
-  }
-
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
@@ -113,10 +108,6 @@ class BlockDesc {
 
   ProgramDesc *Program() const { return this->prog_; }
 
- private:
-  void ClearPBOps();
-  void ClearPBVars();
-
  private:
   ProgramDesc *prog_;       // not_own
   proto::BlockDesc *desc_;  // not_own