From 84246284b5ab6f2046c916ed356b2071e73eccc2 Mon Sep 17 00:00:00 2001 From: yuyang18 <reyoung@126.com> Date: Mon, 21 May 2018 16:46:53 +0800 Subject: [PATCH 01/24] Fix dev image build on nodes of yq It seems that the `doxygen` package will remove system includes... It should be a bug of ubuntu or docker. Since we are not using doxygen now, just remove this package. --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ea39efd00b..8c742c3fee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update && \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ - automake locales clang-format swig doxygen cmake \ + automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ net-tools libtool ccache && \ From 46f13237dc32c113ec0788fbe4b569ca0b5353b0 Mon Sep 17 00:00:00 2001 From: sneaxiy <sneaxiy@126.com> Date: Wed, 30 May 2018 14:29:16 +0800 Subject: [PATCH 02/24] Fix bugs in framework/tensor_impl.h and polish framework/reader.cc --- paddle/fluid/framework/reader.cc | 4 +++- paddle/fluid/framework/tensor_impl.h | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 76126f3dc6..0b36f1116d 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -25,8 +25,10 @@ void FileReader::ReadNext(std::vector<LoDTensor> *out) { if (out->empty()) { return; } + + PADDLE_ENFORCE_EQ(out->size(), dims_.size()); for (size_t i = 0; i < dims_.size(); ++i) { - auto &actual = out->at(i).dims(); + auto &actual = (*out)[i].dims(); auto &expect = dims_[i]; PADDLE_ENFORCE_EQ(actual.size(), expect.size()); diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 0a1db7758b..2f19ec0f0a 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -39,7 +39,7 @@ template <typename T> inline const T* Tensor::data() const { check_memory_size(); PADDLE_ENFORCE(std::is_same<T, void>::value || - holder_->type().hash_code() == typeid(T).hash_code(), + holder_->type() == std::type_index(typeid(T)), "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); @@ -53,7 +53,7 @@ template <typename T> inline T* Tensor::data() { check_memory_size(); PADDLE_ENFORCE(std::is_same<T, void>::value || - holder_->type().hash_code() == typeid(T).hash_code(), + holder_->type() == std::type_index(typeid(T)), "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + From 164692da9a75744db770836111ae4c63ac6ed7c3 Mon Sep 17 00:00:00 2001 From: chengduoZH <zhaochengduo@163.com> Date: Thu, 31 May 2018 11:00:40 +0800 Subject: [PATCH 03/24] drop the last batch, if the size of last batch is not equal to batch_size --- python/paddle/batch.py | 6 ++++-- python/paddle/v2/minibatch.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/batch.py b/python/paddle/batch.py index 317cf037c6..d48c54fcbb 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size): +def batch(reader, batch_size, drop_last=False): """ Create a batched reader. 
@@ -23,6 +23,8 @@ def batch(reader, batch_size): :type reader: callable :param batch_size: size of each mini-batch :type batch_size: int + :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size. + :type drop_last: bool :return: the batched reader. :rtype: callable """ @@ -35,7 +37,7 @@ def batch(reader, batch_size): if len(b) == batch_size: yield b b = [] - if b: + if drop_last == False and len(b) != 0: yield b return batch_reader diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py index 317cf037c6..d48c54fcbb 100644 --- a/python/paddle/v2/minibatch.py +++ b/python/paddle/v2/minibatch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size): +def batch(reader, batch_size, drop_last=False): """ Create a batched reader. @@ -23,6 +23,8 @@ def batch(reader, batch_size): :type reader: callable :param batch_size: size of each mini-batch :type batch_size: int + :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size. + :type drop_last: bool :return: the batched reader. :rtype: callable """ @@ -35,7 +37,7 @@ def batch(reader, batch_size): if len(b) == batch_size: yield b b = [] - if b: + if drop_last == False and len(b) != 0: yield b return batch_reader From 75ea577fd31a7ead1cbec6aa6b21338040ea585d Mon Sep 17 00:00:00 2001 From: Xin Pan <panxin.grad@gmail.com> Date: Thu, 31 May 2018 16:01:16 +0800 Subject: [PATCH 04/24] allow profiler and timeline to work when dev_ctx is nullptr. Sometimes dev_ctx is not available when RecordEvent. --- paddle/fluid/platform/profiler.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 3d8d64e4c2..01de9d7041 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -127,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event& e) const { #ifdef PADDLE_WITH_CUDA + if (!has_cuda_) return 0.0; PADDLE_ENFORCE(e.has_cuda() && has_cuda()); PADDLE_ENFORCE(e.device() == device()); PADDLE_ENFORCE(cudaEventSynchronize(event_)); From e330cd032e9a92e2a5851506c35c2ef31e02e01a Mon Sep 17 00:00:00 2001 From: chengduoZH <zhaochengduo@163.com> Date: Thu, 31 May 2018 16:34:08 +0800 Subject: [PATCH 05/24] balance parameter update --- .../details/multi_devices_graph_builder.cc | 41 +++++++++++++------ .../details/multi_devices_graph_builder.h | 2 +- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index d8e711994c..17baacd13e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -11,11 +11,15 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include <algorithm> #include <fstream> +#include <string> #include <utility> +#include <vector> + #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" @@ -26,9 +30,6 @@ #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #endif -#include <string> -#include <vector> - DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot", "the ssa graph path only print with GLOG_v=10," "default /tmp/graph.dot"); @@ -148,9 +149,9 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { - std::unordered_map<std::string, proto::VarType::Type> var_types; + std::unordered_map<std::string, VarDesc *> all_vars; for (auto *var : program.Block(0).AllVars()) { - var_types[var->Name()] = var->GetType(); + all_vars[var->Name()] = var; } auto graph = new SSAGraph(); @@ -167,12 +168,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( auto send_vars = FindDistTrainSendVars(program); auto recv_vars = FindDistTrainRecvVars(program); - size_t cur_device_id = 0; std::vector<std::unordered_set<std::string>> var_name_on_devices; std::vector<std::unordered_set<std::string>> bcast_var_name_set; var_name_on_devices.resize(places_.size()); bcast_var_name_set.resize(places_.size()); + size_t cur_device_id = 0; + std::vector<int64_t> balance_grads(places_.size(), 0); + + auto get_appropriate_dev = [&](std::string &g_name) -> size_t { + auto var_desc = all_vars.at(g_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GE(numel, 0); + auto smallest = + std::min_element(std::begin(balance_grads), std::end(balance_grads)); + size_t dev_id = + static_cast<size_t>(std::distance(std::begin(balance_grads), smallest)); + balance_grads[dev_id] += numel; + return dev_id; + }; + bool is_forwarding = true; for (auto *op : program.Block(0).AllOps()) { if (boost::get<int>( @@ -220,13 +237,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = get_appropriate_dev(g_name); CreateReduceOp(&result, g_name, cur_device_id); var_name_on_devices[cur_device_id].emplace(g_name); bcast_var_name_set[cur_device_id].emplace(p_name); - cur_device_id = (cur_device_id + 1) % places_.size(); break; case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(var_types, g_name)) { + if (IsSparseGradient(all_vars, g_name)) { CreateReduceOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0); } else { @@ -269,10 +286,10 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( } bool MultiDevSSAGraphBuilder::IsSparseGradient( - const std::unordered_map<std::string, proto::VarType::Type> &var_types, + const std::unordered_map<std::string, VarDesc *> &all_vars, const std::string &og) const { - PADDLE_ENFORCE(var_types.count(og) != 0); - if (var_types.at(og) == proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE(all_vars.count(og) != 0); + if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { return true; } return false; 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index e07597dbd8..544cbe585c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -106,7 +106,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { size_t src_dev_id) const; bool IsSparseGradient( - const std::unordered_map<std::string, proto::VarType::Type> &var_types, + const std::unordered_map<std::string, VarDesc *> &all_vars, const std::string &og) const; private: From 2a3c58d3fecab43a753bd8c47e327ceae9f0f467 Mon Sep 17 00:00:00 2001 From: fengjiayi <fengjiayi@baidu.com> Date: Thu, 31 May 2018 16:56:43 +0800 Subject: [PATCH 06/24] refine programdesc copy --- paddle/fluid/framework/block_desc.cc | 2 +- paddle/fluid/framework/block_desc.h | 2 +- paddle/fluid/framework/program_desc.cc | 15 +++++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index fd409ed4c0..b15aba9106 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -209,7 +209,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDesc(*op->Proto(), prog, this)); + ops_.emplace_back(new OpDesc(*op, this)); } for (auto &it : other.vars_) { auto *var = new VarDesc(*it.second); diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 600601669c..189dd6c52f 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -105,7 +105,7 @@ class BlockDesc { size_t OpSize() const { return ops_.size(); } - OpDesc *Op(int idx) { return ops_.at(idx).get(); } + OpDesc *Op(int idx) const { return ops_.at(idx).get(); } void Flush(); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 64fb028f83..aa01f9928c 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -51,12 +51,15 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { auto *block = desc_.mutable_blocks(i); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); } - for (auto &block : blocks_) { - for (auto *op : block->AllOps()) { - for (const auto &attr : op->Proto()->attrs()) { - if (attr.type() == proto::AttrType::BLOCK) { - size_t blk_idx = attr.block_idx(); - op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); + for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { + auto all_ops = blocks_[block_id]->AllOps(); + for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) { + auto &op = all_ops[op_id]; + for (const std::string &attr_name : op->AttrNames()) { + if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { + int sub_block_id = + o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name); + op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); } } } From 97b7502772ca2758428b4c221eac4091f495525b Mon Sep 17 00:00:00 2001 From: Yan Chunwei <yanchunwei@outlook.com> Date: Thu, 31 May 2018 17:39:39 +0800 Subject: [PATCH 07/24] inference API little fix (#11069) --- paddle/contrib/inference/CMakeLists.txt | 8 +- .../contrib/inference/paddle_inference_api.h | 44 +++++---- .../inference/paddle_inference_api_impl.cc | 94 ++++++------------- .../inference/paddle_inference_api_impl.h | 18 +--- 
.../test_paddle_inference_api_impl.cc | 13 +-- paddle/fluid/inference/CMakeLists.txt | 11 ++- 6 files changed, 75 insertions(+), 113 deletions(-) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 9c55f189bc..3beb93c4e7 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -36,7 +36,7 @@ function(inference_api_test TARGET_NAME TEST_SRC) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(${TARGET_NAME} SRCS ${TEST_SRC} - DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl + DEPS paddle_fluid_api paddle_inference_api ARGS --dirname=${PYTHON_TESTS_DIR}/book/) # TODO(panyx0178): Figure out how to add word2vec and image_classification # as deps. @@ -47,13 +47,9 @@ endfunction(inference_api_test) cc_library(paddle_inference_api - SRCS paddle_inference_api.cc + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -cc_library(paddle_inference_api_impl - SRCS paddle_inference_api_impl.cc - DEPS paddle_inference_api paddle_fluid_api) - cc_test(test_paddle_inference_api SRCS test_paddle_inference_api.cc DEPS paddle_inference_api) diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index f804d9b286..b4c7f9bef4 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -45,10 +45,10 @@ struct PaddleTensor { }; /* -* A simple Inference API for Paddle. Currently this API might just be used by -* non-sequence scenerios. -* TODO(Superjomn) Prepare another API for NLP-related usages. -*/ + * A simple Inference API for Paddle. Currently this API can be used by + * non-sequence scenerios. + * TODO(Superjomn) Support another API for NLP-related usages. + */ class PaddlePredictor { public: struct Config; @@ -66,34 +66,38 @@ class PaddlePredictor { // be thread-safe. virtual std::unique_ptr<PaddlePredictor> Clone() = 0; - virtual bool InitShared() { return false; } // Destroy the Predictor. virtual ~PaddlePredictor() {} - friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor( - const PaddlePredictor::Config& config); + enum class EngineKind { + kNative = -1, // Use the native Fluid facility. + // TODO(Superjomn) support latter. + // kAnakin, // Use Anakin for inference. + // kTensorRT, // Use TensorRT for inference. + // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. + // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. + }; // The common configs for all the predictors. struct Config { - enum class EngineKind; - std::string model_dir; // path to the model directory. bool enable_engine{false}; // Enable to execute (part of) the model on - // third-party engines. - EngineKind engine_kind{Config::EngineKind::kNone}; - - enum class EngineKind { - kNone = -1, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. - kTensorRT, // Use TensorRT for inference. - kAutoMixedAnakin, // Automatically mix Fluid with Anakin. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - }; }; }; +struct NativeConfig : public PaddlePredictor::Config { + bool use_gpu{false}; + int device; + float fraction_of_gpu_memory; + std::string prog_file; + std::string param_file; + bool share_variables; +}; + // A factory to help create difference predictor. 
-template <typename ConfigT> +template < + typename ConfigT, + PaddlePredictor::EngineKind engine = PaddlePredictor::EngineKind::kNative> std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index ebe4c32918..989252f69e 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -54,7 +54,7 @@ std::string num2str(T a) { } } // namespace -bool PaddlePredictorImpl::Init() { +bool NativePaddlePredictor::Init() { VLOG(3) << "Predictor::init()"; // TODO(panyx0718): Should CPU vs GPU device be decided by id? @@ -96,8 +96,8 @@ bool PaddlePredictorImpl::Init() { return true; } -bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs, - std::vector<PaddleTensor> *output_data) { +bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, + std::vector<PaddleTensor> *output_data) { VLOG(3) << "Predictor::predict"; Timer timer; timer.tic(); @@ -133,59 +133,20 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs, return true; } -std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() { +std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() { VLOG(3) << "Predictor::clone"; - std::unique_ptr<PaddlePredictor> cls(new PaddlePredictorImpl(config_)); - if (!cls->InitShared()) { - LOG(ERROR) << "fail to call InitShared"; + std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_)); + + if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init()) { + LOG(ERROR) << "fail to call Init"; return nullptr; } // fix manylinux compile error. return std::move(cls); } -// TODO(panyx0718): Consider merge with Init()? -bool PaddlePredictorImpl::InitShared() { - VLOG(3) << "Predictor::init_shared"; - // 1. Define place, executor, scope - if (this->config_.device >= 0) { - place_ = platform::CUDAPlace(); - } else { - place_ = platform::CPUPlace(); - } - this->executor_.reset(new framework::Executor(this->place_)); - this->scope_.reset(new framework::Scope()); - // Initialize the inference program - if (!this->config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - this->inference_program_ = inference::Load( - this->executor_.get(), this->scope_.get(), this->config_.model_dir); - } else if (!this->config_.prog_file.empty() && - !this->config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - this->inference_program_ = inference::Load(this->executor_.get(), - this->scope_.get(), - this->config_.prog_file, - this->config_.param_file); - } - this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0); - // 3. create variables - // TODO(panyx0718): why test share_variables. - if (config_.share_variables) { - this->executor_->CreateVariables( - *this->inference_program_, this->scope_.get(), 0); - } - // 4. 
Get the feed_target_names and fetch_target_names - this->feed_target_names_ = this->inference_program_->GetFeedTargetNames(); - this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames(); - return true; -} - -bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs, - std::vector<framework::LoDTensor> *feeds) { +bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, + std::vector<framework::LoDTensor> *feeds) { VLOG(3) << "Predictor::set_feed"; if (inputs.size() != feed_target_names_.size()) { LOG(ERROR) << "wrong feed input size."; @@ -213,7 +174,7 @@ bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs, return true; } -bool PaddlePredictorImpl::GetFetch( +bool NativePaddlePredictor::GetFetch( const std::vector<framework::LoDTensor> &fetchs, std::vector<PaddleTensor> *outputs) { VLOG(3) << "Predictor::get_fetch"; @@ -280,23 +241,26 @@ bool PaddlePredictorImpl::GetFetch( } template <> -std::unique_ptr<PaddlePredictor> CreatePaddlePredictor( - const ConfigImpl &config) { - VLOG(3) << "create PaddlePredictorImpl"; - // 1. GPU memeroy - std::vector<std::string> flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { - flags.push_back("dummpy"); - std::string flag = "--fraction_of_gpu_memory_to_use=" + - num2str<float>(config.fraction_of_gpu_memory); - flags.push_back(flag); - VLOG(3) << "set flag: " << flag; - framework::InitGflags(flags); +std::unique_ptr<PaddlePredictor> +CreatePaddlePredictor<NativeConfig, PaddlePredictor::EngineKind::kNative>( + const NativeConfig &config) { + VLOG(3) << "create NativePaddlePredictor"; + if (config.use_gpu) { + // 1. GPU memeroy + std::vector<std::string> flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + num2str<float>(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } } - std::unique_ptr<PaddlePredictor> predictor(new PaddlePredictorImpl(config)); - if (!dynamic_cast<PaddlePredictorImpl *>(predictor.get())->Init()) { + std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config)); + if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init()) { return nullptr; } return std::move(predictor); diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h index c545461680..84707e223d 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/contrib/inference/paddle_inference_api_impl.h @@ -29,17 +29,10 @@ namespace paddle { -struct ConfigImpl : public PaddlePredictor::Config { - int device; - float fraction_of_gpu_memory; - std::string prog_file; - std::string param_file; - bool share_variables; -}; - -class PaddlePredictorImpl : public PaddlePredictor { +class NativePaddlePredictor : public PaddlePredictor { public: - explicit PaddlePredictorImpl(const ConfigImpl &config) : config_(config) {} + explicit NativePaddlePredictor(const NativeConfig &config) + : config_(config) {} bool Init(); @@ -48,16 +41,15 @@ class PaddlePredictorImpl : public PaddlePredictor { std::unique_ptr<PaddlePredictor> Clone() override; - ~PaddlePredictorImpl() override{}; + ~NativePaddlePredictor() override{}; private: - bool InitShared() override; bool SetFeed(const std::vector<PaddleTensor> &input_datas, std::vector<framework::LoDTensor> *feeds); bool 
GetFetch(const std::vector<framework::LoDTensor> &fetchs, std::vector<PaddleTensor> *output_data); - ConfigImpl config_; + NativeConfig config_; platform::Place place_; std::unique_ptr<framework::Executor> executor_; std::unique_ptr<framework::Scope> scope_; diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index caba7931cb..5240fc2f20 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -40,19 +40,20 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { return pt; } -ConfigImpl GetConfig() { - ConfigImpl config; +NativeConfig GetConfig() { + NativeConfig config; config.model_dir = FLAGS_dirname + "word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; + config.use_gpu = true; config.device = 0; config.share_variables = true; return config; } TEST(paddle_inference_api_impl, word2vec) { - ConfigImpl config = GetConfig(); - std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config); + NativeConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor<NativeConfig>(config); framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -104,7 +105,7 @@ TEST(paddle_inference_api_impl, image_classification) { int batch_size = 2; bool use_mkldnn = false; bool repeat = false; - ConfigImpl config = GetConfig(); + NativeConfig config = GetConfig(); config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -133,7 +134,7 @@ TEST(paddle_inference_api_impl, image_classification) { is_combined, use_mkldnn); - std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config); + auto predictor = CreatePaddlePredictor(config); std::vector<PaddleTensor> paddle_tensor_feeds; paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input)); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index cc4a725dfb..ec16a1c600 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -5,14 +5,19 @@ cc_library(paddle_fluid_api SRCS io.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -# Create static library get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) -cc_library(paddle_fluid DEPS ${fluid_modules}) +if(WITH_CONTRIB) + set(fluid_modules "${fluid_modules}" paddle_inference_api) +endif() + +# Create static library +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api) # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc - DEPS ${fluid_modules}) + DEPS ${fluid_modules} paddle_fluid_api) + set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
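To make the API rename in this patch concrete, the sketch below mirrors the updated `test_paddle_inference_api_impl.cc`: callers now fill in a `NativeConfig` (including the new `use_gpu` flag) and create the predictor through the templated `CreatePaddlePredictor<NativeConfig>` factory, whose engine kind defaults to `EngineKind::kNative`. The model path is a placeholder and the surrounding `main` is illustrative only.

```cpp
// Usage sketch of the reworked inference API after this patch.
// The model directory below is a placeholder path.
#include <memory>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"

int main() {
  paddle::NativeConfig config;
  config.model_dir = "/path/to/word2vec.inference.model";  // placeholder
  config.use_gpu = true;
  config.fraction_of_gpu_memory = 0.15;
  config.device = 0;
  config.share_variables = true;

  // EngineKind::kNative is the default template argument, so this is
  // equivalent to CreatePaddlePredictor<NativeConfig, EngineKind::kNative>.
  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  std::vector<paddle::PaddleTensor> inputs, outputs;
  // ... fill `inputs` as in the word2vec test, then:
  // predictor->Run(inputs, &outputs);
  return 0;
}
```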
From 2f5bc5e02d117ddc501ba4398aac9fd36ca7b336 Mon Sep 17 00:00:00 2001 From: qiaolongfei <qiaolongfei@baidu.com> Date: Thu, 31 May 2018 17:53:52 +0800 Subject: [PATCH 08/24] fix transpiler package --- python/setup.py.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index c42601d335..8257f1d5e2 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -69,7 +69,8 @@ packages=['paddle', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.transpiler'] + 'paddle.fluid.transpiler', + 'paddle.fluid.transpiler.details'] if '${WITH_FLUID_ONLY}'== 'OFF': packages+=['paddle.proto', From 2007f630b47d52738b3896ec7d6af90c50b129d2 Mon Sep 17 00:00:00 2001 From: Luo Tao <luotao02@baidu.com> Date: Thu, 31 May 2018 19:23:24 +0800 Subject: [PATCH 09/24] add build and install document of fluid inference library --- doc/fluid/howto/index_cn.rst | 2 +- doc/fluid/howto/index_en.rst | 1 - .../inference/build_and_install_lib_cn.rst | 96 +++++++++++++++++++ doc/fluid/howto/inference/index_cn.rst | 8 ++ ...id.md => inference_support_in_fluid_cn.md} | 59 +----------- 5 files changed, 106 insertions(+), 60 deletions(-) create mode 100644 doc/fluid/howto/inference/build_and_install_lib_cn.rst create mode 100644 doc/fluid/howto/inference/index_cn.rst rename doc/fluid/howto/inference/{inference_support_in_fluid.md => inference_support_in_fluid_cn.md} (90%) diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst index b7c6201797..b57af64f44 100644 --- a/doc/fluid/howto/index_cn.rst +++ b/doc/fluid/howto/index_cn.rst @@ -4,5 +4,5 @@ .. toctree:: :maxdepth: 1 + inference/index_cn.rst optimization/index_cn.rst - inference/inference_support_in_fluid.md diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst index f3ca41cdbf..fd21e167ce 100644 --- a/doc/fluid/howto/index_en.rst +++ b/doc/fluid/howto/index_en.rst @@ -5,4 +5,3 @@ HOW TO :maxdepth: 1 optimization/index_en.rst - inference/inference_support_in_fluid.md diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst new file mode 100644 index 0000000000..c8d9992fcc --- /dev/null +++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst @@ -0,0 +1,96 @@ +安装与编译C++预测库 +=========================== + +直接下载安装 +------------- + +====================== ======================================== +版本说明 C++预测库 +====================== ======================================== +cpu_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ +cpu_avx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_ +cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_ +cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ +cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ +cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_ +====================== ======================================== + +从源码编译 +---------- +用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项: + 
+================= ========= +选项 值 +================= ========= +CMAKE_BUILD_TYPE Release +FLUID_INSTALL_DIR 安装路径 +WITH_FLUID_ONLY ON(推荐) +WITH_SWIG_PY OFF(推荐 +WITH_PYTHON OFF(推荐) +WITH_GPU ON/OFF +WITH_MKL ON/OFF +================= ========= + +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + + .. code-block:: bash + + pip install paddlepaddle-gpu + PADDLE_ROOT=/path/of/capi + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + mkdir build + cd build + cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_FLUID_ONLY=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. + make + make inference_lib_dist + +成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息) +均会存放于PADDLE_ROOT目录中。目录结构如下: + + .. code-block:: text + + PaddleRoot/ + ├── CMakeCache.txt + ├── paddle + │ └── fluid + │ ├── framework + │ ├── inference + │ ├── memory + │ ├── platform + │ ├── pybind + │ └── string + ├── third_party + │ ├── boost + │ │ └── boost + │ ├── eigen3 + │ │ ├── Eigen + │ │ └── unsupported + │ └── install + │ ├── gflags + │ ├── glog + │ ├── mklml + │ ├── protobuf + │ ├── snappy + │ ├── snappystream + │ └── zlib + └── version.txt + +version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如: + + .. code-block:: text + + GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8 + WITH_MKL: ON + WITH_GPU: ON + CUDA version: 8.0 + CUDNN version: v5 diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst new file mode 100644 index 0000000000..a903423548 --- /dev/null +++ b/doc/fluid/howto/inference/index_cn.rst @@ -0,0 +1,8 @@ +预测库 +------------ + +.. toctree:: + :maxdepth: 1 + + build_and_install_lib_cn.rst + inference_support_in_fluid_cn.md diff --git a/doc/fluid/howto/inference/inference_support_in_fluid.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md similarity index 90% rename from doc/fluid/howto/inference/inference_support_in_fluid.md rename to doc/fluid/howto/inference/inference_support_in_fluid_cn.md index d272cd3e3b..309b17fccd 100644 --- a/doc/fluid/howto/inference/inference_support_in_fluid.md +++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md @@ -1,9 +1,8 @@ -# Fluid Inference使用指南 +# 使用指南 ## 目录: - Python Inference API -- 编译Fluid Inference库 - Inference C++ API - Inference实例 - Inference计算优化 @@ -55,62 +54,6 @@ return [program, feed_target_names, fetch_targets] ``` - -## 编译Fluid Inference库 - - - **不需要额外的CMake选项** - - 1、 配置CMake命令,更多配置请参考[源码编译PaddlePaddle](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html) - ```bash - $ git clone https://github.com/PaddlePaddle/Paddle.git - $ cd Paddle - $ mkdir build - $ cd build - $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_PYTHON=ON \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - .. - ``` - - - 2、 编译PaddlePaddle - ```bash - $ make - ``` - - - 3、 部署。执行如下命令将PaddlePaddle Fluid Inference库部署到`your/path/to/paddle_inference_lib`目录。 - ```bash - $ make inference_lib_dist - ``` - -- 目录结构 - - ```bash - $ cd your/path/to/paddle_inference_lib - $ tree - . - |-- paddle - | `-- fluid - | |-- framework - | |-- inference - | | |-- io.h - | | `-- libpaddle_fluid.so - | |-- memory - | |-- platform - | `-- string - |-- third_party - | |-- eigen3 - | `-- install - | |-- gflags - | |-- glog - | `-- protobuf - `-- ... 
- ``` - - 假设`PADDLE_ROOT=your/path/to/paddle_inference_lib`。 - - - ## 链接Fluid Inference库 - 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git)) From a3aca2a3cfeb6ab246ff95987374809be1a3c863 Mon Sep 17 00:00:00 2001 From: fengjiayi <fengjiayi@baidu.com> Date: Thu, 31 May 2018 20:58:26 +0800 Subject: [PATCH 10/24] fix bugs --- paddle/fluid/framework/block_desc.cc | 2 +- paddle/fluid/framework/op_desc.cc | 2 +- paddle/fluid/framework/op_desc.h | 3 ++- paddle/fluid/framework/program_desc.cc | 10 ++++++++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index b15aba9106..e7842e9b81 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -200,7 +200,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) vars_[var_desc.name()].reset(new VarDesc(var_desc)); } for (const proto::OpDesc &op_desc : desc_->ops()) { - ops_.emplace_back(new OpDesc(op_desc, prog, this)); + ops_.emplace_back(new OpDesc(op_desc, this)); } } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 09b67e5a17..f92769192c 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -103,7 +103,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) { need_update_ = true; } -OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) +OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) : desc_(desc), need_update_(false) { // restore inputs_ int input_size = desc_.inputs_size(); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 1a330db7cc..a02d3e2691 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -33,13 +33,14 @@ class OpDesc { OpDesc(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs); - OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block); + OpDesc(const proto::OpDesc &desc, BlockDesc *block); explicit OpDesc(BlockDesc *block) : block_(block) {} OpDesc(const OpDesc &other, BlockDesc *block) { *this = other; block_ = block; + need_update_ = true; } void CopyFrom(const OpDesc &op_desc); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index aa01f9928c..1e01a6e900 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -89,6 +89,16 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); + } + } + } + } } const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { From 5870a6b486537d5f119282a36e23e6ab4be98804 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <yanchunwei@outlook.com> Date: Thu, 31 May 2018 21:46:52 +0800 Subject: [PATCH 11/24] clean docstring_checker.pyc (#11093) --- tools/codestyle/docstring_checker.pyc | Bin 11769 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc deleted file mode 100644 index 
1ce612ca2318ccb9b9f28d51cb93ce8e5e1d0680..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11769
[base85-encoded payload of the deleted docstring_checker.pyc omitted]
From 4fb7cc7f5efcdba8a631449352afce08832c0b35 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 31 May 2018 20:48:45 -0500
Subject: [PATCH 12/24] Move sync_mode device ctx from grpc server (#10881)

---
 benchmark/fluid/kube_gen_job.py | 2 +-
 .../inference/analysis/data_flow_graph.h | 3 +
 .../data_flow_graph_to_fluid_pass_tester.cc | 6 +-
 .../analysis/fluid_to_data_flow_graph_pass.cc | 4 +-
 .../analysis/fluid_to_data_flow_graph_pass.h | 2 +
 .../fluid_to_data_flow_graph_pass_tester.cc | 6 +-
 paddle/fluid/inference/analysis/helper.h | 6 +-
 paddle/fluid/inference/analysis/pass.h | 1 +
 .../inference/analysis/subgraph_splitter.h | 2 +
 paddle/fluid/inference/analysis/ut_helper.h | 1 +
 .../inference/tensorrt/convert/ut_helper.h | 5 +-
 paddle/fluid/operators/detail/CMakeLists.txt | 3 +-
 paddle/fluid/operators/detail/grpc_client.cc | 2 +
 paddle/fluid/operators/detail/grpc_server.cc | 372 +++++++-----------
 paddle/fluid/operators/detail/grpc_server.h | 98 ++---
 .../operators/detail/grpc_server_test.cc | 87 ++--
 .../fluid/operators/detail/request_handler.h | 127 ++++++
 .../operators/detail/request_handler_impl.cc | 115 ++++++
 .../operators/detail/request_handler_impl.h | 64 +++
 paddle/fluid/operators/detail/rpc_server.cc | 113 ++++++
 paddle/fluid/operators/detail/rpc_server.h | 91 +++++
 .../operators/detail/variable_response.h | 4 +-
 paddle/fluid/operators/gen_nccl_id_op.cc | 21 +-
 paddle/fluid/operators/listen_and_serv_op.cc | 211 +++-------
paddle/fluid/operators/listen_and_serv_op.h | 31 +- paddle/fluid/operators/send_barrier_op.cc | 2 + paddle/fluid/operators/test_send_nccl_id.cc | 59 +-- paddle/fluid/platform/nccl_helper.h | 1 + 28 files changed, 886 insertions(+), 553 deletions(-) create mode 100644 paddle/fluid/operators/detail/request_handler.h create mode 100644 paddle/fluid/operators/detail/request_handler_impl.cc create mode 100644 paddle/fluid/operators/detail/request_handler_impl.h create mode 100644 paddle/fluid/operators/detail/rpc_server.cc create mode 100644 paddle/fluid/operators/detail/rpc_server.h diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py index 39ba207fd9..9da8a69af1 100644 --- a/benchmark/fluid/kube_gen_job.py +++ b/benchmark/fluid/kube_gen_job.py @@ -49,7 +49,7 @@ def parse_args(): parser.add_argument( '--fluid', default=1, type=int, help='whether is fluid job') parser.add_argument( - '--rdma', action='store_ture', help='whether mount rdma libs') + '--rdma', action='store_true', help='whether mount rdma libs') parser.add_argument( '--disttype', default="pserver", diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 9f6ce40ede..913e344d37 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -21,7 +21,10 @@ limitations under the License. */ #include <deque> #include <stack> +#include <string> #include <unordered_set> +#include <utility> +#include <vector> #include "paddle/fluid/inference/analysis/graph_traits.h" #include "paddle/fluid/inference/analysis/node.h" diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index 60f159da91..dcee75cee5 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -44,6 +44,6 @@ TEST_F(DFG_Tester, Test) { LOG(INFO) << graph.nodes.size(); } -} // analysis -} // inference -} // paddle +}; // namespace analysis +}; // namespace inference +}; // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index f848a7d1ad..9f67c989cc 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include <string> #include <vector> +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index cd0d4fabaa..33517e57be 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -19,6 +19,8 @@ #pragma once +#include <string> + #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/pass.h" diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index 851c98bef3..817d32c92c 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -32,6 +32,6 @@ TEST_F(DFG_Tester, Init) { LOG(INFO) << '\n' << graph.DotString(); } -} // analysis -} // inference -} // paddle +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 24ea9a4bae..153dca576b 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -50,7 +50,7 @@ struct DataTypeNamer { return dic_.at(x); } - const std::string &repr(size_t &hash) const { + const std::string &repr(size_t &hash) const { // NOLINT PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation"); return dic_.at(hash); } @@ -62,7 +62,9 @@ struct DataTypeNamer { SET_TYPE(float); } - std::unordered_map<decltype(typeid(int).hash_code()), std::string> dic_; + std::unordered_map<decltype(typeid(int).hash_code()), // NOLINT + std::string> + dic_; }; #undef SET_TYPE diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 5c89b1304d..aa0e8667b5 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -16,6 +16,7 @@ limitations under the License. */ #include <glog/logging.h> #include <iosfwd> +#include <string> #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h index ed90a0dcf3..a31afbe693 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.h +++ b/paddle/fluid/inference/analysis/subgraph_splitter.h @@ -18,6 +18,8 @@ limitations under the License. */ #pragma once +#include <vector> + #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/node.h" diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index c86083d121..722fa99a48 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include <gflags/gflags.h> #include <gtest/gtest.h> +#include <string> #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 37fcb5c503..e46c577cda 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -19,6 +19,9 @@ limitations under the License. */ #pragma once +#include <string> +#include <vector> + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -58,7 +61,7 @@ class TRTConvertValidation { public: TRTConvertValidation() = delete; - TRTConvertValidation(int batch_size, int workspace_size = 1 << 10) { + explicit TRTConvertValidation(int batch_size, int workspace_size = 1024) { // create engine. engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); engine_->InitNetwork(); diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index b9a66474c9..cf20530513 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,6 +1,7 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + request_handler_impl.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor + selected_rows memory) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index f7ce778687..da9ca1a0c1 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -205,6 +205,8 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { } bool RPCClient::Wait() { + VLOG(3) << "RPCClient begin Wait()" + << " req_count_:" << req_count_; if (req_count_ <= 0) { return true; } diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 361cc24b5b..e73756d890 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/detail/grpc_server.h" - #include <limits> #include <string> -using ::grpc::ServerAsyncResponseWriter; +#include "paddle/fluid/operators/detail/grpc_server.h" -DEFINE_int32(rpc_server_handle_send_threads, 20, - "Number of threads used to handle send at rpc server."); -DEFINE_int32(rpc_server_handle_get_threads, 20, - "Number of threads used to handle get at rpc server."); -DEFINE_int32(rpc_server_handle_prefetch_threads, 1, - "Number of threads used to handle prefetch at rpc server."); +using ::grpc::ServerAsyncResponseWriter; namespace paddle { namespace operators { @@ -36,49 +29,40 @@ enum CallStatus { PROCESS = 0, FINISH }; class RequestBase { public: explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, bool sync_mode, - const platform::DeviceContext* dev_ctx) + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) : service_(service), cq_(cq), - sync_mode_(sync_mode), status_(PROCESS), - dev_ctx_(dev_ctx) { + request_handler_(request_handler), + req_id_(req_id) { PADDLE_ENFORCE(cq_); } virtual ~RequestBase() {} - virtual void Process() { assert(false); } + virtual void Process() = 0; CallStatus Status() { return status_; } void SetStatus(CallStatus status) { status_ = status; } - virtual std::string GetReqName() { - assert(false); - return ""; - } + virtual std::string GetReqName() = 0; protected: ::grpc::ServerContext ctx_; GrpcService::AsyncService* service_; ::grpc::ServerCompletionQueue* cq_; - const bool sync_mode_; CallStatus status_; - const platform::DeviceContext* dev_ctx_; + RequestHandler* request_handler_; + int req_id_; }; class RequestSend final : public RequestBase { public: explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, bool sync_mode, - framework::Scope* scope, ReceivedQueue* queue, - const platform::DeviceContext* dev_ctx, int req_id) - : RequestBase(service, cq, sync_mode, dev_ctx), - queue_(queue), - responder_(&ctx_), - req_id_(req_id) { - if (sync_mode_) { - request_.reset(new VariableResponse(scope, dev_ctx_, false)); - } else { - request_.reset(new VariableResponse(scope, dev_ctx_, true)); - } + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx(), + !request_handler->sync_mode())); int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, @@ -87,12 +71,17 @@ class RequestSend final : public RequestBase { virtual ~RequestSend() {} - virtual std::string GetReqName() { return request_->Varname(); } + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + std::string varname = GetReqName(); + VLOG(3) << "RequestSend var_name:" << varname; - virtual void Process() { - std::string var_name = GetReqName(); - VLOG(3) << "RequestSend " << var_name; - queue_->Push(std::make_pair(var_name, request_)); + auto scope = request_->GetMutableLocalScope(); + auto invar = request_->GetVar(); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar); status_ = FINISH; responder_.Finish(reply_, ::grpc::Status::OK, @@ -102,105 +91,85 @@ class RequestSend final : public RequestBase { protected: sendrecv::VoidMessage reply_; std::shared_ptr<VariableResponse> 
request_; - ReceivedQueue* queue_; ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_; - int req_id_; }; class RequestGet final : public RequestBase { public: explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, bool sync_mode, - framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - framework::BlockingQueue<MessageWithName>* queue, - int req_id) - : RequestBase(service, cq, sync_mode, dev_ctx), - responder_(&ctx_), - scope_(scope), - queue_(queue), - req_id_(req_id) { + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable); service_->RequestAsyncUnary( method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast<void*>(static_cast<intptr_t>(req_id_))); + reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); } virtual ~RequestGet() {} - virtual std::string GetReqName() { return request_.varname(); } + std::string GetReqName() override { return request_.varname(); } - virtual void Process() { + void Process() override { // proc request. - std::string var_name = request_.varname(); - VLOG(3) << "RequestGet " << var_name; - auto* var = scope_->FindVar(var_name); + std::string varname = request_.varname(); + VLOG(3) << "RequestGet " << varname; + + auto scope = request_handler_->scope(); + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; - if (var_name != FETCH_BARRIER_MESSAGE) { - SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_); + request_handler_->Handle(varname, scope, invar, &outvar); + + if (outvar) { + SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + &reply_); } status_ = FINISH; responder_.Finish(reply_, ::grpc::Status::OK, reinterpret_cast<void*>(static_cast<intptr_t>(req_id_))); - - if (var_name == FETCH_BARRIER_MESSAGE) { - sendrecv::VariableMessage msg; - MessageWithName msg_with_name = std::make_pair(var_name, msg); - queue_->Push(msg_with_name); - } } protected: sendrecv::VariableMessage request_; ::grpc::ByteBuffer reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* scope_; - framework::BlockingQueue<MessageWithName>* queue_; - int req_id_; }; class RequestPrefetch final : public RequestBase { public: explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, bool sync_mode, - framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - framework::Executor* executor, - framework::ProgramDesc* program, - framework::ExecutorPrepareContext* prefetch_ctx, - int req_id) - : RequestBase(service, cq, sync_mode, dev_ctx), + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_), - scope_(scope), - executor_(executor), - program_(program), - prefetch_ctx_(prefetch_ctx), - req_id_(req_id) { - // prefetch always create a new sub scope - request_.reset(new VariableResponse(scope, dev_ctx_, true)); + local_scope_(nullptr) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast<void*>(static_cast<intptr_t>(req_id_))); + reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); } virtual 
~RequestPrefetch() {} - virtual std::string GetReqName() { return request_->Varname(); } + std::string GetReqName() override { return request_->Varname(); } - virtual void Process() { + void Process() override { // prefetch process... + std::string varname = request_->OutVarname(); + VLOG(3) << "RequestPrefetch " << varname; + + auto scope = request_->GetMutableLocalScope(); + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; - std::string var_name = request_->OutVarname(); - VLOG(3) << "RequestPrefetch " << var_name; - auto var_desc = program_->Block(0).FindVar(var_name); - framework::Scope* local_scope = request_->GetMutableLocalScope(); - auto* var = local_scope->FindVar(var_name); - InitializeVariable(var, var_desc->GetType()); - executor_->RunPreparedContext(prefetch_ctx_, local_scope); + request_handler_->Handle(varname, scope, invar, &outvar); - SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_); + SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + &reply_); status_ = FINISH; responder_.Finish(reply_, ::grpc::Status::OK, @@ -211,202 +180,169 @@ class RequestPrefetch final : public RequestBase { std::shared_ptr<VariableResponse> request_; ::grpc::ByteBuffer reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* scope_; - framework::Executor* executor_; - framework::ProgramDesc* program_; - framework::ExecutorPrepareContext* prefetch_ctx_; - int req_id_; + framework::Scope* local_scope_; }; -void AsyncGRPCServer::WaitClientGet(int count) { - int fetch_barriers = 0; - while (fetch_barriers < count) { - auto msg = var_get_queue_.Pop(); - if (msg.first == FETCH_BARRIER_MESSAGE) { - fetch_barriers++; - } - } -} - void AsyncGRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; std::unique_lock<std::mutex> lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; } -void AsyncGRPCServer::RunSyncUpdate() { +void AsyncGRPCServer::StartServer() { ::grpc::ServerBuilder builder; - builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(), + builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), &selected_port_); + builder.SetMaxSendMessageSize(std::numeric_limits<int>::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max()); builder.RegisterService(&service_); - cq_send_ = builder.AddCompletionQueue(); - cq_get_ = builder.AddCompletionQueue(); - cq_prefetch_ = builder.AddCompletionQueue(); + for (auto t : rpc_call_map_) { + rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); + } server_ = builder.BuildAndStart(); - LOG(INFO) << "Server listening on " << address_ + LOG(INFO) << "Server listening on " << bind_address_ << " selected port: " << selected_port_; - std::function<void(int)> send_register = std::bind( - &AsyncGRPCServer::TryToRegisterNewSendOne, this, std::placeholders::_1); - std::function<void(int)> get_register = std::bind( - &AsyncGRPCServer::TryToRegisterNewGetOne, this, std::placeholders::_1); - std::function<void(int)> prefetch_register = - std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this, - std::placeholders::_1); + std::function<void(const std::string&, int)> f = + std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, + std::placeholders::_1, std::placeholders::_2); - for (int i = 0; i < kSendReqsBufSize; ++i) { - TryToRegisterNewSendOne(i); - } - for (int i = 0; i < kGetReqsBufSize; ++i) { - 
TryToRegisterNewGetOne(i); - } - for (int i = 0; i < kPrefetchReqsBufSize; ++i) { - TryToRegisterNewPrefetchOne(i); - } + for (auto& t : rpc_call_map_) { + auto& rpc_name = t.first; + auto& cq = rpc_cq_[rpc_name]; + auto threadnum = rpc_thread_num_[rpc_name]; + auto& reqs = rpc_reqs_[rpc_name]; - for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) { - t_sends_.emplace_back( - new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, - cq_send_.get(), "cq_send", send_register))); - } - for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) { - t_gets_.emplace_back( - new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, - cq_get_.get(), "cq_get", get_register))); - } - for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) { - t_prefetchs_.emplace_back(new std::thread( - std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), - "cq_prefetch", prefetch_register))); + reqs.reserve(kRequestBufSize); + + for (int i = 0; i < kRequestBufSize; i++) { + TryToRegisterNewOne(rpc_name, i); + } + + for (int i = 0; i < threadnum; i++) { + rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( + &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); + VLOG(3) << t.first << " creates threads!"; + } } + { std::lock_guard<std::mutex> lock(this->mutex_ready_); ready_ = 1; } condition_ready_.notify_all(); + // wait server server_->Wait(); - for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) { - t_sends_[i]->join(); - } - for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) { - t_gets_[i]->join(); - } - for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) { - t_prefetchs_[i]->join(); + + for (auto& t : rpc_threads_) { + auto& threads = t.second; + for (size_t i = 0; i < threads.size(); ++i) { + threads[i]->join(); + VLOG(3) << t.first << " threads ends!"; + } } } void AsyncGRPCServer::ShutdownQueue() { - std::unique_lock<std::mutex> lock(cq_mutex_); - cq_send_->Shutdown(); - cq_get_->Shutdown(); - cq_prefetch_->Shutdown(); + for (auto& t : rpc_cq_) { + t.second->Shutdown(); + VLOG(3) << t.first << " shutdown!"; + } } -// This URL explains why shutdown is complicate: -void AsyncGRPCServer::ShutDown() { +void AsyncGRPCServer::ShutDownImpl() { + std::unique_lock<std::mutex> lock(cq_mutex_); is_shut_down_ = true; ShutdownQueue(); + + VLOG(3) << "server_ shutdown!"; server_->Shutdown(); } -void AsyncGRPCServer::TryToRegisterNewSendOne(int i) { +void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, + int req_id) { std::unique_lock<std::mutex> lock(cq_mutex_); if (is_shut_down_) { VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; return; } - RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_, - scope_, &var_recv_queue_, dev_ctx_, i); - send_reqs_[i] = static_cast<RequestBase*>(send); - VLOG(4) << "Create RequestSend status:" << send->Status(); -} -void AsyncGRPCServer::TryToRegisterNewGetOne(int req_id) { - std::unique_lock<std::mutex> lock(cq_mutex_); - if (is_shut_down_) { - VLOG(3) << "shutdown, do not TryToRegisterNewGetOne"; - return; + VLOG(4) << "register send rpc_name:" << rpc_name + << ", handler:" << rpc_call_map_[kRequestSend]; + + auto& reqs = rpc_reqs_[rpc_name]; + auto& handler = rpc_call_map_[rpc_name]; + auto& cq = rpc_cq_[rpc_name]; + + RequestBase* b = nullptr; + if (rpc_name == kRequestSend) { + b = new RequestSend(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGet) { + b = new RequestGet(&service_, cq.get(), handler, 
req_id); + } else if (rpc_name == kRequestPrefetch) { + b = new RequestPrefetch(&service_, cq.get(), handler, req_id); + } else { + PADDLE_ENFORCE(false, "not surpported rpc"); } - RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_, - dev_ctx_, &var_get_queue_, req_id); - get_reqs_[req_id] = static_cast<RequestBase*>(get); - VLOG(4) << "Create RequestGet status:" << get->Status(); -} -void AsyncGRPCServer::TryToRegisterNewPrefetchOne(int req_id) { - std::unique_lock<std::mutex> lock(cq_mutex_); - if (is_shut_down_) { - VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne"; - return; - } - RequestPrefetch* prefetch = new RequestPrefetch( - &service_, cq_prefetch_.get(), sync_mode_, scope_, dev_ctx_, executor_, - program_, prefetch_ctx_.get(), req_id); - prefetch_reqs_[req_id] = static_cast<RequestBase*>(prefetch); + reqs[req_id] = b; - VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); + VLOG(4) << "Create RequestSend status:" << b->Status(); } -// FIXME(typhoonzero): change cq_name to enum. void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& cq_name, - std::function<void(int)> TryToRegisterNewOne) { + ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, + std::function<void(const std::string&, int)> TryToRegisterNewOne) { void* tag = NULL; bool ok = false; while (true) { - VLOG(3) << "HandleRequest for " << cq_name << " wait Next"; + VLOG(3) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - LOG(INFO) << cq_name << " CompletionQueue shutdown!"; + LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; break; } - VLOG(3) << "HandleRequest for " << cq_name << " get Next"; - int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag)); - if (sync_mode_) { - // FIXME(typhoonzero): de-couple the barriers with recv_op - if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1); - if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0); - VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond"; - } + int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag)); + VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id + << " get next"; + auto& reqs = rpc_reqs_[rpc_name]; RequestBase* base = nullptr; { - std::lock_guard<std::mutex> l(cq_mutex_); - if (cq_name == "cq_get") { - base = get_reqs_[req_id]; - } else if (cq_name == "cq_send") { - base = send_reqs_[req_id]; - } else if (cq_name == "cq_prefetch") { - base = prefetch_reqs_[req_id]; - } + PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize); + std::unique_lock<std::mutex> lock(cq_mutex_); + base = reqs[req_id]; } + // reference: // https://github.com/tensorflow/tensorflow/issues/5596 // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << cq_name << " recv no regular event:argument name[" + LOG(WARNING) << "completion queue:" << rpc_name + << " recv no regular event:argument name[" << base->GetReqName() << "]"; - TryToRegisterNewOne(req_id); + TryToRegisterNewOne(rpc_name, req_id); delete base; continue; } + VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id + << ", status:" << base->Status(); + switch (base->Status()) { case PROCESS: { base->Process(); - VLOG(4) << cq_name << " PROCESS status:" << base->Status(); break; } case FINISH: { - TryToRegisterNewOne(req_id); - VLOG(4) << cq_name << " FINISH status:" << base->Status(); + TryToRegisterNewOne(rpc_name, req_id); delete base; 
break; } @@ -415,20 +351,6 @@ void AsyncGRPCServer::HandleRequest( } } -void AsyncGRPCServer::WaitCond(int cond) { - std::unique_lock<std::mutex> lock(this->barrier_mutex_); - barrier_condition_.wait(lock, - [=] { return this->barrier_cond_step_ == cond; }); -} - -void AsyncGRPCServer::SetCond(int cond) { - { - std::lock_guard<std::mutex> lock(this->barrier_mutex_); - barrier_cond_step_ = cond; - } - barrier_condition_.notify_all(); -} - } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index bdff9801a9..d1fcbc414f 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include <map> +#include <set> #include <string> #include <thread> // NOLINT #include <utility> @@ -28,6 +30,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/grpc_service.h" +#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" @@ -37,106 +41,48 @@ namespace paddle { namespace operators { namespace detail { -typedef std::pair<std::string, std::shared_ptr<VariableResponse>> - ReceivedMessage; -typedef framework::BlockingQueue<ReceivedMessage> ReceivedQueue; - -typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName; class RequestBase; -class AsyncGRPCServer final { +class AsyncGRPCServer final : public RPCServer { public: - explicit AsyncGRPCServer(const std::string &address, bool sync_mode) - : address_(address), sync_mode_(sync_mode), ready_(0) {} - - ~AsyncGRPCServer() {} - void WaitServerReady(); - void RunSyncUpdate(); - - // functions to sync server barrier status. 
- void WaitCond(int cond); - void SetCond(int cond); - void WaitClientGet(int count); - - void SetScope(framework::Scope *scope) { scope_ = scope; } - - void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } - - void SetProgram(framework::ProgramDesc *program) { program_ = program; } - - void SetExecutor(framework::Executor *executor) { executor_ = executor; } - - void SetPrefetchPreparedCtx( - std::unique_ptr<framework::ExecutorPrepareContext> prepared) { - prefetch_ctx_.reset(prepared.release()); - } - - int GetSelectedPort() const { return selected_port_; } - - const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } + explicit AsyncGRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} - void Push(const std::string &msg_name) { - this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr)); - } + virtual ~AsyncGRPCServer() {} + void WaitServerReady() override; + void StartServer() override; - void ShutDown(); + private: + void HandleRequest( + ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, + std::function<void(const std::string&, int)> TryToRegisterNewOne); - protected: - void HandleRequest(::grpc::ServerCompletionQueue *cq, - const std::string &cq_name, - std::function<void(int)> TryToRegisterNewOne); - void TryToRegisterNewSendOne(int req_id); - void TryToRegisterNewGetOne(int req_id); - void TryToRegisterNewPrefetchOne(int req_id); + void TryToRegisterNewOne(const std::string& rpc_name, int req_id); void ShutdownQueue(); + void ShutDownImpl() override; private: - static const int kSendReqsBufSize = 100; - static const int kGetReqsBufSize = 100; - static const int kPrefetchReqsBufSize = 10; + static const int kRequestBufSize = 100; std::mutex cq_mutex_; volatile bool is_shut_down_ = false; - std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; - std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; - std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_; - - RequestBase *send_reqs_[kSendReqsBufSize]; - RequestBase *get_reqs_[kGetReqsBufSize]; - RequestBase *prefetch_reqs_[kPrefetchReqsBufSize]; GrpcService::AsyncService service_; std::unique_ptr<::grpc::Server> server_; - std::string address_; - const bool sync_mode_; - framework::Scope *scope_; - const platform::DeviceContext *dev_ctx_; - - // received variable from RPC, operators fetch variable from this queue. - framework::BlockingQueue<MessageWithName> var_get_queue_; - // client send variable to this queue. 
- ReceivedQueue var_recv_queue_; - // condition of the sub program std::mutex barrier_mutex_; mutable int barrier_cond_step_; std::condition_variable barrier_condition_; - std::vector<std::unique_ptr<std::thread>> t_sends_; - std::vector<std::unique_ptr<std::thread>> t_gets_; - std::vector<std::unique_ptr<std::thread>> t_prefetchs_; - - std::unique_ptr<std::thread> t_prefetch_; - - std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_; - framework::ProgramDesc *program_; - framework::Executor *executor_; - int selected_port_; - std::mutex mutex_ready_; std::condition_variable condition_ready_; + int ready_; + + std::map<std::string, std::unique_ptr<::grpc::ServerCompletionQueue>> rpc_cq_; + std::map<std::string, std::vector<std::unique_ptr<std::thread>>> rpc_threads_; + std::map<std::string, std::vector<RequestBase*>> rpc_reqs_; }; }; // namespace detail diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc index 350a7ee123..f97f638701 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -24,13 +24,16 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/request_handler_impl.h" + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace detail = paddle::operators::detail; USE_OP(lookup_table); -std::unique_ptr<detail::AsyncGRPCServer> rpc_service_; +std::unique_ptr<detail::AsyncGRPCServer> g_rpc_service; +std::unique_ptr<detail::RequestHandler> g_req_handler; framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); @@ -88,8 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, } } -void StartServer(const std::string& endpoint) { - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, true)); +void StartServer() { framework::ProgramDesc program; framework::Scope scope; platform::CPUPlace place; @@ -99,42 +101,59 @@ void StartServer(const std::string& endpoint) { auto prepared = exe.Prepare(program, block->ID()); InitTensorsOnServer(&scope, &place, 10); - rpc_service_->SetProgram(&program); - rpc_service_->SetPrefetchPreparedCtx(std::move(prepared)); - rpc_service_->SetDevCtx(&ctx); - rpc_service_->SetScope(&scope); - rpc_service_->SetExecutor(&exe); + g_req_handler->SetProgram(&program); + g_req_handler->SetPrefetchPreparedCtx(std::move(prepared)); + g_req_handler->SetDevCtx(&ctx); + g_req_handler->SetScope(&scope); + g_req_handler->SetExecutor(&exe); + + g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); + g_req_handler->SetRPCServer(g_rpc_service.get()); + + std::thread server_thread( + std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); - rpc_service_->RunSyncUpdate(); + // FIXME(gongwb): don't use hard time. 
+ sleep(10); + LOG(INFO) << "got nccl id and stop server..."; + g_rpc_service->ShutDown(); + server_thread.join(); } -TEST(PREFETCH, DISABLED_CPU) { - // start up a server instance backend - std::thread server_thread(StartServer, "127.0.0.1:8889"); - sleep(2); +TEST(PREFETCH, CPU) { + g_req_handler.reset(new detail::RequestPrefetchHandler(true)); + g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + + std::thread server_thread(StartServer); + g_rpc_service->WaitServerReady(); + + detail::RPCClient client; + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + framework::Scope scope; platform::CPUPlace place; platform::CPUDeviceContext ctx(place); - // create var on local scope - int64_t rows_numel = 5; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("ids"); - std::string out_var_name("out"); - - auto client = detail::RPCClient::GetInstance(); - client->AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, - out_var_name); - client->Wait(); - - auto var = scope.Var(out_var_name); - auto value = var->GetMutable<framework::SelectedRows>()->value(); - auto ptr = value.mutable_data<float>(place); - - rpc_service_->ShutDown(); - server_thread.join(); - rpc_service_.reset(nullptr); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2)); + { + // create var on local scope + int64_t rows_numel = 5; + InitTensorsOnClient(&scope, &place, rows_numel); + std::string in_var_name("ids"); + std::string out_var_name("out"); + + client.AsyncPrefetchVariable(ep, ctx, scope, in_var_name, out_var_name); + client.Wait(); + auto var = scope.Var(out_var_name); + auto value = var->GetMutable<framework::SelectedRows>()->value(); + auto ptr = value.mutable_data<float>(place); + + for (int64_t i = 0; i < rows_numel; ++i) { + EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2)); + } } + + server_thread.join(); + LOG(INFO) << "begin reset"; + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); } diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h new file mode 100644 index 0000000000..4bc5e7f10e --- /dev/null +++ b/paddle/fluid/operators/detail/request_handler.h @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <time.h> + +#include <functional> +#include <string> +#include <utility> +#include <vector> + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +constexpr char kRequestSend[] = "RequestSend"; +constexpr char kRequestGet[] = "RequestGet"; +constexpr char kRequestPrefetch[] = "RequestPrefetch"; + +class RPCServer; + +class RequestHandler { + public: + explicit RequestHandler(bool sync_mode) + : sync_mode_(sync_mode), + dev_ctx_(nullptr), + executor_(nullptr), + scope_(nullptr), + program_(nullptr), + rpc_server_(nullptr) {} + + virtual ~RequestHandler() {} + + // Set attributes. + void SetScope(framework::Scope* scope) { scope_ = scope; } + void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc* program) { program_ = program; } + void SetExecutor(framework::Executor* executor) { executor_ = executor; } + void SetPrefetchPreparedCtx( + std::unique_ptr<framework::ExecutorPrepareContext> prepared) { + prefetch_ctx_.reset(prepared.release()); + } + + // Used for async. + void SetGradToPreparedCtx( + std::unordered_map< + std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) { + grad_to_prepared_ctx_ = g; + } + + void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } + + // Get attributes. + bool sync_mode() { return sync_mode_; } + framework::Scope* scope() { return scope_; } + const platform::DeviceContext* dev_ctx() { return dev_ctx_; } + framework::ExecutorPrepareContext* prefetch_ctx() { + return prefetch_ctx_.get(); + } + framework::ProgramDesc* program() { return program_; } + framework::Executor* executor() { return executor_; } + std::vector<framework::Variable*>& sparse_vars() { return sparse_vars_; } + + // This function processes user's rpc request. + // The implemention is in request_handler_impl. + // example: + // std::string varname = request_.varname(); + // + // auto scope = request_handler_->scope(); + // auto invar = scope->FindVar(varname); + // framework::Variable* outvar = nullptr; + // + // request_handler_->Handle(varname, scope, invar, &outvar); + // if (outvar) { + // SerializeToByteBuffer(varname, outvar, + // *request_handler_->dev_ctx(), &reply_); + // } + virtual bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, + framework::Variable** outvar) = 0; + + protected: + const bool sync_mode_; + + const platform::DeviceContext* dev_ctx_; + framework::Executor* executor_; + framework::Scope* scope_; + framework::ProgramDesc* program_; + std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_; + + // Used for async. 
+ std::unordered_map<std::string, + std::shared_ptr<framework::ExecutorPrepareContext>>* + grad_to_prepared_ctx_; + + // Record received sparse variables, so that + // we could reset those after execute optimize program + std::vector<framework::Variable*> sparse_vars_; + RPCServer* rpc_server_; + + std::mutex sparse_var_mutex_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc new file mode 100644 index 0000000000..f16c06d52f --- /dev/null +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <iostream> +#include <string> +#include <vector> + +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/variable_response.h" + +namespace paddle { +namespace operators { +namespace detail { + +bool RequestSendHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar) { + VLOG(4) << "RequestSendHandler:" << varname; + + // Async + if (!sync_mode_) { + try { + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + } catch (std::exception& e) { + LOG(ERROR) << "async: run sub program error " << e.what(); + return false; + } + return true; + } + + // Sync + if (varname == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "sync: recv batch barrier message"; + rpc_server_->IncreaseBatchBarrier(kRequestSend); + } else { + VLOG(3) << "sync: received var_name: " << varname; + if (sync_mode_) { + rpc_server_->WaitCond(kRequestSend); + } + + if (invar == nullptr) { + LOG(ERROR) << "sync: Can not find server side var: " << varname; + PADDLE_THROW("sync: Can not find server side var"); + return false; + } + + if (invar->IsType<framework::SelectedRows>()) { + std::unique_lock<std::mutex> lock(sparse_var_mutex_); + sparse_vars_.push_back(invar); + } + } + + return true; +} + +bool RequestGetHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar) { + VLOG(4) << "RequestGetHandler:" << varname; + + if (varname != FETCH_BARRIER_MESSAGE) { + if (sync_mode_) { + rpc_server_->WaitCond(kRequestGet); + } + *outvar = scope_->FindVar(varname); + return true; + } + + // FETCH_BARRIER_MESSAGE + if (sync_mode_) { + VLOG(3) << "sync: recv fetch barrier message"; + rpc_server_->IncreaseBatchBarrier(kRequestGet); + } + + return true; +} + +bool 
RequestPrefetchHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar) { + VLOG(4) << "RequestPrefetchHandler " << varname; + + auto var_desc = program_->Block(0).FindVar(varname); + *outvar = scope->FindVar(varname); + InitializeVariable(*outvar, var_desc->GetType()); + executor_->RunPreparedContext(prefetch_ctx_.get(), scope); + + return true; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h new file mode 100644 index 0000000000..8d0c62232b --- /dev/null +++ b/paddle/fluid/operators/detail/request_handler_impl.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <time.h> + +#include <functional> +#include <string> +#include <utility> +#include <vector> + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +class RequestSendHandler final : public RequestHandler { + public: + explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestSendHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar) override; +}; + +class RequestGetHandler final : public RequestHandler { + public: + explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestGetHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar) override; +}; + +class RequestPrefetchHandler final : public RequestHandler { + public: + explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestPrefetchHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar) override; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc new file mode 100644 index 0000000000..448763372a --- /dev/null +++ b/paddle/fluid/operators/detail/rpc_server.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <fstream> +#include <iostream> +#include <limits> +#include <string> + +#include "paddle/fluid/operators/detail/rpc_server.h" + +namespace paddle { +namespace operators { +namespace detail { + +void RPCServer::ShutDown() { + LOG(INFO) << "RPCServer ShutDown "; + ShutDownImpl(); + + exit_flag_ = true; + barrier_cond_.notify_all(); + rpc_cond_.notify_all(); +} + +void RPCServer::SavePort() const { + auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); + std::ofstream port_file; + port_file.open(file_path); + port_file << selected_port_; + port_file.close(); + VLOG(4) << "selected port written to " << file_path; +} + +void RPCServer::WaitBarrier(const std::string& rpc_name) { + std::unique_lock<std::mutex> lock(this->mutex_); + barrier_cond_.wait(lock, [=] { + return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); + }); + + VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name]; +} + +void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + int b = 0; + { + std::unique_lock<std::mutex> lock(mutex_); + b = ++barrier_counter_[rpc_name]; + } + + VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name + << ", barrier_count:" << b << ", fan_in" << client_num_; + + if (b >= client_num_) { + barrier_cond_.notify_all(); + } +} + +void RPCServer::ResetBarrierCounter() { + VLOG(3) << "RPCServer ResetBarrierCounter "; + std::unique_lock<std::mutex> lock(mutex_); + for (auto& t : barrier_counter_) { + t.second = 0; + } +} + +void RPCServer::RegisterRPC(const std::string& rpc_name, + RequestHandler* handler, int thread_num) { + rpc_call_map_[rpc_name] = handler; + rpc_thread_num_[rpc_name] = thread_num; + + static int cond = -1; + rpc_cond_map_[rpc_name] = ++cond; + VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler + << ", cond:" << rpc_cond_map_[rpc_name]; +} + +void RPCServer::SetCond(const std::string& rpc_name) { + VLOG(3) << "RPCServer SetCond " << rpc_name; + { + std::unique_lock<std::mutex> lock(mutex_); + cur_cond_ = rpc_cond_map_[rpc_name]; + } + + rpc_cond_.notify_all(); +} + +void RPCServer::WaitCond(const std::string& rpc_name) { + VLOG(3) << "RPCServer WaitCond " << rpc_name; + int cond = 0; + { + std::unique_lock<std::mutex> lock(mutex_); + cond = rpc_cond_map_[rpc_name]; + } + + std::unique_lock<std::mutex> lock(mutex_); + rpc_cond_.wait( + lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h new file mode 100644 index 0000000000..c2e7ae706c --- /dev/null +++ b/paddle/fluid/operators/detail/rpc_server.h @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <set> +#include <string> +#include <thread> // NOLINT +#include <utility> +#include <vector> +#include "paddle/fluid/operators/detail/request_handler.h" + +namespace paddle { +namespace operators { +namespace detail { + +class RPCServer { + public: + explicit RPCServer(const std::string& address, int client_num) + : cur_cond_(0), + bind_address_(address), + exit_flag_(false), + selected_port_(0), + client_num_(client_num) {} + + virtual ~RPCServer() {} + virtual void StartServer() = 0; + virtual void WaitServerReady() = 0; + + void ShutDown(); + + bool IsExit() { return exit_flag_.load(); } + + int GetSelectedPort() const { return selected_port_; } + void SavePort() const; + + // RegisterRPC, register the rpc method name to a handler + // class, and auto generate a condition id for this call + // to be used for the barrier. + void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, + int thread_num = 5); + + // Wait util all the clients have reached the barrier for one + // rpc method. This function should be called in the + // RequestHandler if you want to run the server/client in a + // synchronous mode. + void WaitBarrier(const std::string& rpc_name); + + void SetCond(const std::string& rpc_name); + void WaitCond(const std::string& rpc_name); + void IncreaseBatchBarrier(const std::string rpc_name); + void ResetBarrierCounter(); + + protected: + virtual void ShutDownImpl() = 0; + + private: + std::mutex mutex_; + std::unordered_map<std::string, int> barrier_counter_; + std::condition_variable barrier_cond_; + + std::unordered_map<std::string, int> rpc_cond_map_; + std::atomic<int> cur_cond_; + std::condition_variable rpc_cond_; + + protected: + std::string bind_address_; + std::atomic<int> exit_flag_; + int selected_port_; + + const int client_num_; + + std::unordered_map<std::string, RequestHandler*> rpc_call_map_; + std::unordered_map<std::string, int> rpc_thread_num_; + friend class RequestHandler; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index bf624da2a6..69cfd784f8 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -67,8 +67,8 @@ class VariableResponse { framework::Scope* GetMutableLocalScope() const { return local_scope_; } - inline std::string Varname() { return meta_.varname(); } - inline std::string OutVarname() { return meta_.out_varname(); } + inline std::string Varname() const { return meta_.varname(); } + inline std::string OutVarname() const { return meta_.out_varname(); } // should call parse first. framework::Variable* GetVar() { diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index a5678f6346..4bce2d322d 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" namespace paddle { @@ -75,19 +76,23 @@ class GenNCCLIdOp : public framework::OperatorBase { // NOTE: Can not use unique_ptr here because the default // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. - detail::AsyncGRPCServer rpc_service(endpoint, true); + detail::RequestSendHandler rpc_h(true); + detail::AsyncGRPCServer rpc_service(endpoint, 1); + rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h); + rpc_h.SetRPCServer(&rpc_service); + framework::ProgramDesc empty_program; framework::Executor executor(dev_ctx.GetPlace()); - rpc_service.SetScope(scope); - rpc_service.SetDevCtx(&dev_ctx); - rpc_service.SetProgram(&empty_program); - rpc_service.SetExecutor(&executor); + rpc_h.SetScope(scope); + rpc_h.SetDevCtx(&dev_ctx); + rpc_h.SetProgram(&empty_program); + rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, &rpc_service)); - rpc_service.SetCond(0); + std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service)); + rpc_service.SetCond(detail::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - auto recv = rpc_service.Get(); + rpc_service.WaitBarrier(detail::kRequestSend); VLOG(3) << "got nccl id and stop server..."; rpc_service.ShutDown(); VLOG(3) << "rpc server stopped"; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index df5f229acd..71e75c2532 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,14 +19,16 @@ limitations under the License. 
*/ #include <thread> // NOLINT #include <vector> +#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) { - service->RunSyncUpdate(); +void RunServer(std::shared_ptr<detail::RPCServer> service) { + service->StartServer(); VLOG(4) << "RunServer thread end"; } static void split(const std::string &str, char sep, @@ -67,8 +69,6 @@ static void ParallelExecuteBlocks( for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -std::atomic_int ListenAndServOp::selected_port_{0}; - ListenAndServOp::ListenAndServOp(const std::string &type, const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, @@ -78,7 +78,6 @@ ListenAndServOp::ListenAndServOp(const std::string &type, ListenAndServOp::~ListenAndServOp() { Stop(); } void ListenAndServOp::Stop() { - rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); rpc_service_->ShutDown(); server_thread_->join(); auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); @@ -87,26 +86,13 @@ void ListenAndServOp::Stop() { void ListenAndServOp::SavePort() const { // NOTE: default write file to /tmp/paddle.selected_port - selected_port_ = rpc_service_->GetSelectedPort(); - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_.load(); - port_file.close(); - VLOG(4) << "selected port written to " << file_path; -} - -void ListenAndServOp::WaitServerReady() { - while (selected_port_.load() == 0) { - } + rpc_service_->SavePort(); } void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, framework::BlockDesc *prefetch_block) const { - auto fan_in = Attr<int>("Fanin"); - size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); @@ -121,49 +107,24 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, optimize_prepared.begin(), std::shared_ptr<framework::ExecutorPrepareContext>(nullptr)); - bool exit_flag = false; + rpc_service_->ResetBarrierCounter(); // Record received sparse variables, so that // we could reset those after execute optimize program std::vector<framework::Variable *> sparse_vars; - while (!exit_flag && !SignalHandler::IsProgramExit()) { + while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. 
- rpc_service_->SetCond(0); - size_t recv_var_cnt = 0; - int batch_barrier = 0; - while (batch_barrier != fan_in) { - const detail::ReceivedMessage v = rpc_service_->Get(); - auto recv_var_name = v.first; - if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { - LOG(INFO) << "received terminate message and exit"; - exit_flag = true; - break; - } else if (recv_var_name == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "recv batch barrier message"; - batch_barrier++; - continue; - } else { - VLOG(3) << "received grad: " << recv_var_name; - recv_var_cnt++; - auto var = v.second->GetVar(); - if (var == nullptr) { - LOG(ERROR) << "Can not find server side var: " << recv_var_name; - PADDLE_THROW("Can not find server side var"); - } - if (var->IsType<framework::SelectedRows>()) { - sparse_vars.push_back(var); - } - } - } - if (exit_flag) { - rpc_service_->SetCond(1); - rpc_service_->ShutDown(); + rpc_service_->SetCond(detail::kRequestSend); + rpc_service_->WaitBarrier(detail::kRequestSend); + + if (rpc_service_->IsExit()) { + LOG(WARNING) << "get exit!rpc_processor break!"; + rpc_service_->SetCond(detail::kRequestGet); break; } // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads // and this will still work. - // The optimize blocks which have the same parent ID would run parallel // TODO(Yancey1989): need to use ParallelExecutor for future int32_t last_parent_blkid = program->Block(1).Parent(); @@ -194,52 +155,18 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear(); } - rpc_service_->SetCond(1); - // FIXME(typhoonzero): use another condition to sync wait clients get. - rpc_service_->WaitClientGet(fan_in); - sparse_vars.clear(); + rpc_service_->SetCond(detail::kRequestGet); + rpc_service_->WaitBarrier(detail::kRequestGet); + rpc_service_->ResetBarrierCounter(); } // while(true) } -static void AsyncUpdateThread( - const std::string &var_name, const bool &exit_flag, - const std::shared_ptr<detail::ReceivedQueue> &queue, - framework::Executor *executor, - framework::ExecutorPrepareContext *prepared) { - VLOG(3) << "update thread for " << var_name << " started"; - while (!exit_flag && !SignalHandler::IsProgramExit()) { - const detail::ReceivedMessage v = queue->Pop(); - if (SignalHandler::IsProgramExit()) { - VLOG(3) << "update thread for " << var_name << " exit"; - break; - } - auto recv_var_name = v.first; - VLOG(4) << "async update " << recv_var_name; - auto var = v.second->GetVar(); - if (var == nullptr) { - LOG(ERROR) << "Can not find server side var: " << recv_var_name; - PADDLE_THROW("Can not find server side var"); - } - auto fs = framework::Async([var_name, &executor, &v, prepared] { - try { - executor->RunPreparedContext(prepared, - v.second->GetMutableLocalScope()); - } catch (const std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - }); - fs.wait(); - } -} - void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program) const { VLOG(3) << "RunAsyncLoop in"; // grad name to block id std::unordered_map<std::string, int32_t> grad_to_block_id; std::unordered_map<int32_t, std::string> id_to_grad; - std::unordered_map<std::string, std::shared_ptr<detail::ReceivedQueue>> - grad_to_queue; auto grad_to_block_id_str = Attr<std::vector<std::string>>("grad_to_block_id"); @@ -249,13 +176,9 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, VLOG(3) << "after split, grad = " << pieces[0] << ", id=" << pieces[1]; 
PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0); + int block_id = std::stoi(pieces[1]); grad_to_block_id[pieces[0]] = block_id; - std::shared_ptr<detail::ReceivedQueue> queue = - std::make_shared<detail::ReceivedQueue>(); - grad_to_queue[pieces[0]] = queue; - // record blocking queue in SignalHandler - SignalHandler::RegisterBlockingQueue(queue); id_to_grad[block_id] = pieces[0]; } size_t num_blocks = program->Size(); @@ -274,39 +197,36 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i]; } - bool exit_flag = false; + request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); + request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); + request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); - VLOG(3) << "start async optimize threads"; - std::vector<std::future<void>> fs; - for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) { - std::string grad_name = iter->first; - VLOG(3) << "create async update thread for " << grad_name; - fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor, - &grad_to_queue, &grad_to_prepared_ctx]() { - AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name], - executor, grad_to_prepared_ctx[grad_name].get()); - })); - } VLOG(3) << "RunAsyncLoop into while"; - while (!exit_flag && !SignalHandler::IsProgramExit()) { - const detail::ReceivedMessage v = rpc_service_->Get(); - auto recv_var_name = v.first; - if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { - LOG(INFO) << "received terminate message and exit"; - exit_flag = true; + while (true) { + if (rpc_service_->IsExit()) { + LOG(INFO) << "get exit!rpc_processor break!"; break; - } else { - VLOG(3) << "received grad: " << recv_var_name; - grad_to_queue[recv_var_name]->Push(v); } - if (exit_flag) { - rpc_service_->ShutDown(); - break; - } + sleep(1); } // while(true) } +static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope, + platform::DeviceContext *dev_ctx, + framework::Executor *executor, + framework::ProgramDesc *program, + framework::ExecutorPrepareContext *prefetch_ctx, + detail::RPCServer *rpc_server) { + h->SetScope(scope); + h->SetDevCtx(dev_ctx); + h->SetExecutor(executor); + h->SetProgram(program); + h->SetPrefetchPreparedCtx(std::move( + std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx))); + h->SetRPCServer(rpc_server); +} + void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { // Mark this as PS that it should decide profiling by listening from trainer. 
@@ -316,27 +236,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, framework::Scope &recv_scope = scope.NewScope(); bool sync_mode = Attr<bool>("sync_mode"); + auto fan_in = Attr<int>("Fanin"); PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr<std::string>("endpoint"); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, sync_mode)); + LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in + << ", end_point:" << endpoint; + + // request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode)); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in)); + request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); + request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); + request_prefetch_handler_.reset( + new detail::RequestPrefetchHandler(sync_mode)); + + rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); + rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); + rpc_service_->RegisterRPC(detail::kRequestPrefetch, + request_prefetch_handler_.get()); auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock); auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock); auto *program = optimize_block->Program(); framework::Executor executor(dev_place); - // prepare rpc_service - rpc_service_->SetScope(&recv_scope); - rpc_service_->SetDevCtx(&dev_ctx); - rpc_service_->SetProgram(program); - rpc_service_->SetExecutor(&executor); - // prepare for prefetch VLOG(3) << "prefetch block id is " << prefetch_block->ID(); auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); - rpc_service_->SetPrefetchPreparedCtx(std::move(prefetch_prepared)); + + auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, + &dev_ctx, &executor, program, prefetch_prepared.release(), + rpc_service_.get()); + + f(request_send_handler_.get()); + f(request_get_handler_.get()); + f(request_prefetch_handler_.get()); // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -348,8 +283,6 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, signal(SIGTERM, SignalHandler::StopAndExit); // Write to a file of server selected port for python use. 
- std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port", - static_cast<int>(::getpid())); SavePort(); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); @@ -385,27 +318,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { } }; -bool SignalHandler::program_exit_flag_ = false; - -SignalHandler::BlockingQueueSet SignalHandler::blocking_queue_set_{}; - void SignalHandler::StopAndExit(int signal_num) { VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit"; - - program_exit_flag_ = true; - - // awake all blocking queues - for (BlockingQueueSet::iterator iter = blocking_queue_set_.begin(); - iter != blocking_queue_set_.end(); iter++) { - iter->get()->Push( - std::make_pair(std::string(LISTEN_TERMINATE_MESSAGE), nullptr)); - } - - exit(EXIT_SUCCESS); -} - -void SignalHandler::RegisterBlockingQueue(BlockingQueue &queue) { - blocking_queue_set_.insert(queue); + exit(0); } } // namespace operators diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 6f868369dc..87952cb0e6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -23,7 +23,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/detail/rpc_server.h" namespace paddle { namespace operators { @@ -31,7 +32,7 @@ namespace operators { constexpr char kOptimizeBlock[] = "OptimizeBlock"; constexpr char kPrefetchBlock[] = "PrefetchBlock"; -void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service); +void RunServer(std::shared_ptr<detail::RPCServer> service); class ListenAndServOp : public framework::OperatorBase { public: @@ -52,41 +53,27 @@ class ListenAndServOp : public framework::OperatorBase { void SavePort() const; - void WaitServerReady(); - - int GetSelectedPort() { return selected_port_; } + int GetSelectedPort() { return rpc_service_->GetSelectedPort(); } void Stop() override; void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; - static void ResetPort() { selected_port_ = 0; } - protected: - mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_; + mutable std::shared_ptr<detail::RPCServer> rpc_service_; + mutable std::shared_ptr<detail::RequestHandler> request_send_handler_; + mutable std::shared_ptr<detail::RequestHandler> request_get_handler_; + mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_; + mutable std::shared_ptr<std::thread> server_thread_; - // FIXME(wuyi): it's static so that the operator can be cloned. 
- static std::atomic_int selected_port_; }; class SignalHandler { - public: - typedef std::shared_ptr<detail::ReceivedQueue> BlockingQueue; - typedef std::unordered_set<BlockingQueue> BlockingQueueSet; - public: static void StopAndExit(int signal_num); - static void RegisterBlockingQueue(BlockingQueue&); - - static inline bool IsProgramExit() { return program_exit_flag_; } - private: - static bool program_exit_flag_; - - static BlockingQueueSet blocking_queue_set_; - DISABLE_COPY_AND_ASSIGN(SignalHandler); }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 2c77ee2e27..bcd8e81609 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -46,6 +46,8 @@ class SendBarrierOp : public framework::OperatorBase { auto rpc_client = detail::RPCClient::GetInstance(); + VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; + // need to wait before sending send_barrier message PADDLE_ENFORCE(rpc_client->Wait()); if (sync_mode) { diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index 719f039a0f..a845ba2eb0 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -35,42 +37,44 @@ namespace m = paddle::operators::math; namespace detail = paddle::operators::detail; namespace string = paddle::string; -std::unique_ptr<detail::AsyncGRPCServer> rpc_service; +std::unique_ptr<detail::AsyncGRPCServer> g_rpc_service; +std::unique_ptr<detail::RequestHandler> g_req_handler; -void StartServer(std::atomic<bool>* initialized) { +void StartServer() { f::Scope scope; p::CPUPlace place; scope.Var(NCCL_ID_VARNAME); p::DeviceContextPool& pool = p::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(p::CPUPlace()); - rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", true)); - f::ProgramDesc empty_program; f::Executor executor(dev_ctx.GetPlace()); - rpc_service->SetScope(&scope); - rpc_service->SetDevCtx(&dev_ctx); - rpc_service->SetProgram(&empty_program); - rpc_service->SetExecutor(&executor); + g_req_handler->SetScope(&scope); + g_req_handler->SetDevCtx(&dev_ctx); + g_req_handler->SetProgram(&empty_program); + g_req_handler->SetExecutor(&executor); + + g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get()); + g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, rpc_service.get())); - *initialized = true; - rpc_service->SetCond(0); - auto recv = rpc_service->Get(); + std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + + g_rpc_service->SetCond(detail::kRequestSend); + std::cout << "before WaitFanInOfSend" << std::endl; + g_rpc_service->WaitBarrier(detail::kRequestSend); + LOG(INFO) << "got nccl id and stop server..."; - rpc_service->ShutDown(); + g_rpc_service->ShutDown(); server_thread.join(); } -TEST(SendNcclId, DISABLED_Normal) { - std::atomic<bool> initialized{false}; - std::thread 
server_thread(StartServer, &initialized); - while (!initialized) { - } - // wait server to start - // sleep(2); - rpc_service->WaitServerReady(); +TEST(SendNcclId, GrpcServer) { + g_req_handler.reset(new detail::RequestSendHandler(true)); + g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + + std::thread server_thread(StartServer); + g_rpc_service->WaitServerReady(); f::Scope scope; p::CPUPlace place; @@ -78,17 +82,20 @@ TEST(SendNcclId, DISABLED_Normal) { auto& dev_ctx = *pool.Get(p::CPUPlace()); auto var = scope.Var(NCCL_ID_VARNAME); - // var->SetType(f::proto::VarType_Type_RAW); auto id = var->GetMutable<ncclUniqueId>(); p::dynload::ncclGetUniqueId(id); - int port = rpc_service->GetSelectedPort(); + int port = g_rpc_service->GetSelectedPort(); + std::string ep = string::Sprintf("127.0.0.1:%d", port); detail::RPCClient client; - + LOG(INFO) << "connect to server" << ep; client.AsyncSendVariable(ep, dev_ctx, scope, NCCL_ID_VARNAME); client.Wait(); + client.AsyncSendBatchBarrier(ep); + client.Wait(); + server_thread.join(); - auto* ptr = rpc_service.release(); - delete ptr; + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 09367889a9..6f8e3f22db 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -15,6 +15,7 @@ #pragma once #include <stdio.h> +#include <string> #include <thread> // NOLINT #include <typeindex> #include <vector> From e6bb67a3013e82c2bd003db85e01eff9715a4bf0 Mon Sep 17 00:00:00 2001 From: chengduoZH <zhaochengduo@163.com> Date: Fri, 1 Jun 2018 09:58:26 +0800 Subject: [PATCH 13/24] add authors --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 4ee0542098..11f227be71 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -4,6 +4,7 @@ | backyes | Yan-Fei Wang | | baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | +| ChengduoZH | Cheng-Duo Zhao| | chengxiaohua1105 | Xiao-Hua Cheng | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxysteven | Xing-Yi Cheng | From 04ccbed5b8539bd3fb97df5169ff9103edac3d60 Mon Sep 17 00:00:00 2001 From: fengjiayi <fengjiayi@baidu.com> Date: Fri, 1 Jun 2018 10:04:59 +0800 Subject: [PATCH 14/24] fix a compile error --- paddle/fluid/inference/tensorrt/convert/ut_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 37fcb5c503..dd481fa234 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -101,7 +101,7 @@ class TRTConvertValidation { engine_->FreezeNetwork(); // Declare outputs. - op_desc_.reset(new framework::OpDesc(desc, nullptr, nullptr)); + op_desc_.reset(new framework::OpDesc(desc, nullptr)); // Set Inputs. for (const auto& input : op_desc_->InputArgumentNames()) { From 86efecb93c988119ce4dabbbb38bd3cd095622f9 Mon Sep 17 00:00:00 2001 From: Lei Wang <bestwanglei@gmail.com> Date: Thu, 31 May 2018 19:15:36 -0700 Subject: [PATCH 15/24] Build: add dependencies for test_paddle_inference_api_impl. (#11064) * Build: add test_word2vec test_image_classification as dependencies of test_paddle_inference_api_impl. * Fix build error when WITH_TESTING is OFF. 
--- paddle/contrib/inference/CMakeLists.txt | 37 ++++++++++--------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 3beb93c4e7..6847f7db7f 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -17,32 +17,21 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) -function(inference_api_test TARGET_NAME TEST_SRC) +function(inference_api_test TARGET_NAME) set(options "") set(oneValueArgs "") set(multiValueArgs ARGS) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) - set(arg_list "") + cc_test(test_paddle_inference_${TARGET_NAME} + SRCS test_paddle_inference_${TARGET_NAME}.cc + DEPS paddle_fluid_api paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) if(inference_test_ARGS) - foreach(arg ${inference_test_ARGS}) - list(APPEND arg_list "_${arg}") - endforeach() - else() - list(APPEND arg_list "_") + set_tests_properties(test_paddle_inference_${TARGET_NAME} + PROPERTIES DEPENDS "${inference_test_ARGS}") endif() - foreach(arg ${arg_list}) - string(REGEX REPLACE "^_$" "" arg "${arg}") - cc_test(${TARGET_NAME} - SRCS ${TEST_SRC} - DEPS paddle_fluid_api paddle_inference_api - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - # TODO(panyx0178): Figure out how to add word2vec and image_classification - # as deps. - # set_tests_properties(${TARGET_NAME} - # PROPERTIES DEPENDS ${DEP_TEST}) - endforeach() endfunction(inference_api_test) @@ -50,9 +39,11 @@ cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -cc_test(test_paddle_inference_api - SRCS test_paddle_inference_api.cc - DEPS paddle_inference_api) +if(WITH_TESTING) + cc_test(test_paddle_inference_api + SRCS test_paddle_inference_api.cc + DEPS paddle_inference_api) -inference_api_test(test_paddle_inference_api_impl - test_paddle_inference_api_impl.cc) + inference_api_test(api_impl + ARGS test_word2vec test_image_classification) +endif() From 31f0533c5ddce9d3db8dbabb8a581f3694f0a7e1 Mon Sep 17 00:00:00 2001 From: fengjiayi <fengjiayi@baidu.com> Date: Fri, 1 Jun 2018 10:54:19 +0800 Subject: [PATCH 16/24] fix compile errors --- paddle/fluid/inference/tensorrt/convert/activation_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/mul_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 6297051e5a..79d01b640a 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -24,7 +24,7 @@ class ReluOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
- framework::OpDesc op_desc(op, nullptr, nullptr); + framework::OpDesc op_desc(op, nullptr); LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose " "type is Relu"; const nvinfer1::ITensor* input_tensor = diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index ed09f54bde..aa8e66490f 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -27,7 +27,7 @@ class MulOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op) override { VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr, nullptr); + framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); From 85c203b117797d9bf67a96b47006d3473862fcb5 Mon Sep 17 00:00:00 2001 From: whs <wanghaoshuang@baidu.com> Date: Fri, 1 Jun 2018 13:14:23 +0800 Subject: [PATCH 17/24] Make bilinear_interp_op support attrs from input. (#11041) * Make bilinear_interp_op support attrs from input. * Fix python api. --- paddle/fluid/operators/bilinear_interp_op.cc | 23 ++++++++++++ paddle/fluid/operators/bilinear_interp_op.cu | 25 ++++++++++++- paddle/fluid/operators/bilinear_interp_op.h | 22 +++++++++-- python/paddle/fluid/layers/nn.py | 19 +++++++--- .../unittests/test_bilinear_interp_op.py | 37 +++++++++++++++++-- 5 files changed, 111 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index d46fda54e7..3321adf274 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -34,9 +34,22 @@ class BilinearInterpOp : public framework::OperatorWithKernel { int out_w = ctx->Attrs().Get<int>("out_w"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); + if (ctx->HasInput("OutSize")) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, + "OutSize's dimension size must be 1"); + PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2"); + } std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w}); ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace()); + } }; class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { @@ -45,6 +58,10 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of (N x C x h x w)"); + AddInput("OutSize", + "(Tensor) This is a 1-D tensor with two number. 
" + "The first number is height and the second number is width.") + .AsDispensable(); AddOutput("Out", "(Tensor) The dimension of output is (N x C x out_h x out_w]"); @@ -78,6 +95,12 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), dim_x); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu index 510190f1aa..4c19715384 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cu +++ b/paddle/fluid/operators/bilinear_interp_op.cu @@ -102,10 +102,21 @@ class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> { auto* input_t = ctx.Input<Tensor>("X"); // float tensor auto* output_t = ctx.Output<Tensor>("Out"); // float tensor auto* input = input_t->data<T>(); - auto* output = output_t->mutable_data<T>(ctx.GetPlace()); int out_h = ctx.Attr<int>("out_h"); int out_w = ctx.Attr<int>("out_w"); + auto out_dims = output_t->dims(); + auto out_size_t = ctx.Input<Tensor>("OutSize"); + if (out_size_t != nullptr) { + Tensor sizes; + framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes); + auto size_data = sizes.data<int>(); + out_h = size_data[0]; + out_w = size_data[1]; + } + auto* output = output_t->mutable_data<T>( + {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace()); + int batch_size = input_t->dims()[0]; int channels = input_t->dims()[1]; int in_h = input_t->dims()[2]; @@ -139,8 +150,8 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> { void Compute(const framework::ExecutionContext& ctx) const override { auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X")); auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out")); - auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace()); auto* d_output = d_output_t->data<T>(); + auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace()); auto& device_ctx = ctx.template device_context<platform::CUDADeviceContext>(); @@ -149,6 +160,16 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> { int out_h = ctx.Attr<int>("out_h"); int out_w = ctx.Attr<int>("out_w"); + + auto out_size_t = ctx.Input<Tensor>("OutSize"); + if (out_size_t != nullptr) { + Tensor sizes; + framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes); + auto size_data = sizes.data<int>(); + out_h = size_data[0]; + out_w = size_data[1]; + } + int batch_size = d_input_t->dims()[0]; int channels = d_input_t->dims()[1]; int in_h = d_input_t->dims()[2]; diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h index f6cd77e4d4..8b03cd5a06 100644 --- a/paddle/fluid/operators/bilinear_interp_op.h +++ b/paddle/fluid/operators/bilinear_interp_op.h @@ -24,11 +24,18 @@ class BilinearInterpKernel : public framework::OpKernel<T> { void Compute(const framework::ExecutionContext& ctx) const override { auto* input_t = ctx.Input<Tensor>("X"); // float tensor auto* output_t = ctx.Output<Tensor>("Out"); // float tensor + auto out_dims = output_t->dims(); auto* input = input_t->data<T>(); - auto* output = output_t->mutable_data<T>(ctx.GetPlace()); - int out_h = ctx.Attr<int>("out_h"); int out_w = ctx.Attr<int>("out_w"); + auto out_size_t = ctx.Input<Tensor>("OutSize"); + if (out_size_t != nullptr) { + auto 
out_size_data = out_size_t->data<int>(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + auto* output = output_t->mutable_data<T>( + {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace()); int batch_size = input_t->dims()[0]; int channels = input_t->dims()[1]; int in_h = input_t->dims()[2]; @@ -83,9 +90,8 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { void Compute(const framework::ExecutionContext& ctx) const override { auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X")); auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out")); - auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace()); auto* d_output = d_output_t->data<T>(); - + auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace()); auto& device_ctx = ctx.template device_context<platform::CPUDeviceContext>(); math::SetConstant<platform::CPUDeviceContext, T> zero; @@ -93,6 +99,14 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { int out_h = ctx.Attr<int>("out_h"); int out_w = ctx.Attr<int>("out_w"); + + auto out_size_t = ctx.Input<Tensor>("OutSize"); + if (out_size_t != nullptr) { + auto out_size_data = out_size_t->data<int>(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + int batch_size = d_input_t->dims()[0]; int channels = d_input_t->dims()[1]; int in_h = d_input_t->dims()[2]; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 63ec831514..cb87653c47 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3944,7 +3944,7 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): input (Variable): The input tensor of bilinear interpolation, This is a 4-D tensor of the shape (num_batches, channels, in_h, in_w). - out_shape(list|tuple|None): Output shape of bilinear interpolation + out_shape(list|tuple|Variable|None): Output shape of bilinear interpolation layer, the shape is (out_h, out_w). Default: None scale(int|None): The multiplier for the input height or width. 
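Note: a minimal usage sketch (not part of the patch) of the two `out_shape` forms this change is designed to accept. A list/tuple is folded into the static `out_h`/`out_w` attributes, while a `Variable` is wired to the new dispensable `OutSize` input so the target size can be decided at runtime. The type check added below compares `out_shape is not Variable`, which reads as if an `isinstance(out_shape, Variable)` check were intended.

```python
import paddle.fluid as fluid

# Variable names here are illustrative; only upsampling_bilinear2d comes from this patch.
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')

# 1) Static target size: out_h/out_w become attributes of the bilinear_interp op.
up_static = fluid.layers.upsampling_bilinear2d(data, out_shape=[64, 64])

# 2) Runtime target size: an int32 tensor of shape [2] feeds the OutSize input.
target_size = fluid.layers.data(
    name='target_size', shape=[2], dtype='int32', append_batch_size=False)
up_dynamic = fluid.layers.upsampling_bilinear2d(data, out_shape=target_size)
```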
@@ -3971,13 +3971,20 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): def _is_list_or_turple_(data): return (isinstance(data, list) or isinstance(data, tuple)) + out_h = 0 + out_w = 0 + inputs = {"X": input} if out_shape is not None: - if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2): + if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2) and ( + out_shape is not Variable): raise ValueError('out_shape should be a list or tuple ', 'with length 2, (out_h, out_w).') - out_shape = list(map(int, out_shape)) - out_h = out_shape[0] - out_w = out_shape[1] + if _is_list_or_turple_(out_shape): + out_shape = list(map(int, out_shape)) + out_h = out_shape[0] + out_w = out_shape[1] + else: + inputs['OutSize'] = out_shape else: out_h = int(input.shape[2] * scale) out_w = int(input.shape[3] * scale) @@ -3985,7 +3992,7 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): out = helper.create_tmp_variable(dtype) helper.append_op( type="bilinear_interp", - inputs={"X": input}, + inputs=inputs, outputs={"Out": out}, attrs={"out_h": out_h, "out_w": out_w}) diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index bffb4f3b66..87c11e7880 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -17,7 +17,10 @@ import numpy as np from op_test import OpTest -def bilinear_interp_np(input, out_h, out_w): +def bilinear_interp_np(input, out_h, out_w, out_size): + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] batch_size, channel, in_h, in_w = input.shape if out_h > 1: ratio_h = (in_h - 1.0) / (out_h - 1.0) @@ -49,12 +52,15 @@ def bilinear_interp_np(input, out_h, out_w): class TestBilinearInterpOp(OpTest): def setUp(self): + self.out_size = None self.init_test_case() self.op_type = "bilinear_interp" input_np = np.random.random(self.input_shape).astype("float32") - output_np = bilinear_interp_np(input_np, self.out_h, self.out_w) - + output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, + self.out_size) self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size self.attrs = {'out_h': self.out_h, 'out_w': self.out_w} self.outputs = {'Out': output_np} @@ -68,6 +74,7 @@ class TestBilinearInterpOp(OpTest): self.input_shape = [2, 3, 4, 4] self.out_h = 2 self.out_w = 2 + self.out_size = np.array([3, 3]).astype("int32") class TestCase1(TestBilinearInterpOp): @@ -91,5 +98,29 @@ class TestCase3(TestBilinearInterpOp): self.out_w = 128 +class TestCase4(TestBilinearInterpOp): + def init_test_case(self): + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.out_size = np.array([2, 2]).astype("int32") + + +class TestCase5(TestBilinearInterpOp): + def init_test_case(self): + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.out_size = np.array([11, 11]).astype("int32") + + +class TestCase6(TestBilinearInterpOp): + def init_test_case(self): + self.input_shape = [1, 1, 128, 64] + self.out_h = 64 + self.out_w = 128 + self.out_size = np.array([65, 129]).astype("int32") + + if __name__ == "__main__": unittest.main() From ed365919b409749d903a2a5b4fbc4d3b00bb6f7c Mon Sep 17 00:00:00 2001 From: Wu Yi <typhoonzero1986@gmail.com> Date: Fri, 1 Jun 2018 14:46:19 +0800 Subject: [PATCH 18/24] Add fluid benchmark Dockerfile (#11095) * add fluid benchmark Dockerfile * 
add_fluid_benchmark_dockerfile --- benchmark/fluid/Dockerfile | 22 ++++++++++++++++++++++ benchmark/fluid/README.md | 16 +++++++++++++++- benchmark/fluid/run.sh | 26 ++++++++++++++------------ 3 files changed, 51 insertions(+), 13 deletions(-) create mode 100644 benchmark/fluid/Dockerfile diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile new file mode 100644 index 0000000000..46140a9d1b --- /dev/null +++ b/benchmark/fluid/Dockerfile @@ -0,0 +1,22 @@ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 +RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop +RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so +RUN pip install -U pip +RUN pip install -U kubernetes opencv-python paddlepaddle + +# IMPORTANT: +# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. + +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' +RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python' +RUN pip uninstall -y paddlepaddle && mkdir /workspace + +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin +ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root + +ADD *.whl / +RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s + +ENV LD_LIBRARY_PATH=/usr/local/lib +ADD fluid_benchmark.py dataset.py models/ /workspace/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 7071e9fdcd..1b0c7dce8b 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -44,11 +44,25 @@ Currently supported `--model` argument include: ## Run Distributed Benchmark on Kubernetes Cluster +You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will +have to start all those processes mannually on each node, which is not recommended. + +To build the Docker image, you need to choose a paddle "whl" package to run with, you may either +download it from +http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or +build it by your own. Once you've got the "whl" package, put it under the current directory and run: + +```bash +docker build -t [your docker image name]:[your docker image tag] . +``` + +Then push the image to a Docker registry that your Kubernetes cluster can reach. + We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit distributed benchmark jobs to your cluster. 
To generate a job yaml, just run: ```bash -python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver +python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver ``` Then the yaml files are generated under directory `myjob`, you can run: diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh index f6dfd20bf2..afaab5f4de 100644 --- a/benchmark/fluid/run.sh +++ b/benchmark/fluid/run.sh @@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \ -l 1 & # mnist # mnist gpu mnist 128 -FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=mnist \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ @@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ # vgg16 # gpu cifar10 128 -FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=vgg16 \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ @@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ 2>&1 | tee -a vgg16_gpu_128.log # flowers gpu 128 -FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=vgg16 \ --device=GPU \ --batch_size=32 \ --data_set=flowers \ @@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ # resnet50 # resnet50 gpu cifar10 128 -FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=resnet50 \ --device=GPU \ --batch_size=128 \ --data_set=cifar10 \ - --model=resnet_cifar10 \ --skip_batch_num=5 \ --iterations=30 \ 2>&1 | tee -a resnet50_gpu_128.log # resnet50 gpu flowers 64 -FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=resnet50 \ --device=GPU \ --batch_size=64 \ --data_set=flowers \ - --model=resnet_imagenet \ --skip_batch_num=5 \ --iterations=30 \ 2>&1 | tee -a resnet50_gpu_flowers_64.log # lstm # lstm gpu imdb 32 # tensorflow only support batch=32 -FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=stacked_dynamic_lstm \ --device=GPU \ --batch_size=32 \ --skip_batch_num=5 \ --iterations=30 \ - --hidden_dim=512 \ - --emb_dim=512 \ - --crop_size=1500 \ 2>&1 | tee -a lstm_gpu_32.log # seq2seq # seq2seq gpu wmb 128 -FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \ +FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ + --model=machine_translation \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ From 28dc9ba3c14edb2b3d8389080ba3ea06f60684c2 Mon Sep 17 00:00:00 2001 From: whs <wanghaoshuang@baidu.com> Date: Fri, 1 Jun 2018 15:13:48 +0800 Subject: [PATCH 19/24] Add shape op to get the shape of variable. (#11048) * Add shape op to get the shape of variable. * Rename get_shape to shape. * Add checker for output and fix comments. 
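Note: a short usage sketch (not from the patch) of the new `shape` layer registered below; it assumes the autogenerated wrapper in `ops.py` accepts the op's single input positionally, as the other generated layers do.

```python
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3, 100, 100], dtype='float32')
x_shape = fluid.layers.shape(x)  # 1-D int64 tensor holding x's runtime dimensions

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'x': np.random.rand(2, 3, 100, 100).astype('float32')},
               fetch_list=[x_shape])
print(out)  # expected: [  2   3 100 100]
```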
--- paddle/fluid/operators/shape_op.cc | 54 +++++++++++++++++++ paddle/fluid/operators/shape_op.cu | 20 +++++++ paddle/fluid/operators/shape_op.h | 38 +++++++++++++ python/paddle/fluid/layers/ops.py | 1 + .../fluid/tests/unittests/test_shape_op.py | 47 ++++++++++++++++ 5 files changed, 160 insertions(+) create mode 100644 paddle/fluid/operators/shape_op.cc create mode 100644 paddle/fluid/operators/shape_op.cu create mode 100644 paddle/fluid/operators/shape_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_shape_op.py diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc new file mode 100644 index 0000000000..c75fce7959 --- /dev/null +++ b/paddle/fluid/operators/shape_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ShapeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input (Input) of get_shape op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output (Out) of get_shape op should not be null."); + auto in_dim = ctx->GetInputDim("Input"); + ctx->SetOutputDim("Out", {in_dim.size()}); + } +}; + +class ShapeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "(Tensor), The input tensor."); + AddOutput("Out", "(Tensor), The shape of input tensor."); + AddComment(R"DOC( +Shape Operator. +Get the shape of input tensor. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>, + ops::ShapeKernel<float>, ops::ShapeKernel<double>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu new file mode 100644 index 0000000000..7736a2a1e1 --- /dev/null +++ b/paddle/fluid/operators/shape_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/shape_op.h" + +REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>, + paddle::operators::ShapeKernel<int64_t>, + paddle::operators::ShapeKernel<float>, + paddle::operators::ShapeKernel<double>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h new file mode 100644 index 0000000000..3be86b66a5 --- /dev/null +++ b/paddle/fluid/operators/shape_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include <algorithm> +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template <typename T> +class ShapeKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_t = ctx.Input<Tensor>("Input"); + auto* out_t = ctx.Output<Tensor>("Out"); + auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace()); + auto in_dims = in_t->dims(); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index a9fe25744c..60f8cbbfa7 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -71,6 +71,7 @@ __all__ = [ 'cumsum', 'scatter', 'sum', + 'shape', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py new file mode 100644 index 0000000000..a62ee05007 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_shape_op.py @@ -0,0 +1,47 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestShapeOp(OpTest): + def setUp(self): + self.op_type = "shape" + self.config() + self.shape = [2, 3] + input = np.zeros(self.shape) + self.inputs = {'Input': input} + self.outputs = {'Out': np.array(self.shape)} + + def config(self): + self.shape = [2, 3] + + def test_check_output(self): + self.check_output() + + +class case1(TestShapeOp): + def config(self): + self.shape = [2] + + +class case2(TestShapeOp): + def config(self): + self.shape = [1, 2, 3] + + +if __name__ == '__main__': + unittest.main() From 86d8659c8de7e91c066935e723da29f31ffd6364 Mon Sep 17 00:00:00 2001 From: whs <wanghaoshuang@baidu.com> Date: Fri, 1 Jun 2018 15:14:08 +0800 Subject: [PATCH 20/24] Add python wrapper for gather op. (#11033) * Add python wrapper for gather op. * Add unitest for 'rank==1' and fix comments. * Fix comments. --- doc/fluid/api/layers.rst | 6 +++ paddle/fluid/operators/gather_op.cc | 1 - python/paddle/fluid/layers/nn.py | 51 ++++++++++++++++++- .../fluid/tests/unittests/test_gather_op.py | 15 +++++- 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index f53da4d194..dbb99d3c03 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1009,3 +1009,9 @@ ____ .. autofunction:: paddle.fluid.layers.upsampling_bilinear2d :noindex: +gather +____ + +.. autofunction:: paddle.fluid.layers.gather + :noindex: + diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index e21b572589..aa3e05b83b 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -33,7 +33,6 @@ class GatherOp : public framework::OperatorWithKernel { auto index_dims = ctx->GetInputDim("Index"); PADDLE_ENFORCE(index_dims.size() == 1); int batch_size = ctx->GetInputDim("Index")[0]; - PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0"); framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; ctx->SetOutputDim("Out", output_dims); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cb87653c47..56f5c6b4be 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -82,6 +82,7 @@ __all__ = [ 'roi_pool', 'dice_loss', 'upsampling_bilinear2d', + 'gather', 'random_crop', ] @@ -3889,7 +3890,6 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): def dice_loss(input, label, epsilon=0.00001): """ - **Dice loss Layer** Dice loss for comparing the similarity of two batch of data, usually is used for binary image segmentation i.e. labels are binary. The dice loss can be defined as below equation: @@ -3999,6 +3999,55 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): return out +def gather(input, index): + """ + Output is obtained by gathering entries of the outer-most dimension + of X indexed by `index` and concatenate them together. + + .. math:: + + Out = X[Index] + + + .. code-block:: text + + + Given: + + X = [[1, 2], + [3, 4], + [5, 6]] + + Index = [1, 2] + + Then: + + Out = [[3, 4], + [5, 6]] + + Args: + input (Variable): The source input with rank>=1. + index (Variable): The index input with rank=1. + + Returns: + output (Variable): The output is a tensor with the same rank as input. + + Examples: + .. 
code-block:: python + + output = fluid.layers.gather(x, index) + """ + helper = LayerHelper('gather', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="gather", + inputs={"X": input, + "Index": index}, + outputs={"Out": out}) + return out + + def random_crop(input, shape, seed=1): helper = LayerHelper("random_crop", **locals()) dtype = helper.input_dtype() diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 6fd043c27e..4ae9086480 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -20,8 +20,9 @@ from op_test import OpTest class TestGatherOp(OpTest): def setUp(self): self.op_type = "gather" - xnp = np.random.random((10, 20)).astype("float32") - self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")} + self.config() + xnp = np.random.random(self.x_shape).astype("float32") + self.inputs = {'X': xnp, 'Index': np.array(self.index).astype("int32")} self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} def test_check_output(self): @@ -30,6 +31,16 @@ class TestGatherOp(OpTest): def test_check_grad(self): self.check_grad(['X'], 'Out') + def config(self): + self.x_shape = (10, 20) + self.index = [1, 3, 5] + + +class TestCase1(TestGatherOp): + def config(self): + self.x_shape = (10) + self.index = [1, 3, 5] + if __name__ == "__main__": unittest.main() From 18d640255efb6807a360c29d6e1c672aa679818a Mon Sep 17 00:00:00 2001 From: Yan Chunwei <yanchunwei@outlook.com> Date: Fri, 1 Jun 2018 15:38:45 +0800 Subject: [PATCH 21/24] simplify inference api (#11104) --- .../contrib/inference/paddle_inference_api.h | 40 +++++++++++-------- .../inference/paddle_inference_api_impl.cc | 22 +++++----- .../test_paddle_inference_api_impl.cc | 1 - 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index b4c7f9bef4..5fe8399762 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -40,14 +40,23 @@ struct PaddleBuf { struct PaddleTensor { std::string name; // variable name. std::vector<int> shape; + // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; }; +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + // TODO(Superjomn) support following engines latter. + // kAnakin, // Use Anakin for inference. + // kTensorRT, // Use TensorRT for inference. + // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. + // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. +}; + /* * A simple Inference API for Paddle. Currently this API can be used by * non-sequence scenerios. - * TODO(Superjomn) Support another API for NLP-related usages. */ class PaddlePredictor { public: @@ -69,15 +78,6 @@ class PaddlePredictor { // Destroy the Predictor. virtual ~PaddlePredictor() {} - enum class EngineKind { - kNative = -1, // Use the native Fluid facility. - // TODO(Superjomn) support latter. - // kAnakin, // Use Anakin for inference. - // kTensorRT, // Use TensorRT for inference. - // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. - // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - }; - // The common configs for all the predictors. 
struct Config { std::string model_dir; // path to the model directory. @@ -86,18 +86,24 @@ class PaddlePredictor { }; struct NativeConfig : public PaddlePredictor::Config { + // GPU related fields. bool use_gpu{false}; - int device; - float fraction_of_gpu_memory; + int device{0}; + float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. + std::string prog_file; std::string param_file; - bool share_variables; }; -// A factory to help create difference predictor. -template < - typename ConfigT, - PaddlePredictor::EngineKind engine = PaddlePredictor::EngineKind::kNative> +// A factory to help create different predictors. +// +// FOR EXTENSION DEVELOPER: +// Different predictors are designated by config type and engine kind. Similar +// configs can be merged, but there shouldn't be a huge config containing +// different fields for more than one kind of predictors. +// +// Similarly, each engine kind should map to a unique predictor implementation. +template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative> std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index 989252f69e..99a64662d4 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -57,8 +57,7 @@ std::string num2str(T a) { bool NativePaddlePredictor::Init() { VLOG(3) << "Predictor::init()"; - // TODO(panyx0718): Should CPU vs GPU device be decided by id? - if (config_.device >= 0) { + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { place_ = paddle::platform::CPUPlace(); @@ -85,11 +84,13 @@ bool NativePaddlePredictor::Init() { } ctx_ = executor_->Prepare(*inference_program_, 0); - // Create variables - // TODO(panyx0718): Why need to test share_variables here? - if (config_.share_variables) { - executor_->CreateVariables(*inference_program_, scope_.get(), 0); - } + // Create temporary variables first, so that the first batch do not need to + // create variables in the runtime. This is the logics of the old inference + // API. + // TODO(Superjomn) this should be modified when `Clone` is valid for + // multi-thread application. + executor_->CreateVariables(*inference_program_, scope_.get(), 0); + // Get the feed_target_names and fetch_target_names feed_target_names_ = inference_program_->GetFeedTargetNames(); fetch_target_names_ = inference_program_->GetFetchTargetNames(); @@ -124,7 +125,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, scope_.get(), &feed_targets, &fetch_targets, - !config_.share_variables); + false /* don't create variable eatch time */); if (!GetFetch(fetchs, output_data)) { LOG(ERROR) << "fail to get fetchs"; return false; @@ -242,11 +243,14 @@ bool NativePaddlePredictor::GetFetch( template <> std::unique_ptr<PaddlePredictor> -CreatePaddlePredictor<NativeConfig, PaddlePredictor::EngineKind::kNative>( +CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>( const NativeConfig &config) { VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. 
GPU memeroy + PADDLE_ENFORCE( + config.fraction_of_gpu_memory > 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); std::vector<std::string> flags; if (config.fraction_of_gpu_memory >= 0.0f || config.fraction_of_gpu_memory <= 0.95f) { diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 5240fc2f20..07b17acd48 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -47,7 +47,6 @@ NativeConfig GetConfig() { config.fraction_of_gpu_memory = 0.15; config.use_gpu = true; config.device = 0; - config.share_variables = true; return config; } From 0c0c5df4cbed8a9c947fd2819640e9d402555ed1 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <yanchunwei@outlook.com> Date: Fri, 1 Jun 2018 15:39:30 +0800 Subject: [PATCH 22/24] feature/add TRT fc converter (#11043) --- .../inference/tensorrt/convert/CMakeLists.txt | 2 + .../inference/tensorrt/convert/conv2d_op.cc | 3 +- .../fluid/inference/tensorrt/convert/fc_op.cc | 119 ++++++++++++++++++ .../inference/tensorrt/convert/mul_op.cc | 5 +- .../inference/tensorrt/convert/op_converter.h | 41 ++++-- .../inference/tensorrt/convert/test_fc_op.cc | 46 +++++++ .../inference/tensorrt/convert/test_mul_op.cc | 4 +- .../tensorrt/convert/test_op_converter.cc | 7 +- .../inference/tensorrt/convert/ut_helper.h | 40 +++--- paddle/fluid/inference/tensorrt/engine.cc | 1 + paddle/fluid/inference/tensorrt/engine.h | 4 +- paddle/fluid/operators/tensorrt_engine_op.cc | 3 +- 12 files changed, 240 insertions(+), 35 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/fc_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_fc_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 5ada1d6312..23ca8bfac8 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 209936c3ba..668d344f1b 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -21,7 +21,8 @@ namespace tensorrt { class Conv2dOpConverter : public OpConverter { public: Conv2dOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; } diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc new file mode 100644 index 0000000000..bd05608d76 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Reorder the elements from istrides to ostrides, borrowed from TRT convert in +// tensorflow. +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318 +template <typename T> +void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, + T* odata, nvinfer1::DimsHW ostrides) { + for (int h = 0; h < shape.h(); ++h) { + for (int w = 0; w < shape.w(); ++w) { + odata[h * ostrides.h() + w * ostrides.w()] = + idata[h * ostrides.h() + w * ostrides.w()]; + } + } +} + +// Reorder the data layout from CK to KC. +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, + TensorRTEngine::Weight* oweights) { + int c = iweights.dims[0]; + int k = iweights.dims[1]; + oweights->dims.assign({k, c}); + nvinfer1::DimsHW istrides = {1, k}; + nvinfer1::DimsHW ostrides = {c, 1}; + Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides, + static_cast<float*>(const_cast<void*>(oweights->get().values)), + ostrides); +} + +/* + * FC converter convert a MUL op in Fluid to a FC layer in TRT. + */ +class FcOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope) override { + VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; + + framework::OpDesc op_desc(op, nullptr, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + // Declare inputs + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); + // This may trigger a GPU->CPU copy, because TRT's weight can only be + // assigned from CPU memory, that can't be avoided. 
+ auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace()); + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix + size_t n_output = Y_t->dims()[1]; + + framework::LoDTensor tmp; + tmp.Resize(Y_t->dims()); + memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(), + Y_t->dims()[0] * Y_t->dims()[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast<void*>(weight_data), + Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, + static_cast<void*>(tmp.data<float>()), + Y_t->memory_size() / sizeof(float)); + weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); + tmp_weight.dims = weight.dims; + + // The data layout of TRT FC layer's weight is different from fluid's FC, + // need to reorder the elements. + ReorderCKtoKC(tmp_weight, &weight); + + // Currently, the framework can only handle one fluid op -> one TRT layer, + // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just + // handle `mul`, leave `add` as another layer. + // DEBUG + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, + *const_cast<nvinfer1::ITensor*>(X), + n_output, weight.get(), bias.get()); + + auto output_name = op_desc.Output("Out").front(); + engine_->DeclareOutput(layer, 0, output_name); + } +}; + +REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index aa8e66490f..6bb07709c7 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -24,8 +24,9 @@ namespace tensorrt { class MulOpConverter : public OpConverter { public: MulOpConverter() {} - void operator()(const framework::proto::OpDesc& op) override { - VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias"; + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope) override { + VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 1cd3ed9a00..4d21e241c0 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -31,27 +31,42 @@ namespace tensorrt { class OpConverter { public: OpConverter() {} - virtual void operator()(const framework::proto::OpDesc& op) {} - void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) { - std::string type = op.type(); - auto* it = Registry<OpConverter>::Lookup(type); - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type); - it->SetEngine(engine); - (*it)(op); - } + // Converter logic for an op. + virtual void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope) {} + + // Convert a single fluid operaotr and add the corresponding layer to TRT. 
+ void ConvertOp(const framework::proto::OpDesc& op, + const std::unordered_set<std::string>& parameters, + const framework::Scope& scope, TensorRTEngine* engine) { + framework::OpDesc op_desc(op, nullptr, nullptr); + + OpConverter* it{nullptr}; - // convert fluid op to tensorrt layer - void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) { - OpConverter::Run(op, engine); + if (op_desc.Type() == "mul") { + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + it = Registry<OpConverter>::Lookup("fc"); + } + } + if (!it) { + it = Registry<OpConverter>::Lookup(op_desc.Type()); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + it->SetEngine(engine); + (*it)(op, scope); } // convert fluid block to tensorrt network void ConvertBlock(const framework::proto::BlockDesc& block, - TensorRTEngine* engine) { + const std::unordered_set<std::string>& parameters, + const framework::Scope& scope, TensorRTEngine* engine) { for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); - OpConverter::Run(op, engine); + ConvertOp(op, parameters, scope, engine); } } diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc new file mode 100644 index 0000000000..a30253072a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <gtest/gtest.h> +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(fc_op, test) { + std::unordered_set<std::string> parameters({"mul-Y"}); + framework::Scope scope; + TRTConvertValidation validator(20, parameters, scope, 1000); + + validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1)); + validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2)); + validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mul"); + desc.SetInput("X", {"mul-X"}); + desc.SetInput("Y", {"mul-Y"}); + desc.SetOutput("Out", {"mul-Out"}); + + validator.SetOp(*desc.Proto()); + + validator.Execute(10); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index d8b61d5f08..1ce1130e5d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -21,7 +21,9 @@ namespace inference { namespace tensorrt { TEST(MulOpConverter, main) { - TRTConvertValidation validator(10, 1000); + framework::Scope scope; + std::unordered_set<std::string> parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 9ae7de9cbf..1d3f5eabb2 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + #include <gtest/gtest.h> #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { namespace inference { @@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) { conv2d_op->SetType("conv2d"); OpConverter converter; - converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/); + framework::Scope scope; + converter.ConvertBlock(*block->Proto(), {}, scope, + nullptr /*TensorRTEngine*/); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 684bbc208f..d7e05dd5b5 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -61,7 +61,10 @@ class TRTConvertValidation { public: TRTConvertValidation() = delete; - explicit TRTConvertValidation(int batch_size, int workspace_size = 1024) { + TRTConvertValidation(int batch_size, + const std::unordered_set<std::string>& parameters, + framework::Scope& scope, int workspace_size = 1 << 10) + : parameters_(parameters), scope_(scope) { // create engine. 
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
     engine_->InitNetwork();
 
@@ -76,19 +79,22 @@ class TRTConvertValidation {
     engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
   }
 
+  // Declare a parameter variable in the scope.
+  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
   void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
     DeclVar(name, dims);
   }
 
+  // Declare a variable in a fluid Scope.
   void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
 
     // Init Fluid tensor.
-    std::vector<int> dim_vec(dims.nbDims);
-    for (int i = 0; i < dims.nbDims; i++) {
-      dim_vec[i] = dims.d[i];
-    }
+    std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -99,7 +105,7 @@ class TRTConvertValidation {
     op_ = framework::OpRegistry::CreateOp(desc);
 
     OpConverter op_converter;
-    op_converter.ConvertOp(desc, engine_.get());
+    op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
 
     engine_->FreezeNetwork();
 
@@ -108,11 +114,13 @@ class TRTConvertValidation {
 
     // Set Inputs.
     for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
       auto* var = scope_.FindVar(input);
       PADDLE_ENFORCE(var);
       auto tensor = var->GetMutable<framework::LoDTensor>();
+
       engine_->SetInputFromCPU(
-          input, static_cast<void*>(tensor->data<float>()),
+          input, static_cast<void*>(tensor->data<void>()),
           sizeof(float) * analysis::AccuDims(tensor->dims(),
                                              tensor->dims().size()));
     }
@@ -120,18 +128,21 @@ class TRTConvertValidation {
 
   void Execute(int batch_size) {
     // Execute Fluid Op
-    // Execute TRT
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
-    engine_->Execute(batch_size);
-    op_->Run(scope_, place);
+    // Execute TRT.
+    engine_->Execute(batch_size);
+    cudaStreamSynchronize(*engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
 
+    const size_t output_space_size = 200;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
-      std::vector<float> trt_out(200);
-      engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
+      std::vector<float> trt_out(output_space_size);
+      engine_->GetOutputInCPU(output, &trt_out[0],
+                              output_space_size * sizeof(float));
+      cudaStreamSynchronize(*engine_->stream());
 
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
@@ -139,7 +150,7 @@ class TRTConvertValidation {
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
       for (size_t i = 0; i < fluid_out.size(); i++) {
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001);
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
       }
     }
   }
 
@@ -149,9 +160,10 @@
  private:
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
-  framework::Scope scope_;
   std::unique_ptr<framework::OperatorBase> op_;
   std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+  framework::Scope& scope_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index a88236ae98..3d75fefc1a 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                     name);
 
   auto* output = layer->getOutput(offset);
+  SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
   infer_network_->markOutput(*output);
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index d9d3163b66..fabcfd9e80 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -37,13 +37,15 @@ class TensorRTEngine : public EngineBase {
   // Weight is model parameter.
   class Weight {
    public:
-    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
       w_.type = dtype;
       w_.values = value;
       w_.count = num_elem;
     }
     const nvinfer1::Weights& get() { return w_; }
 
+    std::vector<int64_t> dims;
+
    private:
     nvinfer1::Weights w_;
   };
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 83e768b4dc..855157e7c4 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -31,8 +31,9 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
   auto max_workspace = context.Attr<int>("max_workspace");
   engine_.reset(new inference::tensorrt::TensorRTEngine(
       max_batch_, max_workspace, nullptr));
+  // TODO(Superjomn) parameters should be passed after being analyzed from outside.
inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock( - block, engine_.get()); + block, {}, context.scope(), engine_.get()); engine_->FreezeNetwork(); } From 9503dbb173f76f7b68d4a6edc18ce31cf7865c30 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <yanchunwei@outlook.com> Date: Fri, 1 Jun 2018 17:14:18 +0800 Subject: [PATCH 23/24] fix compile error (#11119) --- paddle/fluid/inference/tensorrt/convert/fc_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/op_converter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index bd05608d76..45b0795597 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -59,7 +59,7 @@ class FcOpConverter : public OpConverter { const framework::Scope& scope) override { VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr, nullptr); + framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 4d21e241c0..3beafeefd0 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -40,7 +40,7 @@ class OpConverter { void ConvertOp(const framework::proto::OpDesc& op, const std::unordered_set<std::string>& parameters, const framework::Scope& scope, TensorRTEngine* engine) { - framework::OpDesc op_desc(op, nullptr, nullptr); + framework::OpDesc op_desc(op, nullptr); OpConverter* it{nullptr}; From 663f4e6168ef9852991dc1ccfea462307d19a5d0 Mon Sep 17 00:00:00 2001 From: baiyf <baiyfbupt@gmail.com> Date: Fri, 1 Jun 2018 19:30:59 +0800 Subject: [PATCH 24/24] Fix bilinear_op Python API (#11117) * fix conflict * code clean --- doc/fluid/api/layers.rst | 4 ++-- python/paddle/fluid/layers/nn.py | 21 +++++++++---------- .../fluid/tests/unittests/test_layers.py | 6 +++--- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index dbb99d3c03..5329adaa18 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1003,10 +1003,10 @@ dice_loss .. autofunction:: paddle.fluid.layers.dice_loss :noindex: -upsampling_bilinear2d +resize_bilinear ____ -.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d +.. autofunction:: paddle.fluid.layers.resize_bilinear :noindex: gather diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 56f5c6b4be..bd6ed0f30e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -81,7 +81,7 @@ __all__ = [ 'label_smooth', 'roi_pool', 'dice_loss', - 'upsampling_bilinear2d', + 'resize_bilinear', 'gather', 'random_crop', ] @@ -3929,9 +3929,9 @@ def dice_loss(input, label, epsilon=0.00001): return reduce_mean(dice_score) -def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): +def resize_bilinear(input, out_shape=None, scale=None, name=None): """ - The mathematical meaning of upsampling_bilinear2d is also called + The mathematical meaning of resize bilinear layer is Bilinear interpolation. Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. 
H-direction and @@ -3941,13 +3941,13 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): https://en.wikipedia.org/wiki/Bilinear_interpolation Args: - input (Variable): The input tensor of bilinear interpolation, + input (Variable): The input tensor of resize bilinear layer, This is a 4-D tensor of the shape (num_batches, channels, in_h, in_w). - out_shape(list|tuple|Variable|None): Output shape of bilinear interpolation + out_shape(list|tuple|Variable|None): Output shape of resize bilinear layer, the shape is (out_h, out_w). Default: None - scale(int|None): The multiplier for the input height or width. + scale(float|None): The multiplier for the input height or width. At least one of out_shape or scale must be set. And out_shape has a higher priority than scale. Default: None @@ -3961,7 +3961,7 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): Examples: .. code-block:: python - out = fluid.layers.bilinear_interp(input, out_shape=[12, 12]) + out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) """ if out_shape is None and scale is None: raise ValueError("One of out_shape and scale must not be None") @@ -3975,10 +3975,9 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): out_w = 0 inputs = {"X": input} if out_shape is not None: - if not (_is_list_or_turple_(out_shape) and len(out_shape) == 2) and ( - out_shape is not Variable): - raise ValueError('out_shape should be a list or tuple ', - 'with length 2, (out_h, out_w).') + if not (_is_list_or_turple_(out_shape) and + len(out_shape) == 2) and not isinstance(out_shape, Variable): + raise ValueError('out_shape should be a list or tuple or variable') if _is_list_or_turple_(out_shape): out_shape = list(map(int, out_shape)) out_h = out_shape[0] diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 60dc1f83fc..ca08fd7fc8 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -369,13 +369,13 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) - def test_upsampling_bilinear2d(self): + def test_resize_bilinear(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 9, 6], dtype="float32") - output = layers.upsampling_bilinear2d(x, out_shape=[12, 12]) + output = layers.resize_bilinear(x, out_shape=[12, 12]) self.assertIsNotNone(output) - output = layers.upsampling_bilinear2d(x, scale=3) + output = layers.resize_bilinear(x, scale=3) self.assertIsNotNone(output) print(str(program))
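
For reference, below is a minimal usage sketch of the renamed resize_bilinear layer. It mirrors the updated unit test in this patch; the Program/program_guard scaffolding and imports are the usual fluid test pattern and are assumed here, not introduced by the patch itself.

    # Sketch only: assumes a fluid build that includes this patch; the variable
    # names out_fixed/out_scaled are illustrative.
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers
    from paddle.fluid.framework import Program, program_guard

    program = Program()
    with program_guard(program):
        # 3-channel 9x6 input (batch dimension is implicit).
        x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
        # Explicit target size: out_shape is (out_h, out_w).
        out_fixed = layers.resize_bilinear(x, out_shape=[12, 12])
        # Scale factor on height and width; out_shape has priority if both are set.
        out_scaled = layers.resize_bilinear(x, scale=3)
    print(str(program))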