@@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
                                user_weights_memory_p, "@weights_mem_p",
-                               pipeline);
+                               pipeline, is_persistent);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
@@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
+    const bool is_test = ctx.Attr<bool>("is_test");
+
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -296,6 +299,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     int groups = ctx.Attr<int>("groups");
 
     // TODO(pzelazko-intel) add support for group convolution and dilation
@@ -348,11 +352,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
-                                     strides, paddings, mkldnn_engine);
+      conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
+                               paddings, mkldnn_engine, fuse_relu);
     } else {
       conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-                                     paddings, mkldnn_engine);
+                                     paddings, mkldnn_engine, fuse_relu);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -371,7 +376,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_memory_p =
         handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
     auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline);
+        user_weights_memory_p, pipeline, is_test);
     auto dst_memory_p =
         handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
@@ -402,11 +407,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 
  private:
+  mkldnn::primitive_attr AddRelu() const {
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    mkldnn::primitive_attr conv_attr;
+    constexpr float scale = 1.0f;
+    constexpr float negative_slope = 0.0f;
+    constexpr float placeholder = 0.0f;
+    mkldnn::post_ops post_operations;
+    post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                   negative_slope, placeholder);
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+
   std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& dst, const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
+                       const mkldnn::engine& engine,
+                       const bool fuse_relu) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -415,8 +435,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
+      conv_attr = AddRelu();
+    }
+
+    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
+        conv_desc, conv_attr, engine);
 
     return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
         p_conv_pd);
@@ -427,7 +452,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const memory::desc& bias, const memory::desc& dst,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
+                       const mkldnn::engine& engine,
+                       const bool fuse_relu) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -436,8 +462,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         bias, dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
+      conv_attr = AddRelu();
+    }
+
+    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
+        conv_desc, conv_attr, engine);
 
     return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
         p_conv_pd);
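
For reference, a minimal standalone sketch of the post-ops pattern this patch introduces, assuming the Intel MKL-DNN 0.x C++ API that this file targets. MakeReluAttr below is a hypothetical free-function rendition of the new AddRelu() member, not code from the patch itself.

#include <mkldnn.hpp>

// Hypothetical helper mirroring AddRelu(): build a primitive_attr whose
// post-ops append an eltwise ReLU applied to the convolution output.
mkldnn::primitive_attr MakeReluAttr() {
  constexpr float scale = 1.0f;           // no additional output scaling
  constexpr float negative_slope = 0.0f;  // plain ReLU, no leaky slope
  constexpr float placeholder = 0.0f;     // unused beta parameter
  mkldnn::post_ops post_operations;
  post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
                                 negative_slope, placeholder);
  mkldnn::primitive_attr conv_attr;
  conv_attr.set_post_ops(post_operations);
  return conv_attr;
}

// Usage: pass the attribute when constructing the fused primitive descriptor,
// as ConvFwdPrimitiveDesc() now does when fuse_relu is true, e.g.
//   mkldnn::convolution_forward::primitive_desc pd(conv_desc, MakeReluAttr(),
//                                                  engine);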