Merge branch 'develop' of github.com:PaddlePaddle/Paddle into parallel_bcast

7 years ago · b084dfab7e
parent 5ce1a960a5 08cfe27c63
commit b084dfab7e
17 changed files with 497 additions and 338 deletions
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@ -262,7 +262,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
  if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
    buffer.Resize(sizeof(T) * data.size());
  }
-  std::memcpy(buffer.data(), data.data(), buffer.length());
+  std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
  // copy LoD
  for (const auto &level : fetch.lod()) {
    output->lod.emplace_back(level);
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@ -117,34 +117,6 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({input_tensor});
 }

-void BenchAllData(const std::string &model_path, const std::string &data_file,
-                  const int batch_size, const int repeat) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  GetOneBatch(&input_slots, &data, batch_size);
-  for (int i = 0; i < FLAGS_burning; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  Timer timer;
-  double sum = 0;
-  for (int i = 0; i < repeat; i++) {
-    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      GetOneBatch(&input_slots, &data, batch_size);
-      timer.tic();
-      predictor->Run(input_slots, &outputs_slots);
-      sum += timer.toc();
-    }
-  }
-  PrintTime(batch_size, repeat, 1, 0, sum / repeat);
-}
-
 const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {

  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
    auto weights_pd = conv_pd_->weights_primitive_desc();
    return this->AcquireMemory(weights_pd, user_weights_pd,
                               user_weights_memory_p, "@weights_mem_p",
-                               pipeline);
+                               pipeline, is_persistent);
  }

  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");

+    const bool is_test = ctx.Attr<bool>("is_test");
+
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
@ -296,6 +299,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
    int groups = ctx.Attr<int>("groups");

    // TODO(pzelazko-intel) add support for group convolution and dilation
@ -348,11 +352,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      bias_tz = paddle::framework::vectorize2int(bias->dims());
      auto bias_md = platform::MKLDNNMemDesc(
          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
-                                     strides, paddings, mkldnn_engine);
+      conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
+                               paddings, mkldnn_engine, fuse_relu);
    } else {
      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-                                     paddings, mkldnn_engine);
+                                     paddings, mkldnn_engine, fuse_relu);
    }
    // Save conv_pd/src_memory/weights_memory for backward pass
    dev_ctx.SetBlob(key_conv_pd, conv_pd);
@ -371,7 +376,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto src_memory_p =
        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline);
+        user_weights_memory_p, pipeline, is_test);
    auto dst_memory_p =
        handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));

@ -402,11 +407,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  }

 private:
+  // Builds the primitive attributes used to fuse a ReLU into the convolution.
+  // Returns a primitive_attr whose post-ops list holds a single eltwise_relu
+  // entry; the ConvFwdPrimitiveDesc helpers pass it to the forward primitive
+  // descriptor when fuse_relu is set.
+  mkldnn::primitive_attr AddRelu() const {
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    mkldnn::primitive_attr conv_attr;
+    constexpr float scale = 1.0f;
+    // NOTE(review): presumably the slope applied to negative inputs, so 0.0f
+    // gives a plain ReLU -- confirm against the MKL-DNN append_eltwise docs.
+    constexpr float negative_slope = 0.0f;
+    // Fills the last append_eltwise argument; the name suggests it is not
+    // meaningful for eltwise_relu -- TODO confirm.
+    constexpr float placeholder = 0.0f;
+    mkldnn::post_ops post_operations;
+    post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                   negative_slope, placeholder);
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+
  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                       const memory::desc& dst, const std::vector<int>& strides,
                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
+                       const mkldnn::engine& engine,
+                       const bool fuse_relu) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};

@ -415,8 +435,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        dst, stride_dims, padding_dims, padding_dims,
        mkldnn::padding_kind::zero);

-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
+      conv_attr = AddRelu();
+    }
+
+    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
+        conv_desc, conv_attr, engine);

    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
        p_conv_pd);
@ -427,7 +452,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                       const memory::desc& bias, const memory::desc& dst,
                       const std::vector<int>& strides,
                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
+                       const mkldnn::engine& engine,
+                       const bool fuse_relu) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};

@ -436,8 +462,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        bias, dst, stride_dims, padding_dims, padding_dims,
        mkldnn::padding_kind::zero);

-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
+      conv_attr = AddRelu();
+    }
+
+    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
+        conv_desc, conv_attr, engine);

    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
        p_conv_pd);
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
 }

 void Conv2DOpMaker::Make() {
+  AddAttr<bool>("is_test", "").SetDefault(false);
  AddInput(
      "Input",
      "(Tensor) The input tensor of convolution operator. "
@ -161,6 +162,8 @@ void Conv2DOpMaker::Make() {
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@ -20,6 +20,7 @@ if(WITH_GRPC)
    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
+  cc_test(varhandle_test SRCS varhandle_test.cc)
  return()
 endif()

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);

 class BaseProcessor {
 public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
-    context_ = nullptr;
-  }
+  BaseProcessor() { context_ = nullptr; }

  virtual ~BaseProcessor() {}

-  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
+  virtual void Prepare(VarHandlePtr h, int64_t time_out) {
+    var_h_ = h;
+
    context_.reset(new grpc::ClientContext());
-    var_h_ = var_info;
    context_->set_wait_for_ready(true);
    if (time_out) {
      std::chrono::system_clock::time_point deadline =
@ -71,21 +70,21 @@ class BaseProcessor {
    }
  }

-  virtual void Prepare(int64_t time_out) {
-    context_.reset(new grpc::ClientContext());
-    context_->set_wait_for_ready(true);
-
-    std::chrono::system_clock::time_point deadline =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
-
-    context_->set_deadline(deadline);
+  void Process() {
+    ProcessImpl();
+    var_h_->Finish(true);
  }

-  virtual void Process() = 0;
+  VarHandlePtr GetVarHandlePtr() { return var_h_; }
+  bool Wait() { return var_h_->Wait(); }
+  void Finish(bool ok) { return var_h_->Finish(ok); }
+  virtual void ProcessImpl() = 0;

  std::unique_ptr<grpc::ClientContext> context_;
  grpc::Status status_;
-  VarHandle var_h_;
+
+ protected:
+  VarHandlePtr var_h_;
 };

 typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
@ -94,13 +93,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
 class SendProcessor : public BaseProcessor {
 public:
  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch), stub_g_(ch) {}
+      : BaseProcessor(), stub_g_(ch) {}

  virtual ~SendProcessor() {}

-  virtual void Process() {
+  void ProcessImpl() override {
    if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
+      response_call_back_(*var_h_.get(), reply_);
    }
  }

@ -115,13 +114,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
 class GetProcessor : public BaseProcessor {
 public:
  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch), stub_g_(ch) {}
+      : BaseProcessor(), stub_g_(ch) {}

  virtual ~GetProcessor() {}

-  virtual void Process() {
+  void ProcessImpl() override {
    if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
+      response_call_back_(*var_h_.get(), reply_);
    }
  }

@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor {
 class BatchBarrierProcessor : public BaseProcessor {
 public:
  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {
    stub_ = sendrecv::SendRecvService::NewStub(ch);
  }

  virtual ~BatchBarrierProcessor() {}

-  virtual void Process() {}
+  void ProcessImpl() override {}
  sendrecv::VoidMessage reply_;
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor {
 class FetchBarrierProcessor : public BaseProcessor {
 public:
  explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {
    stub_ = sendrecv::SendRecvService::NewStub(ch);
  }

  virtual ~FetchBarrierProcessor() {}

-  virtual void Process() {}
+  void ProcessImpl() override {}
  sendrecv::VariableMessage reply_;
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor {
 class CheckpointNotifyProcessor : public BaseProcessor {
 public:
  explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {
    stub_ = sendrecv::SendRecvService::NewStub(ch);
  }

  virtual ~CheckpointNotifyProcessor() {}

-  virtual void Process() {}
+  void ProcessImpl() override {}
  sendrecv::VoidMessage reply_;
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@ -177,32 +176,37 @@ class GRPCClient : public RPCClient {
  GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
  virtual ~GRPCClient();

-  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
-                    const framework::Scope& scope, const std::string& var_name,
+  VarHandlePtr AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;

-  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
-                   const framework::Scope& scope, const std::string& var_name,
+  VarHandlePtr AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
                           int64_t time_out = FLAGS_rpc_deadline) override;

-  bool AsyncPrefetchVar(const std::string& ep,
+  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
                                int64_t time_out = FLAGS_rpc_deadline) override;

-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

-  void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
+  VarHandlePtr AsyncCheckpointNotify(
+      const std::string& ep, const std::string& dir,
      int64_t time_out = FLAGS_rpc_deadline) override;

-  void AsyncSendComplete(const std::string& ep,
-                         int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendComplete(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

  bool Wait() override;

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/macros.h"

 namespace paddle {
 namespace operators {
@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";

 class RPCServer;

-struct VarHandle {
-  // RPC endpoint.
-  std::string ep;
-  const platform::DeviceContext* ctx;
-  const framework::Scope* scope;
-  // Variable name.
-  std::string name;
-  // RPC method name.
-  std::string method;
+// Handle for one asynchronous RPC request. It records which endpoint, method
+// and variable the request refers to, and lets a waiter block in Wait() until
+// another thread reports the outcome through Finish().
+class VarHandle {
+ public:
+  // ep: RPC endpoint; method: RPC method name; name: variable name.
+  // p_ctx and p_scope are stored as given (they may be nullptr) and are never
+  // deleted by this class.
+  VarHandle(const std::string& ep, const std::string& method,
+            const std::string& name,
+            const platform::DeviceContext* p_ctx = nullptr,
+            const framework::Scope* p_scope = nullptr)
+      : ep_(ep),
+        ctx_(p_ctx),
+        scope_(p_scope),
+        name_(name),
+        method_(method),
+        ok_(kVarHandleDefaultState) {}
+
+  virtual ~VarHandle() {}
+
+ public:
+  // Blocks until Finish() has been called, then returns true iff the value
+  // passed to Finish() was true.
+  bool Wait() {
+    int ret = kVarHandleDefaultState;
+    {
+      std::unique_lock<std::mutex> lk(sync_mutex_);
+      wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; });
+      // Snapshot the result while the lock is still held so the reads below
+      // cannot race with a concurrent Finish().
+      ret = ok_;
+    }
+    VLOG(7) << "VarHandle wait:" << ret;
+    return ret != 0;
+  }
+
+  // Records the outcome of the request and wakes every thread blocked in
+  // Wait().
+  void Finish(bool ok) {
+    {
+      std::unique_lock<std::mutex> lk(sync_mutex_);
+      ok_ = ok;
+    }
+    VLOG(7) << "VarHandle finish:" << ok;
+    wait_cond_.notify_all();
+  }

  std::string String() const {
    std::ostringstream s;
-    s << method << " name:[" << name << "], ep:[" << ep << "]";
+    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
+      << "]";
    return s.str();
  }
+
+  std::string ep() const { return ep_; }
+  const platform::DeviceContext* ctx() const { return ctx_; }
+  const framework::Scope* scope() const { return scope_; }
+  std::string name() const { return name_; }
+  std::string method() const { return method_; }
+
+ protected:
+  // RPC endpoint.
+  std::string ep_;
+  const platform::DeviceContext* ctx_;
+  const framework::Scope* scope_;
+  // Variable name.
+  std::string name_;
+  // RPC method name.
+  std::string method_;
+
+ protected:
+  // sync_mutex_ guards ok_; wait_cond_ is signalled by Finish().
+  std::mutex sync_mutex_;
+  std::condition_variable wait_cond_;
+  // kVarHandleDefaultState until Finish() is called, afterwards 0 or 1.
+  int ok_;
+
+  static const int kVarHandleDefaultState = -1;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(VarHandle);
 };

+typedef std::shared_ptr<VarHandle> VarHandlePtr;
+
 class RequestHandler {
 public:
  explicit RequestHandler(bool sync_mode)
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@ -14,12 +14,14 @@

 #pragma once

+#include <condition_variable>  // NOLINT
 #include <string>
 #include "gflags/gflags.h"

 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"

 DECLARE_int32(rpc_deadline);

@ -31,37 +33,36 @@ class RPCClient {
 public:
  RPCClient() {}
  virtual ~RPCClient() {}
-  virtual bool AsyncSendVar(const std::string& ep,
+  virtual VarHandlePtr AsyncSendVar(const std::string& ep,
                                    const platform::DeviceContext& ctx,
                                    const framework::Scope& scope,
                                    const std::string& var_name,
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;

-  virtual bool AsyncGetVar(const std::string& ep,
+  virtual VarHandlePtr AsyncGetVar(const std::string& ep,
                                   const platform::DeviceContext& ctx,
                                   const framework::Scope& scope,
                                   const std::string& var_name,
                                   int64_t time_out = FLAGS_rpc_deadline) = 0;

-  virtual bool AsyncPrefetchVar(const std::string& ep,
-                                const platform::DeviceContext& ctx,
-                                const framework::Scope& scope,
-                                const std::string& in_var_name,
+  virtual VarHandlePtr AsyncPrefetchVar(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& in_var_name,
      const std::string& out_var_name,
      int64_t time_out = FLAGS_rpc_deadline) = 0;

-  virtual void AsyncSendBatchBarrier(const std::string& ep,
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+  virtual VarHandlePtr AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;

-  virtual void AsyncSendFetchBarrier(const std::string& ep,
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+  virtual VarHandlePtr AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;

-  virtual void AsyncCheckpointNotify(const std::string& ep,
-                                     const std::string& dir,
+  virtual VarHandlePtr AsyncCheckpointNotify(
+      const std::string& ep, const std::string& dir,
      int64_t time_out = FLAGS_rpc_deadline) = 0;

-  virtual void AsyncSendComplete(const std::string& ep,
-                                 int64_t time_out = FLAGS_rpc_deadline) = 0;
+  virtual VarHandlePtr AsyncSendComplete(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;

  // Complete tells all the pserver instances that this trainer has finished
  // training, so the pserver can reduce its barrier count and continue to train
--- a/paddle/fluid/operators/distributed/varhandle_test.cc
+++ b/paddle/fluid/operators/distributed/varhandle_test.cc
@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+using paddle::operators::distributed::VarHandlePtr;
+using paddle::operators::distributed::VarHandle;
+
+// Thread body: blocks on the handle and expects Wait() to return true.
+void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); }
+
+// Thread body: blocks on the handle and expects Wait() to return false.
+void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); }
+
+// Checks that Wait() on a handle returns the value later passed to Finish():
+// the first six handles are finished with false, the last six with true.
+TEST(VarHandle, Run) {
+  std::vector<VarHandlePtr> a;
+  for (int i = 0; i < 12; i++) {
+    VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr));
+    a.push_back(s);
+  }
+
+  std::vector<std::unique_ptr<std::thread>> t;
+  // One waiter thread per handle a[0..5]; each expects Wait() to return false.
+  for (int i = 0; i < 6; i++) {
+    t.emplace_back(new std::thread(WaitFalse, a[i]));
+  }
+
+  // Finish each handle, then join its waiter so the expectation has run.
+  for (int i = 0; i < 6; i++) {
+    a[i]->Finish(false);
+    t[i]->join();
+  }
+
+  // These threads are appended to t, so they occupy t[6..11] and line up
+  // with handles a[6..11].
+  for (int i = 6; i < 12; i++) {
+    t.emplace_back(new std::thread(WaitTrue, a[i]));
+  }
+
+  for (int i = 6; i < 12; i++) {
+    a[i]->Finish(true);
+    t[i]->join();
+  }
+}
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

+    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                << outs[i] << " back";
-        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
+        rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
+                                                    ins[i], outs[i]));
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }
    }
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+    for (size_t i = 0; i < rets.size(); i++) {
+      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+    }
  }
 };

--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

+    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < outs.size(); i++) {
      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
+      rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
    }
    if (sync_mode) {
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+      for (size_t i = 0; i < rets.size(); i++) {
+        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+      }
    }
  }
 };
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@ -15,6 +15,7 @@ limitations under the License. */
 #include <future>  // NOLINT
 #include <ostream>

+#include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

+    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
        VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        // TODO(Yancey1989): we need to use an IO threadpool which has
-        // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
+        rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
      } else {
        VLOG(3) << "don't send no-initialied variable: " << ins[i];
      }
    }
    if (sync_send) {
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+      for (size_t i = 0; i < rets.size(); i++) {
+        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+      }
    }
  }
 };
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@ -192,7 +192,8 @@ class MKLDNNHandler {
      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
      const std::shared_ptr<mkldnn::memory> user_memory_p,
      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
    // create reorder primitive if the input format is not the preferred one
    auto local_key = key_ + suffix;
    auto key_reorder_p = key_ + suffix + "reorder_p";
@ -213,7 +214,7 @@ class MKLDNNHandler {
        pipeline.push_back(*reorder_p);
      }
      dev_ctx_.SetBlob(local_key, target_memory_p);
-    } else {
+    } else if (!is_persistent) {
      // Make reorder if needed
      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
          dev_ctx_.GetBlob(key_reorder_p));
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@ -128,6 +128,13 @@ class ParallelExecutor(object):
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                exec_strategy.num_threads = cpu_num * 2

+        # Use a single thread under the nccl2 distributed environment
+        # to make sure all GPUs run ops in the same order.
+        if num_trainers > 1:
+            assert (use_cuda)
+            # FIXME(gongwb): avoid this set.
+            exec_strategy.num_threads = 1
+
        if build_strategy is None:
            build_strategy = BuildStrategy()

--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@ -60,12 +60,46 @@ class InferenceTranspiler(object):
        if not isinstance(scope, core.Scope):
            raise TypeError("scope should be as Scope type or None")
        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+
        self._fuse_batch_norm(program, place, scope)
        if use_mkldnn:
-            self._fuse_relu_mkldnn(program)
            self._fuse_conv_bias_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(program)
+            self._fuse_bn_relu_mkldnn(program)
+
+    def _fuse_conv_relu_mkldnn(self, program):
+        '''
+        Fuse relu activations into the preceding conv2d OP for MKLDNN programs.
+        A relu OP that directly follows a conv2d OP in block 0 is removed and
+        the conv2d OP gets its 'fuse_relu' attribute set to True instead.
+        A conv2d OP that is the last OP of the block is left untouched.
+        The result of fuse is:
+            - before:
+                - conv->relu->any_other_op
+            - after:
+                - conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            # Guard the look-ahead: a conv2d at the end of the block has no
+            # successor, and indexing ops[i + 1] would raise IndexError.
+            if current_op.type in ['conv2d'] and i + 1 < len(self.block.ops):
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'relu':
+                    # modify conv OP to include relu
+                    current_op.set_attr("fuse_relu", True)
+                    # remove relu OP
+                    # NOTE(review): inputs of later OPs that read the relu
+                    # output are not rewired here; verify this is handled
+                    # elsewhere.
+                    self.block._remove_op(i + 1)
+            i = i + 1

-    def _fuse_relu_mkldnn(self, program):
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        # NOTE(review): this only rebinds the local name; the caller's
+        # program object is not replaced by the clone.
+        program = program.clone()
+
+    def _fuse_bn_relu_mkldnn(self, program):
        '''
        Transpile the program by fused relu activation for MKLDNN program.

@ -160,7 +194,6 @@ class InferenceTranspiler(object):
                self.block._remove_op(i + 1)  # Remove old conv
                self.block._remove_op(i + 1)  # Remove elementwise_add
            i = i + 1
-            i = i + 1

        self._remove_unused_var()
        # TODO(luotao): use clone() method to flush the program.desc in force,