merge conflict, test=develop

6 years ago · 92c8ac8a74
parent 99120698b7 a06f4b2b2c
commit 92c8ac8a74
74 changed files with 2564 additions and 1747 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -75,7 +75,6 @@ option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
 option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)
-option(WITH_WBAES       "Compile PaddlePaddle with WBAES support"       ON)

 # PY_VERSION
 if(NOT PY_VERSION)
@ -149,7 +148,6 @@ include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc
-include(external/wbaes)     # download wbaes

 if (NOT WIN32)
 # there is no official support of nccl, cupti in windows
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -157,7 +157,3 @@ endif(WITH_BRPC_RDMA)
 if(ON_INFER)
    add_definitions(-DPADDLE_ON_INFERENCE)
 endif(ON_INFER)
-
-if(WITH_WBAES)
-    add_definitions(-DPADDLE_WITH_WBAES)
-endif(WITH_WBAES)
--- a/cmake/external/wbaes.cmake
+++ b/cmake/external/wbaes.cmake
@ -1,71 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_WBAES})
-    return()
-ENDIF(NOT ${WITH_WBAES})
-
-INCLUDE(ExternalProject)
-SET(WBAES_DST_DIR       "wbaes")
-SET(WBAES_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(WBAES_INSTALL_DIR   ${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR})
-SET(WBAES_ROOT          ${WBAES_INSTALL_DIR})
-SET(WBAES_INC_DIR       ${WBAES_ROOT}/include)
-SET(WBAES_LIB_DIR       ${WBAES_ROOT}/lib)
-
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${WBAES_ROOT}/lib")
-SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-
-IF(APPLE)
-    SET(WBAES_TAG   "v1.0.0" CACHE STRING "" FORCE)
-    SET(WBAES_URL   "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.mac.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
-    SET(WBAES_LIB   ${WBAES_LIB_DIR}/libwbaes.dylib)
-    SET(WBAES_SHARED_LIB   ${WBAES_LIB_DIR}/libwbaes.dylib)
-ELSEIF(WIN32)
-    SET(WBAES_TAG   "v1.0.0" CACHE STRING "" FORCE)
-    SET(WBAES_URL   "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.windows-x64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
-    SET(WBAES_LIB   ${WBAES_LIB_DIR}/libwbaes.lib)
-    SET(WBAES_SHARED_LIB   ${WBAES_LIB_DIR}/libwbaes.dll)
-ELSE()
-    SET(WBAES_TAG   "v1.0.2" CACHE STRING "" FORCE)
-    SET(WBAES_URL   "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.linux-x86_64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
-    SET(WBAES_LIB   ${WBAES_LIB_DIR}/libwbaes.so)
-    SET(WBAES_SHARED_LIB   ${WBAES_LIB_DIR}/libwbaes.so)
-ENDIF()
-
-SET(WBAES_PROJECT       "extern_wbaes")
-MESSAGE(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}")
-SET(WBAES_SOURCE_DIR    "${THIRD_PARTY_PATH}/wbaes") 
-SET(WBAES_DOWNLOAD_DIR  "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}")
-
-ExternalProject_Add(
-    ${WBAES_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                  ${WBAES_SOURCE_DIR}
-    URL                     ${WBAES_URL}
-    DOWNLOAD_DIR            ${WBAES_DOWNLOAD_DIR}
-    DOWNLOAD_NO_PROGRESS    1
-    CONFIGURE_COMMAND       ""
-    BUILD_COMMAND           ""
-    INSTALL_COMMAND         ""
-        ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/include ${WBAES_INC_DIR} &&
-        ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/lib ${WBAES_LIB_DIR}
-)
-
-INCLUDE_DIRECTORIES(${WBAES_INC_DIR})
-
-ADD_LIBRARY(wbaes SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_LOCATION ${WBAES_LIB})
-SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_NO_SONAME 1)
-ADD_DEPENDENCIES(wbaes ${WBAES_PROJECT})
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -264,14 +264,6 @@ function(cc_library TARGET_NAME)
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
-      # Only deps libwbaes.so, not link
-      if("${cc_library_DEPS};" MATCHES "wbaes;")
-        list(REMOVE_ITEM cc_library_DEPS wbaes)
-        if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes")
-          list(APPEND cc_library_DEPS dynload_wbaes)
-        endif()
-        add_dependencies(${TARGET_NAME} wbaes)
-      endif()
      # Only deps libmklml.so, not link
      if("${cc_library_DEPS};" MATCHES "mklml;")
        list(REMOVE_ITEM cc_library_DEPS mklml)
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -170,14 +170,6 @@ copy(snappystream_lib
        DSTS ${dst_dir} ${dst_dir}/lib
        DEPS snappystream)

-if (WITH_WBAES)
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes")
-    copy(wbaes_lib
-            SRCS ${WBAES_INC_DIR} ${WBAES_LIB}
-            DSTS ${dst_dir} ${dst_dir}/lib
-            DEPS wbaes)
-endif ()
-
 set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
 copy(zlib_lib
        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -236,6 +236,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
+paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@ -242,6 +242,11 @@ void InMemoryDataFeed<T>::SetTrainerNum(int trainer_num) {
  trainer_num_ = trainer_num;
 }

+template <typename T>
+void InMemoryDataFeed<T>::SetFleetSendBatchSize(int64_t size) {
+  fleet_send_batch_size_ = size;
+}
+
 template <typename T>
 void InMemoryDataFeed<T>::PutInsToChannel(const std::string& ins_str) {
 #ifdef _LINUX
@ -361,8 +366,13 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
  VLOG(3) << "GlobalShuffle() begin, thread_id=" << thread_id_;
  auto fleet_ptr = FleetWrapper::GetInstance();
  std::vector<std::vector<T*>> send_vec(trainer_num_);
+  std::vector<int> send_index(trainer_num_);
+  uint64_t reserve_len = fleet_send_batch_size_ / trainer_num_;
  for (auto& vec : send_vec) {
-    vec.reserve(fleet_send_batch_size_);
+    vec.reserve(reserve_len);
+  }
+  for (int i = 0; i < trainer_num_; ++i) {
+    send_index[i] = i;
  }
  std::vector<std::future<int32_t>> total_status;
  auto interval = GetMemoryDataInterval();
@ -375,7 +385,10 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
    int64_t node_id = random_num % trainer_num_;
    send_vec[node_id].push_back(&((*memory_data_)[i]));
    if (i % fleet_send_batch_size_ == 0 && i != 0) {
-      for (int j = 0; j < send_vec.size(); ++j) {
+      // shuffle the sequence of sending to avoid network timeout error
+      std::random_shuffle(send_index.begin(), send_index.end());
+      for (int index = 0; index < send_index.size(); ++index) {
+        int j = send_index[index];
        std::string send_str;
        SerializeIns(send_vec[j], &send_str);
        VLOG(3) << "send str_length=" << send_str.length()
@ -388,7 +401,10 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
      }
    }
  }
-  for (int j = 0; j < send_vec.size(); ++j) {
+  // shuffle the sequence of sending to avoid network timeout error
+  std::random_shuffle(send_index.begin(), send_index.end());
+  for (int index = 0; index < send_index.size(); ++index) {
+    int j = send_index[index];
    if (send_vec[j].size() != 0) {
      std::string send_str;
      SerializeIns(send_vec[j], &send_str);
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@ -94,6 +94,8 @@ class DataFeed {
  virtual void SetThreadNum(int thread_num) {}
  // This function will do nothing at default
  virtual void SetTrainerNum(int trainer_num) {}
+  // This function will do nothing at default
+  virtual void SetFleetSendBatchSize(int64_t size) {}
  virtual void SetFileListMutex(std::mutex* mutex) {
    mutex_for_pick_file_ = mutex;
  }
@ -212,6 +214,7 @@ class InMemoryDataFeed : public PrivateQueueDataFeed<T> {
  virtual void SetThreadId(int thread_id);
  virtual void SetThreadNum(int thread_num);
  virtual void SetTrainerNum(int trainer_num);
+  virtual void SetFleetSendBatchSize(int64_t size);
  virtual void PutInsToChannel(const std::string& ins_str);
  virtual void FillMemoryDataToChannel();
  virtual void FillChannelToMemoryData();
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@ -64,6 +64,17 @@ void DatasetImpl<T>::SetTrainerNum(int trainer_num) {
  }
 }

+// if you run distributed, and want to do global shuffle,
+// set this before global shuffle.
+// be sure you call CreateReaders before SetFleetSendBatchSize
+template <typename T>
+void DatasetImpl<T>::SetFleetSendBatchSize(int64_t size) {
+  fleet_send_batch_size_ = size;
+  for (auto reader : readers_) {
+    reader->SetFleetSendBatchSize(size);
+  }
+}
+
 template <typename T>
 void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
                                   const std::string& fs_ugi) {
--- a/paddle/fluid/framework/data_set.h
+++ b/paddle/fluid/framework/data_set.h
@ -47,6 +47,8 @@ class Dataset {
  virtual void SetThreadNum(int thread_num) = 0;
  // set workers' num
  virtual void SetTrainerNum(int trainer_num) = 0;
+  // set fleet send batch size
+  virtual void SetFleetSendBatchSize(int64_t size) = 0;
  // set fs name and ugi
  virtual void SetHdfsConfig(const std::string& fs_name,
                             const std::string& fs_ugi) = 0;
@ -59,6 +61,8 @@ class Dataset {
  virtual int GetThreadNum() = 0;
  // get worker num
  virtual int GetTrainerNum() = 0;
+  // get fleet send batch size
+  virtual int64_t GetFleetSendBatchSize() = 0;
  // get hdfs config
  virtual std::pair<std::string, std::string> GetHdfsConfig() = 0;
  // get data fedd desc
@ -98,6 +102,7 @@ class DatasetImpl : public Dataset {
  virtual void SetFileList(const std::vector<std::string>& filelist);
  virtual void SetThreadNum(int thread_num);
  virtual void SetTrainerNum(int trainer_num);
+  virtual void SetFleetSendBatchSize(int64_t size);
  virtual void SetHdfsConfig(const std::string& fs_name,
                             const std::string& fs_ugi);
  virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
@ -105,6 +110,7 @@ class DatasetImpl : public Dataset {
  virtual const std::vector<std::string>& GetFileList() { return filelist_; }
  virtual int GetThreadNum() { return thread_num_; }
  virtual int GetTrainerNum() { return trainer_num_; }
+  virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
  virtual std::pair<std::string, std::string> GetHdfsConfig() {
    return std::make_pair(fs_name_, fs_ugi_);
  }
@ -137,6 +143,7 @@ class DatasetImpl : public Dataset {
  std::string fs_name_;
  std::string fs_ugi_;
  unsigned int rand_seed;
+  int64_t fleet_send_batch_size_;
 };

 // use std::vector<MultiSlotType> as data type
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
    }
  }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
    auto encode_var_name = original_name + g_dgc_encoded;
    auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
    auto &in = in_var->Get<LoDTensor>();
    ins.emplace_back(&in);

--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h
@ -0,0 +1,79 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void SetFuseParameterGroupsSize(int group_size);
+int GetFuseParameterGroupsSize();
+
+void SetFuseParameterMemorySize(uint64_t memory_size);
+uint64_t GetFuseParameterMemorySize();
+
+class AllocContinuousSpaceForGradPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  template <typename AttrType>
+  void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const;
+
+  void SetGroupGradsAndParams(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const;
+
+  void SetGroupAccordingToLayers(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const;
+
+  void SetGroupAccordingToMemorySize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const;
+
+  void SetGroupAccordingToGroupSize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const;
+
+ private:
+  bool IsSupportedVarType(const proto::VarType::Type &type) const;
+
+  void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const;
+
+  void InitFusedVarsAndAllocSpaceForVars(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::unordered_map<std::string, ir::Node *> &vars,
+      const std::string &fused_var_name,
+      const ParamsAndGrads &params_grads) const;
+
+  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
+                                 const std::vector<std::string> &grads_name,
+                                 const std::string &fused_var_name,
+                                 BlockDesc *global_block) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@ -142,6 +142,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
      AppendPass("memory_optimize_pass");
    }

+    // runtime_context_cache pass should be the last pass to enable the attr of
+    // all original and fused operators. But no operators can be enabled this
+    // attr if putting it after MultiDevPass.
+    if (strategy_.cache_runtime_context_) {
+      VLOG(10) << "Add runtime_context_cache_pass";
+      AppendPass("runtime_context_cache_pass");
+    }
+
    AppendMultiDevPass(strategy_);

    if (strategy_.fuse_all_reduce_ops_) {
@ -243,7 +251,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
  CreatePassesFromStrategy(false);

  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(3) << "apply " << pass->Type();
+    VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
    if (IsMultiDevPass(pass->Type())) {
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
@ -328,3 +336,4 @@ USE_PASS(graph_to_program_pass);
 USE_PASS(fuse_adam_op_pass);
 USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_all_reduce_op_pass);
+USE_PASS(runtime_context_cache_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@ -107,6 +107,8 @@ struct BuildStrategy {
  std::vector<std::string> trainers_endpoints_;
  bool remove_unnecessary_lock_{true};

+  bool cache_runtime_context_{false};
+
  // NOTE:
  // Before you add new options, think if it's a general strategy that works
  // with other strategy. If not, the strategy should be created through
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@ -237,6 +237,7 @@ void FleetWrapper::PushDenseParamSync(
  std::vector<paddle::ps::Region> regions;
  for (auto& t : var_names) {
    Variable* var = scope.FindVar(t);
+    CHECK(var != nullptr) << "var[" << t << "] not found";
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    float* g = tensor->mutable_data<float>(place);
    paddle::ps::Region reg(g, tensor->numel());
--- a/paddle/fluid/framework/io/shell.cc
+++ b/paddle/fluid/framework/io/shell.cc
@ -126,7 +126,7 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
  }

  close_open_fds_internal();
-  if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) {
+  if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) {
    return -1;
  }
  exit(127);
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -68,6 +68,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
+pass_library(expected_kernel_cache_pass base)
 pass_library(quant_conv2d_dequant_fuse_pass inference)
 pass_library(fillconstant_elementwisemul_fuse inference)

--- a/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc
+++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc
@ -0,0 +1,37 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/expected_kernel_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const {
+  VLOG(3) << "Applies Expected Kernel Cache strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      n->Op()->SetAttr(kEnableCacheExpectedKernel, true);
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(expected_kernel_cache_pass,
+              paddle::framework::ir::ExpectedKernelCachePass);
--- a/paddle/fluid/framework/ir/expected_kernel_cache_pass.h
+++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.h
@ -12,23 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_WBAES
+#pragma once

-#include "paddle/fluid/platform/dynload/wbaes.h"
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"

 namespace paddle {
-namespace platform {
-namespace dynload {
+namespace framework {
+namespace ir {

-std::once_flag wbaes_dso_flag;
-void *wbaes_dso_handle = nullptr;
+class ExpectedKernelCachePass : public Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};

-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-WBAES_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
-
-#endif
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@ -23,7 +23,7 @@ namespace ir {
 void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Applies Runtime Context Cache strategy.";
  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
+    if (n->IsOp() && n->Op()) {
      n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
    }
  }
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -899,50 +899,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);

-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
+  if (!HasAttr(kEnableCacheExpectedKernel) || !kernel_type_) {
+    ChooseKernel(*runtime_ctx, scope, place);
  }

-  OpKernelMap& kernels = kernels_iter->second;
-
-  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-  auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
-#endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
-
-  std::vector<KernelConfig>* kernel_configs =
-      GetKernelConfig(expected_kernel_key);
+  std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);

  // do data transformScope &transfer_scope;
  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
-                                     &transfered_inplace_vars, runtime_ctx);
+  auto* transfer_scope =
+      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);

  // exec scope is the scope that kernel actually executed on.
  const Scope& exec_scope =
      (transfer_scope == nullptr ? scope : *transfer_scope);

-  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
+  if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
+    dev_ctx = pool.Get(kernel_type_->place_);
  }

  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
@ -951,8 +924,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  }
  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
-                                       *runtime_ctx, kernel_configs));
+  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
+                                   kernel_configs));

  if (!transfered_inplace_vars.empty()) {
    // there is inplace variable has been transfered.
@ -978,6 +951,46 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  }
 }

+void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
+                                      const Scope& scope,
+                                      const platform::Place& place) const {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(place);
+
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  if (kernels_iter == all_op_kernels.end()) {
+    PADDLE_THROW(
+        "There are no kernels which are registered in the %s operator.", type_);
+  }
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  auto expected_kernel_key = this->GetExpectedKernelType(
+      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_MKLDNN
+  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+  if (kernel_iter == kernels.end() &&
+      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    expected_kernel_key.library_type_ = LibraryType::kPlain;
+    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }
+
+  kernel_type_.reset(new OpKernelType(expected_kernel_key));
+  kernel_func_.reset(new OpKernelFunc(kernel_iter->second));
+}
+
 void OperatorWithKernel::TransferInplaceVarsBack(
    const Scope& scope, const std::vector<std::string>& inplace_vars,
    const Scope& transfer_scope) const {
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@ -70,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 /// this Op's execution to save the elapsed time.
 constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";

+/// If an Op has attribtue kEnableCacheExpectedKernel, it means that in a same
+/// name scope and same place, since the expected kerenl of this Op does not
+/// change in the execution, it could be recorded only at the first iteration of
+/// this Op's execution to save the elapsed time.
+constexpr char kEnableCacheExpectedKernel[] = "@ENABLE_CACHE_EXPECTED_KERNEL@";
+
 /// If an Op has this attribute, all its kernels should calculate output
 /// variable's shape in the corresponding Compute() function. And
 /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
@ -491,8 +497,13 @@ class OperatorWithKernel : public OperatorBase {
                               const std::vector<std::string>& inplace_vars,
                               const Scope& exec_scope) const;

+  void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
+                    const platform::Place& place) const;
+
 protected:
  mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<OpKernelType> kernel_type_;
+  mutable std::unique_ptr<OpKernelFunc> kernel_func_;
  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
  mutable const Scope* pre_scope_ = nullptr;
 };
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+
+cc_test(nccl_context_test SRCS nccl_context_test.cc  DEPS nccl_context)
 endif()
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@ -0,0 +1,133 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/nccl_context.h"
+
+namespace paddle {
+namespace imperative {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void NCCLParallelContext::RecvNCCLID(const std::string &ep,
+                                     ncclUniqueId *nccl_id) {
+  auto addr = paddle::string::Split(ep, ':');
+  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
+                    "The endpoint should contain host and port: %s", ep);
+  std::string host = addr[0];
+  int port = std::stoi(addr[1]);
+
+  int server_fd, new_socket;
+  struct sockaddr_in address;
+  int addrlen = sizeof(address);
+  char buffer[1024] = {0};
+  int opt = 0;
+  // creating socket fd
+  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
+    PADDLE_THROW("create server fd failed");
+  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)))
+    PADDLE_THROW("set socket opt failed");
+
+  address.sin_family = AF_INET;
+  address.sin_addr.s_addr = INADDR_ANY;
+  address.sin_port = htons(port);
+
+  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
+    PADDLE_THROW("binding failed on ep: %s", ep);
+  VLOG(3) << "listening on: " << ep;
+  if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
+
+  if ((new_socket =
+           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
+                  reinterpret_cast<socklen_t *>(&addrlen))) < 0)
+    PADDLE_THROW("accept the new socket fd failed");
+
+  if (read(new_socket, buffer, 1024) < 0)
+    PADDLE_THROW("reading the ncclUniqueId from socket failed");
+  VLOG(3) << "recevived the ncclUniqueId";
+  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
+
+  VLOG(3) << "closing the socket server: " << ep;
+  close(server_fd);
+}
+
+void NCCLParallelContext::SendNCCLID(const std::string &ep,
+                                     ncclUniqueId *nccl_id) {
+  auto addr = paddle::string::Split(ep, ':');
+  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
+                    "The endpoint should contain host and port: %s", ep);
+  std::string host = addr[0];
+  int port = std::stoi(addr[1]);
+  // struct sockaddr_in address;
+  int sock = 0;
+  struct sockaddr_in serv_addr;
+  char buffer[1024] = {0};
+
+  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
+  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+    PADDLE_THROW("create socket failed");
+
+  memset(&serv_addr, '0', sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  serv_addr.sin_port = htons(port);
+
+  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
+    PADDLE_THROW("invalied address: %s", ep);
+
+  while (true) {
+    if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+      VLOG(0) << "worker: " << ep
+              << " is not ready, will retry after 3 seconds...";
+      std::this_thread::sleep_for(std::chrono::seconds(3));
+      continue;
+    }
+    VLOG(3) << "sending the ncclUniqueId to " << ep;
+    send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
+    break;
+  }
+}
+
+void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
+  if (strategy_.local_rank_ == root) {
+    for (auto ep : strategy_.trainer_endpoints_) {
+      if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
+    }
+  } else {
+    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
+  }
+}
+
+void NCCLParallelContext::Init() {
+  ncclUniqueId nccl_id;
+  ncclComm_t comm;
+  if (strategy_.local_rank_ == 0) {
+    // generate the unique ncclid on the root worker
+    platform::dynload::ncclGetUniqueId(&nccl_id);
+    BcastNCCLId(&nccl_id, 0);
+  } else {
+    BcastNCCLId(&nccl_id, 0);
+  }
+  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
+  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
+          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;
+
+  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
+  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
+  dev_ctx->set_nccl_comm(comm);
+}
+#endif
+
+}  //  namespace imperative
+}  //  namespace paddle
--- a/Show More
+++ b/Show More