Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_grpc_destroy_bug

7 years ago · 2c4fb585db
parent 0d04545e9c 8b80d6da23
commit 2c4fb585db
507 changed files with 1301 additions and 1023 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
--- a/README.md
+++ b/README.md
@ -18,6 +18,8 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 ### Latest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
 ## Features
 - **Flexibility**
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -83,18 +83,20 @@ else()
  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
 endif()
-find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
+if(WITH_SYSTEM_BLAS)
  find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
+  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+  if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
-  set(CBLAS_FOUND ON)
+    set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER REFERENCE)
+    set(CBLAS_PROVIDER REFERENCE)
-  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
+    set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+    set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
-  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+    add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
-  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+    message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  endif()
 endif()
 if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -257,8 +257,8 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@ -324,8 +324,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
--- a/doc/fluid/design/dist_train/dist_train_nccl2.md
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@ -0,0 +1,35 @@
 # Distributed Training with NCCL2
 We design a pattern that can enable training with `ParallelExecutor` and
 using [NCCL2](https://developer.nvidia.com/nccl) as its collective
 communication library.
 In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
 to do multi GPU training. And if we initialize NCCL2 communicators as
 ranks in a distributed environment, we can simply run the `ParallelExecutor`
 as a distributed program! The only thing that may be different than in
 the single node version is that we need to broadcast the NCCL unique ID
 to all the nodes, and initialize communicators using that ID, so NCCL2
 will know each other as ranks.
 To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
 so we are ***not*** "bound to" running NCCL2 with MPI; we can run it on
 whatever platform you like.
 It has two running modes:
 1. Generate and broadcast mode, which should be used on trainer 0;
 1. Listen and fetch mode, which should be used on trainers other than 0.
 In both modes, this op can save the NCCL ID into the current scope as a
 persistable variable. Then we can insert this op at the end of
 "startup program" of fluid, so that all workers can get the same ID to
 initialize NCCL communicator objects.
 <img src="src/ncc2_design.png">
 The above figure indicates the general process when training with NCCL2
 distributed. Each trainer has a number of communicators equal to the
 number of GPUs, but the ranks should match the global rank numbers: here
 we have 8 GPUs in total, so `nranks==8`; for each trainer, the ranks should
 be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@ -119,6 +119,32 @@ optimization algorithm $f$ runs on the storage service.
 - Con: the storage service needs to be able to run the optimization
  algorithm.
 ## Distributed Sparse Table in Fluid
 For another design, we can implement a distributed sparse table in Fluid,
 and don't need to maintain an external storage component while training.
 You may need to read Fluid [Distributed Training Architecture](./distributed_architecture.md)
 and [Parameter Server](./parameter_server.md) before going on.
 ![fluid lookup remote table](./src/fluid_lookup_remote_table.png)
 Partition a large table into multiple pserver instances
 1. `DistributeTranspiler` would split the table into some small
 table blocks using a partitioning algorithm such as
 [RoundRobin](https://en.wikipedia.org/wiki/Round-robin_scheduling),
 [Hash](https://en.wikipedia.org/wiki/Hash), etc.
 1. For some cases, the range of input `Ids` is very wide and unpredictable, so the sparse
 table would be able to fill a new value for the id that didn't appear before with
 zero, uniform random or Gaussian distribution.
 For each Trainer's training process:
 1. In the forward pass, we use `pre-fetch` op to pre-fetch parameter blocks according to the
 input `Ids` from PServers instead of the local `lookup_table` op, and then merge the blocks
 into a parameter `W`.
 1. Compute `GRAD@W'` in the backward pass using the pre-fetched `W` and send it to PServer to
 execute the optimize pass.
 ## Conclusion
 Let us do the "storage service does not optimize" solution first, as a
--- a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
+++ b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
--- a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
+++ b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
--- a/doc/fluid/design/dist_train/src/ncc2_design.graffle
+++ b/doc/fluid/design/dist_train/src/ncc2_design.graffle
--- a/doc/fluid/design/dist_train/src/ncc2_design.png
+++ b/doc/fluid/design/dist_train/src/ncc2_design.png
--- a/doc/v2/design/cluster_train/large_model_dist_train.md
+++ b/doc/v2/design/cluster_train/large_model_dist_train.md
@ -52,7 +52,7 @@ In `trainer_internal.cpp:L93 trainOneBatch`:
 When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
-In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
 ```c++
 if (fullSize) {
--- a/doc/v2/dev/new_layer_en.rst
+++ b/doc/v2/dev/new_layer_en.rst
@ -339,7 +339,7 @@ If you are creating a new file for the test, such as :code:`paddle/legacy/gserve
 Implement Python Wrapper
 ========================
-Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
+Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/legacy/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
 - Use :code:`@config_layer('fc')` at the decorator for all the Python wrapper class. :code:`fc` is the identifier of the layer.
 - Implements :code:`__init__` constructor function.
--- a/doc/v2/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md
@ -18,7 +18,7 @@
 </tr>
 <tr>
 <td>cpu_avx_openblas</td>
-<td>暂无</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cpu_noavx_openblas</td>
@ -35,7 +35,12 @@
 <tr>
 <td>cuda8.0_cudnn7_avx_mkl</td>
 <td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
-</tr></tbody></table>
+</tr>
 <tr>
 <td>cuda9.0_cudnn7_avx_mkl</td>
 <td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
 </tr>
 </tbody></table>
 ### 从源码编译
--- a/doc/v2/howto/capi/compile_paddle_lib_en.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_en.md
@ -17,7 +17,7 @@
 </tr>
 <tr>
 <td>cpu_avx_openblas</td>
-<td>-</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cpu_noavx_openblas</td>
@ -34,7 +34,12 @@
 <tr>
 <td>cuda8.0_cudnn7_avx_mkl</td>
 <td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
-</tr></tbody></table>
+</tr>
 <tr>
 <td>cuda9.0_cudnn7_avx_mkl</td>
 <td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
 </tr>
 </tbody></table>
 ### From source
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -1,24 +1,24 @@
 if(NOT WITH_FLUID_ONLY)
  add_subdirectory(legacy/cuda)
  add_subdirectory(legacy/function)
-  add_subdirectory(utils)
+  add_subdirectory(legacy/utils)
  add_subdirectory(legacy/math)
  add_subdirectory(legacy/gserver)
  add_subdirectory(legacy/parameter)
  if(MOBILE_INFERENCE)
-    add_subdirectory(capi)
+    add_subdirectory(legacy/capi)
  else()
    add_subdirectory(legacy/pserver)
-    add_subdirectory(trainer)
+    add_subdirectory(legacy/trainer)
    add_subdirectory(scripts)
    if(WITH_C_API)
-      add_subdirectory(capi)
+      add_subdirectory(legacy/capi)
    endif()
    if(WITH_SWIG_PY)
-      add_subdirectory(api)
+      add_subdirectory(legacy/api)
    endif()
  endif()
 endif()
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@ -22,9 +22,9 @@
 #include "paddle/contrib/inference/paddle_inference_api.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -21,10 +21,10 @@ endif()
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@ -38,7 +38,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry init math_function)
+        DEPS operator op_registry device_context math_function)
 if(WITH_GPU)
  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
@ -63,7 +63,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@ -101,14 +101,14 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
 cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
-cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+
-        channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
+# disable test temporarily.
-        conditional_block_op while_op assign_op print_op executor proto_desc)
+# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
 # cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
 #         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
 #         conditional_block_op while_op assign_op print_op executor proto_desc)
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@ -14,13 +14,13 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 namespace paddle {
 namespace framework {
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@ -62,7 +62,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
  }
  if (total_size < device_num) {
    // No enough data.
-    PADDLE_THROW("There is no next data.");
+    PADDLE_THROW_EOF();
  }
  std::sort(size_device_vec.begin(), size_device_vec.end(),
            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@ -124,16 +124,10 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
  if (!events_.empty()) {  // Use event
    std::function<void()> method = callback;
    // NOTE(zcd): device context must be ordered here because RecordEvent
    // will use a mutex to ensure the safe of multi-threads.
    std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
    for (auto &p : dev_ctxes_) {
      ordered_ctxes.emplace(p.second, p.first);
    }
    for (auto &p : ordered_ctxes) {
      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
+        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
-            events_.at(boost::get<platform::CUDAPlace>(p.second).device),
+            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
            method);
      };
    }
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@ -13,9 +13,9 @@
 // limitations under the License.
 #pragma once
 #include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"
@ -92,9 +92,7 @@ class OpHandleBase {
  std::vector<VarHandleBase *> inputs_;
  std::vector<VarHandleBase *> outputs_;
-  std::unordered_map<platform::Place, platform::DeviceContext *,
+  std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
                     platform::PlaceHash>
      dev_ctxes_;
 #ifdef PADDLE_WITH_CUDA
  std::unordered_map<int, cudaEvent_t> events_;
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@ -54,8 +54,7 @@ struct ReduceLoDTensor {
 inline void GatherSelectedRows(
    const std::vector<const SelectedRows *> &src_selecte_rows_,
    const std::vector<platform::Place> &in_places,
-    const std::unordered_map<platform::Place, platform::DeviceContext *,
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
                             platform::PlaceHash> &dev_ctxes,
    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
  PADDLE_ENFORCE(!src_selecte_rows_.empty());
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@ -98,9 +98,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    if (timeout) {
      std::lock_guard<std::mutex> l(exception_mu_);
      if (exception_) {
-        auto exp = *exception_;
+        std::exception *exp = exception_.get();
-        exception_.reset();
+        if (dynamic_cast<platform::EOFException *>(exp)) {
-        throw exp;
+          auto e = *static_cast<platform::EOFException *>(exp);
          exception_.reset();
          throw e;
        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
          auto e = *static_cast<platform::EnforceNotMet *>(exp);
          exception_.reset();
          throw e;
        } else {
          LOG(FATAL) << "Unknown exception.";
        }
      } else {
        continue;
      }
@ -199,6 +208,12 @@ void ThreadedSSAGraphExecutor::RunOp(
      running_ops_--;
      ready_var_q->Extend(op->Outputs());
      VLOG(10) << op << " " << op->Name() << "Signal posted";
    } catch (platform::EOFException ex) {
      std::lock_guard<std::mutex> l(exception_mu_);
      // EOFException will not cover up existing EnforceNotMet.
      if (exception_.get() == nullptr) {
        exception_.reset(new platform::EOFException(ex));
      }
    } catch (platform::EnforceNotMet ex) {
      std::lock_guard<std::mutex> l(exception_mu_);
      exception_.reset(new platform::EnforceNotMet(ex));
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@ -57,7 +57,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  std::vector<platform::Place> places_;
  platform::DeviceContextPool fetch_ctxs_;
  std::mutex exception_mu_;
-  std::unique_ptr<platform::EnforceNotMet> exception_;
+  std::unique_ptr<std::exception> exception_;
  std::atomic<int> running_ops_;
  void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -46,9 +46,16 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 Executor::Executor(const platform::Place& place) : place_(place) {}
 #ifdef PADDLE_WITH_DISTRIBUTE
-void Executor::Complete() {
+void Executor::BeginPass() {
-  ::paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>()
+  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ->SendComplete();
+      ::paddle::operators::distributed::GRPCClient>()
      ->SendBeginPass();
 }
 void Executor::EndPass() {
  ::paddle::operators::distributed::RPCClient::GetInstance<
      ::paddle::operators::distributed::GRPCClient>()
      ->SendEndPass();
 }
 #endif
--- a/Show More
+++ b/Show More