Merge branch 'develop' of github.com:PaddlePaddle/Paddle into fix_pserver_sub_blocks

7 years ago · 3a37e14285
parent 0970bd9edc bcea248b60
commit 3a37e14285
71 changed files with 1041 additions and 801 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -23,7 +23,7 @@ repos:
    -   id: clang-format-with-version-check
        name: clang-format
        description: Format files with ClangFormat.
-        entry: bash ./.clang_format.hook -i
+        entry: bash ./tools/codestyle/clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: local
@ -52,7 +52,7 @@ repos:
    hooks:
    -   id: copyright_checker
        name: copyright_checker
-        entry: python ./.copyright.hook
+        entry: python ./tools/codestyle/copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
--- a/paddle/contrib/inference/high_level_api.md
+++ b/paddle/contrib/inference/high_level_api.md
@ -0,0 +1,59 @@
 # Inference High-level APIs
 This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application.
 The APIs are described in `paddle_inference_api.h`, just one header file, and two libraries, `libpaddle_fluid.so` and `libpaddle_fluid_api.so`, are needed.
 ## PaddleTensor
 We provide the `PaddleTensor` data structure to give a general tensor interface.
 The definition is 
 ```c++
 struct PaddleTensor {
  std::string name;  // variable name.
  std::vector<int> shape;
  PaddleBuf data;  // blob of data.
  PaddleDType dtype;
 };
 ```
 The data is stored in a contiguous block of memory, `PaddleBuf`, and the tensor's data type is specified by a `PaddleDType`.
 The `name` field is used to specify the name of the input variable,
 which is important when there are multiple inputs and one needs to distinguish which variable to set.
 ## engine
 The inference APIs have two different underlying implementations; currently there are two valid engines:
 - the native engine, which consists of the native operators and framework,
 - the Anakin engine, which embeds the Anakin library.
 The native engine takes a native Paddle model as input and supports any model trained by Paddle,
 but the Anakin engine can only take an Anakin model as input (the user needs to manually transform the format first), and currently not all Paddle models are supported.
 ```c++
 enum class PaddleEngineKind {
  kNative = 0,  // Use the native Fluid facility.
  kAnakin,      // Use Anakin for inference.
 };
 ```
 ## PaddlePredictor and how to create one
 The main interface is `PaddlePredictor`, which has the following methods:
 - `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
  - takes `inputs` and writes the results to `output_data`
 - `Clone` to clone a predictor from an existing one, with the model parameters shared.
 There is a factory method to help create a predictor, and the user takes the ownership of this object.
 ```c++
 template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
 ```
 By specifying the engine kind and config, one can get a specific implementation.
 ## Reference
 - [paddle_inference_api.h](./paddle_inference_api.h)
 - [demos](./demo)
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@ -109,8 +109,7 @@ class PaddlePredictor {
  // The common configs for all the predictors.
  struct Config {
-    std::string model_dir;      // path to the model directory.
+    std::string model_dir;  // path to the model directory.
    bool enable_engine{false};  // Enable to execute (part of) the model on
  };
 };
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #endif
  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
  int GetVarDeviceID(const std::string &varname) const;
 private:
  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
-                         size_t place_id) const;
+                         size_t device_id) const;
 private:
  std::string loss_var_name_;
@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
      const std::string &og,
      std::unordered_set<std::string> *og_has_been_broadcast) const;
-  int GetOpDeviceID(
+  int GetOpDeviceID(const OpDesc &op) const;
      const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
      const OpDesc &op) const;
  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                         size_t src_dev_id) const;
-  bool IsSparseGradient(
+  bool IsSparseGradient(const std::string &og) const;
-      const std::unordered_map<std::string, VarDesc *> &all_vars,
+
-      const std::string &og) const;
+  size_t GetAppropriateDeviceID(
      const std::vector<std::string> &var_names) const;
 private:
  BuildStrategy strategy_;
  mutable std::unordered_map<std::string, VarDesc *> all_vars_;
  mutable std::unordered_map<std::string, int> var_name_on_devices_;
  mutable std::vector<int64_t> balance_vars_;
  void SetCommunicationContext(OpHandleBase *op_handle,
                               const platform::Place &p) const;
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@ -30,6 +30,7 @@ class SSAGraphBuilder {
  SSAGraphBuilder() {}
  virtual ~SSAGraphBuilder() {}
  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
  virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
    if (timeout) {
      std::lock_guard<std::mutex> l(exception_mu_);
      if (exception_) {
        auto exp = *exception_;
        exception_.reset();
@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp(
      ready_var_q->Extend(op->Outputs());
      VLOG(10) << op << " " << op->Name() << "Signal posted";
    } catch (platform::EnforceNotMet ex) {
      std::lock_guard<std::mutex> l(exception_mu_);
      exception_.reset(new platform::EnforceNotMet(ex));
    } catch (...) {
      LOG(FATAL) << "Unknown exception catched";
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
  platform::DeviceContextPool fetch_ctxs_;
  std::mutex exception_mu_;
  std::unique_ptr<platform::EnforceNotMet> exception_;
  std::atomic<int> running_ops_;
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #ifdef PADDLE_WITH_DISTRIBUTE
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
 #endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
 #ifdef PADDLE_WITH_DISTRIBUTE
 void Executor::Complete() {
-  ::paddle::operators::detail::RPCClient::GetInstance<
+  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::detail::GRPCClient>()
+      ::paddle::operators::distributed::GRPCClient>()
      ->SendComplete();
 }
 #endif
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor(
  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
  // ncclOp
  details::SSAGraphBuilderFactory builder_factory(
      member_->places_, loss_var_name, params, member_->local_scopes_,
      build_strategy);
@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor(
 #endif
  }
  builder_ = std::move(builder_factory.Create());
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places,
-      builder_factory.Create()->Build(main_program)));
+      builder_->Build(main_program)));
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor(
 void ParallelExecutor::BCastParamsToGPUs(
    const std::unordered_set<std::string> &vars) const {
-  auto *main_scope = member_->local_scopes_[0];
+  // For the initial bcast, all vars would be bcast from device(0), otherwise
  // bcast from the specified device.
  bool initialize = builder_.get() == nullptr ? true : false;
  for (auto &var : vars) {
-    auto *main_var = main_scope->FindVar(var);
+    int var_dev_id =
        builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
    if (!initialize && var_dev_id == -1) continue;
    framework::Variable *main_var = nullptr;
    if (initialize) {
      main_var = member_->local_scopes_[0]->FindVar(var);
    } else {
      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
    }
    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
      continue;
    }
@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs(
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;
-        if (i == 0) {
+
        if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          auto local_scope = member_->local_scopes_[i];
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@ -19,12 +19,14 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace framework {
@ -68,6 +70,7 @@ class ParallelExecutor {
 private:
  ParallelExecutorPrivate *member_;
  std::unique_ptr<details::SSAGraphBuilder> builder_;
 };
 }  // namespace framework
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@ -184,9 +184,9 @@ else()
    set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
 add_subdirectory(detail)
 if(WITH_DISTRIBUTE)
-
+    add_subdirectory(distributed)
    set(DISTRIBUTE_DEPS "")
    if(WITH_GRPC)
        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE)
    endif()
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-    op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
+    foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
-    set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+        op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
-    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
+        set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    endforeach()
-    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
+    
    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
    op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
    #        listen_and_serv_op sum_op executor SERIAL)
--- a/paddle/fluid/operators/detail/macros.h
+++ b/paddle/fluid/operators/detail/macros.h
@ -15,13 +15,13 @@
 #pragma once
 #ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/distributed/grpc_server.h"
-#define RPCSERVER_T detail::AsyncGRPCServer
+#define RPCSERVER_T distributed::AsyncGRPCServer
-#define RPCCLIENT_T detail::GRPCClient
+#define RPCCLIENT_T distributed::GRPCClient
 #else
-#include "paddle/fluid/operators/detail/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc_client.h"
-#include "paddle/fluid/operators/detail/brpc_server.h"
+#include "paddle/fluid/operators/distributed/brpc_server.h"
-#define RPCSERVER_T detail::AsyncBRPCServer
+#define RPCSERVER_T distributed::AsyncBRPCServer
-#define RPCCLIENT_T detail::BRPCClient
+#define RPCCLIENT_T distributed::BRPCClient
 #endif
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@ -1,8 +1,3 @@
 if(NOT WITH_DISTRIBUTE)
    return()
 endif()
 if(WITH_GRPC)
  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
--- a/paddle/fluid/operators/distributed/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/detail/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc_client.h"
 #include "paddle/fluid/framework/threadpool.h"
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 DEFINE_int32(brpc_channel_num, 24,
             "Number of channels to send requests connected to one server");
@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
  return q;
 }
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/brpc_client.h
+++ b/paddle/fluid/operators/distributed/brpc_client.h
@ -31,13 +31,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/detail/rpc_client.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 struct ChannelContext {
  brpc::Channel channel;
@ -95,6 +95,6 @@ class BRPCClient : public RPCClient {
  DISABLE_COPY_AND_ASSIGN(BRPCClient);
 };
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/detail/brpc_server.h"
+#include "paddle/fluid/operators/distributed/brpc_server.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 namespace sendrecv {
 typedef std::unordered_map<std::string,
-                           paddle::operators::detail::RequestHandler*>
+                           paddle::operators::distributed::RequestHandler*>
    HandlerMap;
 class BRPCServiceImpl : public SendRecvService {
@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService {
      : request_send_h_(nullptr),
        request_get_h_(nullptr),
        request_prefetch_h_(nullptr) {
-    auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
+    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
    if (it != rpc_call_map.end()) {
      request_send_h_ = it->second;
    }
-    it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
+    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
    if (it != rpc_call_map.end()) {
      request_get_h_ = it->second;
    }
-    it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch);
+    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
    if (it != rpc_call_map.end()) {
      request_prefetch_h_ = it->second;
    }
@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService {
  }
 private:
-  paddle::operators::detail::RequestHandler* request_send_h_;
+  paddle::operators::distributed::RequestHandler* request_send_h_;
-  paddle::operators::detail::RequestHandler* request_get_h_;
+  paddle::operators::distributed::RequestHandler* request_get_h_;
-  paddle::operators::detail::RequestHandler* request_prefetch_h_;
+  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
 };
 }  // namespace sendrecv
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 void AsyncBRPCServer::StartServer() {
  // Instance of your service.
@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() {
  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
 }
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
--- a/paddle/fluid/operators/distributed/brpc_server.h
+++ b/paddle/fluid/operators/distributed/brpc_server.h
@ -19,12 +19,12 @@ limitations under the License. */
 #include <string>
 #include "brpc/server.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 class AsyncBRPCServer final : public RPCServer {
 public:
@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer {
  int ready_;
 };
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
@ -17,11 +17,11 @@ limitations under the License. */
 //       file and did some modifications so that we can send gRPC
 //       requests without too much copying of the tensor data.
-#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 GrpcByteBufferSource::GrpcByteBufferSource() {}
@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
  return byte_count_;
 }
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h
@ -106,7 +106,7 @@ class GrpcBufferReader final
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 // Source provides a way for a particular RPC implementation to provide
 // received data to ParseFrom.
 class Source {
@ -183,6 +183,6 @@ class GrpcByteSource : public Source {
  char space_[sizeof(Reader)];
 };
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
 #include <sys/time.h>
 #include <limits>
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 void GRPCClient::InitImpl() { InitEventLoop(); }
@ -276,6 +276,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
  return ch;
 }
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@ -38,13 +38,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/detail/rpc_client.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 struct VarHandle {
  std::string ep;
@ -226,6 +226,6 @@ class GRPCClient : public RPCClient {
  DISABLE_COPY_AND_ASSIGN(GRPCClient);
 };
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  for (int i = 0; i < 564; ++i) rows->push_back(i);
  ::grpc::ByteBuffer msg;
-  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
  // deserialize
@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  // deserialize zero-copy
  // framework::Variable var2;
-  // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
+  // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
  framework::Scope scope;
  scope.Var("myvar");
-  operators::detail::VariableResponse resp(&scope, &ctx);
+  operators::distributed::VariableResponse resp(&scope, &ctx);
  EXPECT_EQ(resp.Parse(msg), 0);
  framework::Variable* var2 = resp.GetVar();
@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
  math::set_constant(ctx, tensor, 31.9);
  ::grpc::ByteBuffer msg;
-  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
  // deserialize
@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
  // deserialize zero-copy
  framework::Scope scope;
  scope.Var("myvar");
-  operators::detail::VariableResponse resp(&scope, &ctx);
+  operators::distributed::VariableResponse resp(&scope, &ctx);
  if (from_type == 0) {
    EXPECT_EQ(resp.Parse(msg), 0);
  } else {
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@ -15,13 +15,13 @@ limitations under the License. */
 #include <limits>
 #include <string>
-#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/distributed/grpc_server.h"
 using ::grpc::ServerAsyncResponseWriter;
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 enum CallStatus { PROCESS = 0, FINISH };
 // reference:
@ -74,7 +74,7 @@ class RequestSend final : public RequestBase {
    request_.reset(new VariableResponse(request_handler->scope(),
                                        request_handler->dev_ctx(),
                                        !request_handler->sync_mode()));
-    int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
+    int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(
        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
@ -106,7 +106,7 @@ class RequestGet final : public RequestBase {
                      ::grpc::ServerCompletionQueue* cq,
                      RequestHandler* request_handler, int req_id)
      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
+    auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
    service_->RequestAsyncUnary(
        method_id, &ctx_, &request_, &responder_, cq_, cq_,
        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase {
        local_scope_(nullptr) {
    request_.reset(new VariableResponse(request_handler->scope(),
                                        request_handler->dev_ctx(), true));
-    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
+    int method_id =
        static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
    service_->RequestAsyncUnary(
        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
@ -354,6 +355,6 @@ void AsyncGRPCServer::HandleRequest(
  }
 }
-}  // namespace detail
+}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_server.h
+++ b/paddle/fluid/operators/distributed/grpc_server.h
@ -29,17 +29,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/detail/grpc_service.h"
+#include "paddle/fluid/operators/distributed/grpc_service.h"
-#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
-#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
-namespace detail {
+namespace distributed {
 class RequestBase;
@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer {
  std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
 };
-};  // namespace detail
+};  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
--- a/Show More
+++ b/Show More