Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gru-add-mode

recover_files
Qiao Longfei 6 years ago
commit 72618c8da5

@@ -184,7 +184,7 @@ endif()
target_link_libraries(executor garbage_collector)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor variable_helper)

@@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory

@@ -19,6 +19,13 @@
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue:
// https://github.com/PaddlePaddle/Paddle/issues/15049
DEFINE_bool(
sync_nccl_allreduce, false,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");
namespace paddle {
namespace framework {
namespace details {
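
The flag above follows the standard gflags pattern. A minimal standalone sketch of how such a flag is defined and consumed (the main() and the trainer invocation are illustrative, not part of this patch):

#include <gflags/gflags.h>

DEFINE_bool(sync_nccl_allreduce, false,
            "If set true, will call `cudaStreamSynchronize(nccl_stream)` "
            "after allreduce; this mode can get better performance in some "
            "scenarios.");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_sync_nccl_allreduce) {
    // ... synchronize the device stream after each allreduce call ...
  }
  return 0;
}
// Hypothetical invocation: ./trainer --sync_nccl_allreduce=true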
@@ -48,100 +55,104 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
// FIXME(typhoonzero): If scope0 (global scope) has NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor =
local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
"The name of input and output should be equal.");
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (NoDummyInputSize() == 1 &&
local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else
if (NoDummyInputSize() == 1) {
#endif
return; // No need to allreduce when the GPU count is 1.
} else {
// Wait input done
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors;
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor =
local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
"The name of input and output should be equal.");
}
auto &p = places_[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
dtype = platform::ToNCCLDataType(lod_tensor.type());
}
if (dtype == -1) {
dtype = platform::ToNCCLDataType(lod_tensor.type());
}
if (numel == 0) {
numel = static_cast<size_t>(lod_tensor.numel());
}
if (numel == 0) {
numel = static_cast<size_t>(lod_tensor.numel());
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when managing NCCL per thread per device.
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
});
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream));
});
cudaStreamSynchronize(stream);
}
this->RunAndRecordEvent([&] {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
});
}
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(out_var_handles[0]->name_)
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(lod_tensors[0]->type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(out_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(out_var_handles[0]->name_)
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(lod_tensors[0]->type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(out_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
}
}
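
Because the hunk above shows the old and new function bodies interleaved, here is a distilled, self-contained sketch of the control flow the new RunImpl appears to implement (simplified types; sync_stream stands in for cudaStreamSynchronize on each device's NCCL stream):

#include <functional>
#include <vector>

void RunImplSketch(bool is_gpu, bool sync_nccl_allreduce,
                   const std::vector<std::function<void()>> &all_reduce_calls,
                   const std::function<void(int)> &sync_stream, int ndev) {
  if (is_gpu) {
    if (all_reduce_calls.size() == 1) {
      // Do not use NCCLGroup when managing NCCL per thread per device.
      all_reduce_calls[0]();
    } else {
      // platform::NCCLGroupGuard would bracket these calls with
      // ncclGroupStart()/ncclGroupEnd() here.
      for (auto &call : all_reduce_calls) call();
    }
    if (sync_nccl_allreduce) {
      // FLAGS_sync_nccl_allreduce: block on every device's stream.
      for (int dev = 0; dev < ndev; ++dev) sync_stream(dev);
    }
  } else {
    // CPU-only path: reduce all tensors into scope 0 (ReduceLoDTensor),
    // then copy the result back to the other scopes (TensorCopy).
  }
}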

@@ -31,7 +31,11 @@ namespace framework {
namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
// The allreduce op order should be fixed when scheduling them across
// multiple threads or processes, to avoid hangs.
return (!strategy.enable_sequential_execution_ &&
strategy.num_trainers_ > 1) ||
strategy.enable_parallel_graph_;
}
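
A standalone restatement of the extended predicate, with a few illustrative cases (plain parameters replace the BuildStrategy fields):

#include <cassert>

static bool SeqOnlyAllReduceOpsSketch(bool seq_exec, int num_trainers,
                                      bool parallel_graph) {
  return (!seq_exec && num_trainers > 1) || parallel_graph;
}

int main() {
  assert(!SeqOnlyAllReduceOpsSketch(false, 1, false));  // single trainer
  assert(SeqOnlyAllReduceOpsSketch(false, 2, false));   // multi-trainer
  assert(SeqOnlyAllReduceOpsSketch(false, 1, true));    // parallel graph alone
  return 0;
}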
class ParallelExecutorPassBuilder : public ir::PassBuilder {
@@ -86,8 +90,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
multi_devices_pass->Set<int>("num_trainers",
new int(strategy_.num_trainers_));
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
@@ -132,6 +134,7 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
#else
@@ -150,6 +153,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
pass->Erase("nranks");
pass->Set<size_t>("nranks", new size_t(nranks));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");

@@ -110,6 +110,7 @@ struct BuildStrategy {
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const;
@@ -117,6 +118,13 @@
const bool use_cuda) const;
#endif
// If set true, ParallelExecutor would build the main_program into multiple
// graphs, each of which would run on one device. This approach can achieve
// better performance in some scenarios.
mutable bool enable_parallel_graph_ = false;
private:
mutable bool is_finalized_ = false;
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;

@@ -138,7 +138,7 @@ static const char kLossVarName[] = "loss_var_name";
static const char kPlaces[] = "places";
static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers";
static const char kNRanks[] = "nranks";
void MultiDevSSAGraphBuilder::Init() const {
all_vars_.clear();
@@ -174,7 +174,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
auto nodes = graph->ReleaseNodes();
ir::Graph &result = *graph;
int num_trainers = Get<int>(kNumTrainers);
size_t nranks = Get<size_t>(kNRanks);
for (auto &node : nodes) {
if (node->IsVar() && node->Var()) {
@@ -251,7 +251,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size());
}
if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
if (!is_forwarding && nranks > 1UL) {
bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
@@ -649,12 +649,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node, proto::VarType::Type dtype) const {
size_t nranks = Get<size_t>("nranks");
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype);
nranks, local_scopes_[i], places_[i], dev_ctx, dtype);
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only uses device_count as the scale
@@ -887,4 +888,4 @@ REGISTER_PASS(multi_devices_pass,
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes)
.RequirePassAttr(paddle::framework::details::kStrategy)
.RequirePassAttr(paddle::framework::details::kNumTrainers);
.RequirePassAttr(paddle::framework::details::kNRanks);

@@ -0,0 +1,99 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
namespace paddle {
namespace framework {
namespace details {
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs)
: strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)),
graphs_(std::move(graphs)) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
// Set the correct thread pool size for each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
? 1UL
: strategy_.num_threads_ / places_.size();
VLOG(1) << "set num_threads: " << strategy_.num_threads_
<< " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
}
}
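
The per-device thread budget computed by the constructor above, as a self-contained example (the sizes are illustrative):

#include <cstddef>
#include <iostream>

int main() {
  std::size_t num_threads = 8, num_places = 4;
  std::size_t per_device =
      num_threads < num_places ? 1UL : num_threads / num_places;
  std::cout << per_device << std::endl;  // 2: eight threads over four devices
  // With num_threads = 2 and num_places = 4 the budget falls back to 1.
  return 0;
}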
FeedFetchList ParallelSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::vector<std::future<FeedFetchList>> run_futures;
std::vector<FeedFetchList> fetch_data;
FeedFetchList ret;
fetch_data.reserve(places_.size());
ret.reserve(fetch_tensors.size());
exception_holder_.Clear();
for (size_t i = 0; i < places_.size(); ++i) {
auto call = [this, i, &fetch_tensors]() -> FeedFetchList {
try {
return executors_[i]->Run(fetch_tensors);
} catch (...) {
exception_holder_.Catch(std::current_exception());
}
return FeedFetchList();
};
if (pool_) {
run_futures.emplace_back(pool_->enqueue(std::move(call)));
} else {
fetch_data.emplace_back(std::move(call()));
}
}
if (pool_) {
for (auto &f : run_futures) {
if (exception_holder_.IsCaught()) {
f.wait();
} else {
fetch_data.emplace_back(std::move(f.get()));
}
}
}
if (exception_holder_.IsCaught()) {
exception_holder_.ReThrow();
}
for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
std::vector<const LoDTensor *> lodtensor_ptrs;
lodtensor_ptrs.reserve(local_scopes_.size());
for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx));
}
ret.emplace_back();
ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
}
return ret;
}
} // namespace details
} // namespace framework
} // namespace paddle
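
The scatter/gather pattern of Run() can be modeled standalone, with std::async standing in for the ThreadPool and the per-device executors; the exception_holder_ bookkeeping is elided:

#include <future>
#include <iostream>
#include <vector>

int main() {
  const int num_devices = 4;
  std::vector<std::future<int>> futures;
  for (int dev = 0; dev < num_devices; ++dev) {
    // Stand-in for pool_->enqueue([...] { return executors_[dev]->Run(...); })
    futures.emplace_back(
        std::async(std::launch::async, [dev] { return dev * dev; }));
  }
  for (auto &f : futures) {
    std::cout << f.get() << " ";  // gather per-device results in order
  }
  std::cout << std::endl;  // prints: 0 1 4 9
  return 0;
}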

@@ -0,0 +1,51 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
namespace paddle {
namespace framework {
namespace details {
class ParallelSSAGraphExecutor : public SSAGraphExecutor {
public:
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs);
~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; }
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
private:
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector<platform::Place> places_;
std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
ExceptionHolder exception_holder_;
};
} // namespace details
} // namespace framework
} // namespace paddle

@@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
}
std::vector<framework::LoDTensor> fetch_data;
std::exception_ptr eptr;
std::exception_ptr eptr = nullptr;
try {
fetch_data = underlying_executor_->Run(fetch_tensors);
} catch (...) {

@@ -21,12 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
@@ -38,6 +35,8 @@ limitations under the License. */
DEFINE_string(pe_profile_fname, "",
"Profiler filename for PE, which generated by gperftools."
"Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
DEFINE_bool(enable_parallel_graph, false,
"Force disable parallel graph execution mode if set false.");
namespace paddle {
namespace framework {
@@ -106,6 +105,7 @@ class ParallelExecutorPrivate {
bool own_local_scope_;
bool use_cuda_;
bool use_all_reduce_;
size_t nranks_;
// global_ref_cnts_ is only initialized when ParallelExecutor constructs, and
// then keeps unchanged
@@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor(
member_->build_strategy_ = build_strategy;
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = num_trainers * places.size();
if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1,
@@ -224,62 +225,98 @@ ParallelExecutor::ParallelExecutor(
}
}
// FIXME(Yancey1989): parallel graph mode gets better performance
// in GPU allreduce distributed training. Need an elegant way to
// choose the execution strategy.
build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
VLOG(1) << "Enable ParallelGraph Execution: "
<< build_strategy.enable_parallel_graph_;
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
// gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective
// distributed training
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
if (nccl_id_var != nullptr) {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
}
if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) {
if (nccl_id == nullptr) {
local_nccl_id_.reset(new ncclUniqueId());
platform::dynload::ncclGetUniqueId(local_nccl_id_.get());
nccl_id = local_nccl_id_.get();
}
}
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id));
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToDevices(bcast_vars);
}
// Startup Program has been run. All local scopes have correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std::vector<std::unique_ptr<ir::Graph>> graphs;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (build_strategy.enable_parallel_graph_) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, {member_->places_[i]}, loss_var_name,
{member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
} else {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
#else
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->use_cuda_, member_->nccl_ctxs_.get());
#else
std::unique_ptr<ir::Graph> graph =
build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->use_cuda_);
member_->nranks_, member_->use_cuda_);
graphs.push_back(std::move(graph));
#endif
auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) {
graph = member_->PrepareGCAndRefCnts(std::move(graph),
static_cast<size_t>(max_memory_size));
for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts(
std::move(graphs[i]), static_cast<size_t>(max_memory_size));
}
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std::vector<details::VariableInfo> var_infos;
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
var_infos.emplace_back();
var_infos.back().name_ = node->Var()->Name();
var_infos.back().type_ = node->Var()->GetType();
var_infos.back().persistable_ = node->Var()->Persistable();
for (auto &graph : graphs) {
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
var_infos.emplace_back();
var_infos.back().name_ = node->Var()->Name();
var_infos.back().type_ = node->Var()->GetType();
var_infos.back().persistable_ = node->Var()->Persistable();
}
}
}
// If the loss_var_name is given, the number of graphs should be exactly one.
if (loss_var_name.size()) {
size_t graph_num = ir::GraphNum(*graph);
size_t graph_num = ir::GraphNum(*graphs[0]);
if (graph_num > 1) {
LOG(WARNING)
<< "The number of graph should be only one, "
"but the current graph has "
<< ir::GraphNum(*graph)
<< ir::GraphNum(*graphs[0])
<< " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
@@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor(
}
}
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
if (build_strategy.enable_parallel_graph_) {
member_->executor_.reset(new details::ParallelSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
std::move(graphs)));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs[0])));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs[0])));
}
}
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
@@ -423,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
bool ParallelExecutor::EnableParallelGraphExecution(
const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const {
if (!FLAGS_enable_parallel_graph) return false;
bool enable_parallel_graph = true;
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
for (auto &var_desc : main_program.Block(0).AllVars()) {
if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
enable_parallel_graph = false;
}
}
// TODO(Yancey1989): support pserver mode
for (auto &op_desc : main_program.Block(0).AllOps()) {
if (op_desc->Type() == "send" || op_desc->Type() == "recv") {
enable_parallel_graph = false;
break;
}
}
if (!member_->use_all_reduce_ || !member_->use_cuda_)
enable_parallel_graph = false;
if (build_strategy.enable_sequential_execution_ ||
exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
enable_parallel_graph = false;
return enable_parallel_graph;
}
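
A compact restatement of the gating conditions above (plain parameters; in the real check the SELECTED_ROWS and send/recv tests walk block 0 of the program):

#include <cassert>

static bool EnableParallelGraphSketch(bool flag_on, bool has_selected_rows,
                                      bool has_send_recv, bool all_reduce,
                                      bool use_cuda, bool seq_exec,
                                      bool experimental_exec) {
  if (!flag_on) return false;           // FLAGS_enable_parallel_graph
  if (has_selected_rows) return false;  // sparse update not supported yet
  if (has_send_recv) return false;      // pserver mode not supported yet
  if (!all_reduce || !use_cuda) return false;
  return !(seq_exec || experimental_exec);
}

int main() {
  assert(EnableParallelGraphSketch(true, false, false, true, true, false, false));
  assert(!EnableParallelGraphSketch(true, false, true, true, true, false, false));
  return 0;
}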
ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();

@@ -28,6 +28,10 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
@@ -68,8 +72,14 @@ class ParallelExecutor {
private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
bool EnableParallelGraphExecution(const ProgramDesc &main_program,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const;
ParallelExecutorPrivate *member_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::unique_ptr<ncclUniqueId> local_nccl_id_;
#endif
};
} // namespace framework

@@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() {
task = std::move(tasks_.front());
tasks_.pop();
}
// run the task
task();
}

File diff suppressed because it is too large.

@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
#endif
auto input_data_type = ctx.Input<Tensor>("Input")->type();
auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
"input and filter data type should be consistent");
if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
"input and filter data type should be consistent");
}
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() {
"whenever convolution output is as an input to residual "
"connection.")
.SetDefault(false);
AddAttr<float>("Scale_in",
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_out",
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_in_eltwise",
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() {
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddAttr<bool>("force_fp32_output",
"(bool, default false) Only used in mkldnn INT8 kernel")
.SetDefault(false);
// TODO(dzhwinter): need to registered layout transform function
AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. workspace size for cudnn, in MB, "

@@ -29,6 +29,7 @@ namespace operators {
using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2;
constexpr int MaxKeyLength = 256;
// Base convolution operator definitions for other conv-like
// operators to reuse the implementation.

@@ -32,7 +32,7 @@ namespace paddle {
namespace operators {
namespace distributed {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection(
auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
auto* out_tensor =
scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE_GT(
out_tensor->numel(), 0,
"When calling this method, the LoDTensor's numel must larger than zero. "
"Please check LoDTensor::Resize has been called first.");
auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
bool is_on_cpu_place = true;
@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection(
auto row_numel = dims[1];
for (size_t i = 0; i < dims[0]; ++i) {
for (int64_t i = 0; i < dims[0]; ++i) {
auto id = ids_in_this_section[i];
auto origin_id = id + abs_sections[section_idx];
auto& offsets = id_to_offset[origin_id];
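
The section/offset arithmetic used here, as a self-contained example; it assumes abs_sections holds the running offsets of the splits (a prefix sum of height_sections), which is not shown in this hunk:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // A table of height 100 split across two parameter servers.
  std::vector<int> height_sections = {50, 50};
  std::vector<int> abs_sections(height_sections.size(), 0);
  for (std::size_t i = 1; i < height_sections.size(); ++i)
    abs_sections[i] = abs_sections[i - 1] + height_sections[i - 1];
  int section_idx = 1;
  int local_id = 23;  // row inside the second split
  int origin_id = local_id + abs_sections[section_idx];
  assert(origin_id == 73);  // row of the original, unsplit table
  return 0;
}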
@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context) {
auto& local_scope = context.scope().NewScope();
const framework::ExecutionContext& context,
const framework::Scope& scope) {
auto& local_scope = scope.NewScope();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& cpu_ctx = *pool.Get(platform::CPUPlace());
@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name,
out_var_names.push_back(out_name + "@" + epmap[i]);
}
auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>();
auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
std::vector<int64_t> ids_vector;
if (platform::is_cpu_place(id_tensor.place())) {
auto* id_data = id_tensor.data<int64_t>();
for (size_t i = 0; i < id_tensor.numel(); ++i) {
for (int64_t i = 0; i < id_tensor.numel(); ++i) {
ids_vector.push_back(id_data[i]);
}
} else {
@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
PADDLE_THROW("paddle is not compiled with CUDA!");
#else
auto cpu_place = platform::CPUPlace();
framework::Tensor cpu_tensor;
framework::LoDTensor cpu_tensor;
auto* cpu_tensor_data =
cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
auto stream =
@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
out_var_names, height_sections, splited_ids,
context, &local_scope, &actual_ctx);
context.scope().DeleteScope(&local_scope);
scope.DeleteScope(&local_scope);
}
}; // namespace distributed

@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context);
const framework::ExecutionContext& context,
const framework::Scope& scope);
template <typename T>
void prefetch_with_reconstruct(const std::string& id_name,
const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context,
const framework::Scope& scope,
framework::LoDTensor* original) {
prefetch(id_name, out_name, table_names, epmap, height_sections, context,
scope);
auto& out = scope.FindVar(out_name)->Get<framework::LoDTensor>();
auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
auto* original_value = original->data<T>();
auto* out_value = out.data<T>();
size_t original_width = original->numel() / original->dims()[0];
bool is_on_cpu_place = true;
if (!platform::is_cpu_place(ids.place())) {
is_on_cpu_place = false;
}
if (is_on_cpu_place) {
for (int64_t i = 0; i < ids.numel(); i++) {
const T* out_rows = out_value + original_width * i;
T* original_row =
original_value + original_width * ids.data<int64_t>()[i];
std::memcpy(original_row, out_rows, original_width * sizeof(T));
}
} else {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("paddle is not compiled with CUDA!");
#else
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& actual_ctx = *pool.Get(context.GetPlace());
for (int64_t i = 0; i < ids.numel(); i++) {
const T* out_rows = out_value + original_width * i;
T* original_row =
original_value + original_width * ids.data<int64_t>()[i];
auto stream =
static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
platform::CPUPlace(), out_rows, original_width * sizeof(T),
stream);
}
#endif
}
}
}; // namespace distributed
}; // namespace operators

@@ -2,7 +2,9 @@ include(operators)
register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op)
if (WITH_GPU)
op_library(fusion_transpose_flatten_concat_op)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n")
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
op_library(fusion_conv_inception_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n")
endif()
endif()

@@ -21,7 +21,7 @@ DECLARE_uint64(conv_workspace_size_limit);
namespace paddle {
namespace operators {
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7100
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
@@ -264,7 +264,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7100
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion,
ops::CUDNNConvInceptionFusionOpKernel<float>,

@@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
"Output(PreOut) should not be null.");
auto with_prefetch = ctx->Attrs().Get<bool>("remote_prefetch");
if (with_prefetch) {
PADDLE_ENFORCE(ctx->HasOutput("W_Out"),
"Output(W_Out) should not be null.");
}
const int64_t batch_size = ctx->GetInputDim("X")[0];
std::vector<int64_t> output_shape({batch_size, 1});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
@@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label",
"(LoDTensor, required), The labels of training data. It's a"
"tensor with shape [N, 1].");
AddInput("PTable",
AddInput("PathTable",
"(LoDTensor, optional), The Path Table from root to current word"
"it should have shape like [N, L], L is the length of the Path")
.AsDispensable();
@@ -119,8 +124,30 @@
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes.")
.AsIntermediate();
AddOutput(
"W_Out",
"(LoDTensor, optinal) using input 'W' as Output to make it mutable"
"When we are using prefetch")
.AsIntermediate();
AddAttr<AttrType>("num_classes", "(int, optional), The number of classes")
.SetDefault(2);
// for parameter prefetch
AddAttr<bool>("remote_prefetch", "").SetDefault(false);
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<int>>("height_sections",
"Height for each output SelectedRows.")
.SetDefault(std::vector<int>({}));
AddAttr<std::vector<std::string>>(
"epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"table_names",
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping")
.SetDefault({});
AddComment(R"DOC(
The hierarchical sigmoid operator organizes the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
@@ -189,23 +216,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
<< " is set to SelectedRows";
block->Var(w_grad_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to SelectedRows";
block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
}
} else {
VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
block->Var(w_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to LoDTensor";
block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
}
}
if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to LoDTensor";
block->Var(bias_grad_var_name)
->SetType(framework::proto::VarType::LOD_TENSOR);
}
block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
}

@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
@@ -24,6 +26,10 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
@@ -34,8 +40,9 @@ using platform::Transform;
static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) {
std::set<int64_t> rows;
const int64_t* paths = path.data<int64_t>();
for (int64_t i = 0; i < path.numel(); ++i) {
int64_t row = path.data<int64_t>()[i];
int64_t row = paths[i];
if (row < 0) {
continue;
}
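
PathToRows collects the distinct non-negative row indices from a path table; a standalone sketch with plain arrays in place of LoDTensor (the -1 entries are path padding):

#include <cstdint>
#include <set>
#include <vector>

std::vector<int64_t> PathToRowsSketch(const int64_t *paths, int64_t numel) {
  std::set<int64_t> rows;  // dedups and sorts the touched rows
  for (int64_t i = 0; i < numel; ++i) {
    if (paths[i] < 0) continue;  // negative entries pad variable-length paths
    rows.insert(paths[i]);
  }
  return std::vector<int64_t>(rows.begin(), rows.end());
}

int main() {
  const int64_t paths[] = {3, 5, -1, 5, 7, -1};            // two padded paths
  std::vector<int64_t> rows = PathToRowsSketch(paths, 6);  // {3, 5, 7}
  return 0;
}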
@@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
auto* path = ctx.Input<framework::LoDTensor>("PTable");
auto* path = ctx.Input<framework::LoDTensor>("PathTable");
auto* code = ctx.Input<framework::LoDTensor>("PathCode");
auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
auto* bias = ctx.Input<framework::LoDTensor>("Bias");
auto* out = ctx.Output<framework::LoDTensor>("Out");
auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
// for remote prefetch
auto epmap = ctx.Attr<std::vector<std::string>>("epmap");
if (!epmap.empty()) {
// If epmap is not empty, the parameter will be fetched from the remote
// parameter server.
auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
auto table_names = ctx.Attr<std::vector<std::string>>("table_names");
std::vector<int64_t> real_rows = PathToRows(*path);
framework::Scope& local_scope = ctx.scope().NewScope();
auto* ids = local_scope.Var("Ids@Prefetch");
auto* x_tensor = ids->GetMutable<framework::LoDTensor>();
x_tensor->mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(real_rows.size()), 1}),
ctx.GetPlace());
// Copy the selected path rows into the Ids@Prefetch tensor.
std::memcpy(x_tensor->data<int64_t>(), real_rows.data(),
real_rows.size() * sizeof(int64_t));
framework::DDim w_dims = ctx.Input<Tensor>("W")->dims();
w_dims[0] = x_tensor->dims()[0];
auto* w_tensor =
local_scope.Var("W@Prefetch")->GetMutable<framework::LoDTensor>();
w_tensor->Resize(w_dims);
#ifdef PADDLE_WITH_DISTRIBUTE
// W_Out is only used by prefetch; never change it in other cases.
auto* w_out = ctx.Output<framework::LoDTensor>("W_Out");
operators::distributed::prefetch_with_reconstruct<T>(
"Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections,
ctx, local_scope, w_out);
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
}
bool is_custom = false;
if (path) {
is_custom = true;
@@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
auto* path = ctx.Input<framework::LoDTensor>("PTable");
auto* path = ctx.Input<framework::LoDTensor>("PathTable");
auto* code = ctx.Input<framework::LoDTensor>("PathCode");
auto* bias = ctx.Input<framework::LoDTensor>("Bias");
auto* in_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
bool is_sparse = ctx.Attr<bool>("is_sparse");
@@ -173,15 +220,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
}
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
auto* bias_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
if (!is_sparse) {
auto* bias_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
auto* w_grad =
ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
w_grad->mutable_data<T>(ctx.GetPlace());
@@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
auto* bias_grad =
ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
if (bias_grad) {
bias_grad->set_rows(real_rows);
// build ids -> rows index map
bias_grad->SyncIndex();
bias_grad->set_height(bias->dims()[0]);
auto* bias_grad_value = bias_grad->mutable_value();
std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
bias->dims()[1]};
bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
ctx.GetPlace());
zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
bit_code->AddGrad(pre_out_grad, bias_grad);
}
bit_code->MulGradWeight(pre_out_grad, w_grad, in);
}
bit_code->MulGradError(pre_out_grad, w, in_grad);

@@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
// server
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, table_names, epmap,
height_sections, context);
height_sections, context,
context.scope());
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "

@@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
// server
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, table_names, epmap,
height_sections, context);
height_sections, context,
context.scope());
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "

@@ -84,41 +84,6 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
code_table_.apply_visitor(func);
}
template <typename T>
struct MatrixBitCodeFunctorSelectedRowsAddGrad
: public boost::static_visitor<void> {
const framework::Tensor &tmat_;
framework::SelectedRows *vec_;
MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat,
framework::SelectedRows *vec)
: tmat_(tmat), vec_(vec) {}
template <typename CodeTable>
void operator()(const CodeTable &code_table) {
size_t batch_size = tmat_.dims()[0];
size_t width = tmat_.dims()[1];
auto *vec_data = vec_->mutable_value()->template data<T>();
auto *tmat_data = tmat_.data<T>();
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table.get_code(i);
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
int64_t row_index = vec_->GetIndexFromId(static_cast<int64_t>(index));
vec_data[row_index] += tmat_data[i * width + j];
}
}
}
};
template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
framework::SelectedRows *vec) {
MatrixBitCodeFunctorSelectedRowsAddGrad<T> func(tmat, vec);
code_table_.apply_visitor(func);
}
template <typename T>
struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
const framework::Tensor &tmat_;

@@ -124,11 +124,12 @@ class SimpleCode {
template <typename T>
class CustomCode {
public:
CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
const int64_t* ids, int index) {
seq_len_ = ptable.dims()[1];
ptable_data_ = ptable.data<T>() + seq_len_ * index;
pcode_data_ = pcode.data<T>() + seq_len_ * index;
CustomCode(const framework::Tensor& path_table,
const framework::Tensor& path_code, const int64_t* ids,
int index) {
seq_len_ = path_table.dims()[1];
path_table_data_ = path_table.data<T>() + seq_len_ * index;
path_code_data_ = path_code.data<T>() + seq_len_ * index;
}
/**
* Here the id of root should be 1 rather than 0, thus the encoding of class c
@@ -139,25 +140,25 @@
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
size_t calc_index(int bit) const { return ptable_data_[bit]; }
bool calc_bit(int bit) const { return pcode_data_[bit]; }
size_t calc_index(int bit) const { return path_table_data_[bit]; }
bool calc_bit(int bit) const { return path_code_data_[bit]; }
// NOTE: this function is not thread-safe.
int get_length() const {
if (length_ < 0) {
auto len = seq_len_;
length_ =
static_cast<int>(std::find_if(ptable_data_, ptable_data_ + len,
[](const T& val) { return val < 0; }) -
ptable_data_);
length_ = static_cast<int>(
std::find_if(path_table_data_, path_table_data_ + len,
[](const T& val) { return val < 0; }) -
path_table_data_);
}
return length_;
}
private:
int64_t seq_len_;
const T* ptable_data_;
const T* pcode_data_;
const T* path_table_data_;
const T* path_code_data_;
mutable int length_{-1};
};
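
A worked example of get_length() above: a path row is padded with negative entries, and the cached length is the offset of the first one:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // seq_len_ == 5, but the real path length is 3.
  const int64_t row[] = {9, 4, 2, -1, -1};
  const int64_t *first_neg =
      std::find_if(row, row + 5, [](const int64_t &v) { return v < 0; });
  assert(static_cast<int>(first_neg - row) == 3);
  return 0;
}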
@@ -181,9 +182,9 @@ class SimpleCodeTable {
template <typename T>
class CustomCodeTable {
public:
CustomCodeTable(const framework::Tensor& ptable,
const framework::Tensor& pcode, const int64_t* ids)
: ptable_(ptable), pcode_(pcode), ids_(ids) {}
CustomCodeTable(const framework::Tensor& path_table,
const framework::Tensor& path_code, const int64_t* ids)
: ptable_(path_table), pcode_(path_code), ids_(ids) {}
CustomCode<T> get_code(int64_t code) const {
return CustomCode<T>(ptable_, pcode_, ids_, code);
@@ -210,11 +211,11 @@ class MatrixBitCodeFunctor {
ids_(ids),
code_table_(SimpleCodeTable(num_classes, ids)) {}
MatrixBitCodeFunctor(const framework::Tensor& ptable,
const framework::Tensor& pcode, const int64_t* ids)
: num_classes_(static_cast<size_t>(ptable.dims()[1])),
MatrixBitCodeFunctor(const framework::Tensor& path_table,
const framework::Tensor& path_code, const int64_t* ids)
: num_classes_(static_cast<size_t>(path_table.dims()[1])),
ids_(ids),
code_table_(CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
code_table_(CustomCodeTable<int64_t>(path_table, path_code, ids)) {}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
@@ -225,11 +226,6 @@ class MatrixBitCodeFunctor {
*/
void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
/* For selected rows For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec);
/* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/

Some files were not shown because too many files have changed in this diff.
