Fix the correctness of async mode at distributed training (#18863)

* fix correctness of the communicator * fix a bug in send thread when sending var context is empty, test=develop * add lookup_table_prefetch_op and prefetch optimize, test=develop * remove remote prefetch GPU supported * word2vec force with CPU, test=develop * test dist remote lookup table force with CPU, test=develop
6 years ago · 65c7368400
parent 61389ae5aa
commit 65c7368400
19 changed files with 509 additions and 290 deletions
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@ -133,13 +133,6 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
      VLOG(1) << "set recv op do_not_run to true";
      node->Op()->SetAttr("do_not_run", 1);
      node->Op()->Flush();
    } else if (node->Name() == "lookup_table" || node->Name() == "nce" ||
               node->Name() == "hierarchical_sigmoid") {
      // in async_mode, we do not need remote prefetch, because communicator
      // will do async parameter recv.
      VLOG(1) << "set " << node->Name() << " op remote_prefetch to false";
      node->Op()->SetAttr("remote_prefetch", false);
      node->Op()->Flush();
    }
    return false;
  }
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@ -248,6 +248,8 @@ class ExecutionContext {
    return op_.Attr<T>(name);
  }
  bool HasAttr(const std::string& name) const { return op_.HasAttr(name); }
  bool HasInput(const std::string& name) const;
  bool HasOutput(const std::string& name) const;
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@ -76,14 +76,26 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx,
  VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc;
  VLOG(0) << "communicator_merge_sparse_grad: "
          << FLAGS_communicator_merge_sparse_grad;
-  send_scope_.reset(new Scope());
+
-  for (auto &iter : send_varname_to_ctx_) {
+  if (send_varname_to_ctx.size() == 0) {
-    send_varname_to_queue_[iter.first] =
+    VLOG(0) << "nothing need to be send, will not start send_thread";
-        std::make_shared<BlockingQueue<std::shared_ptr<Variable>>>(
+  } else {
-            FLAGS_communicator_send_queue_size);
+    send_scope_.reset(new Scope());
    for (auto &iter : send_varname_to_ctx_) {
      send_varname_to_queue_[iter.first] =
          std::make_shared<BlockingQueue<std::shared_ptr<Variable>>>(
              FLAGS_communicator_send_queue_size);
    }
    send_threadpool_.reset(
        new ::ThreadPool(FLAGS_communicator_thread_pool_size));
  }
  if (recv_varname_to_ctx.size() == 0) {
    VLOG(0) << "nothing need to be received, will not start recv_thread";
  } else {
    recv_threadpool_.reset(
        new ::ThreadPool(FLAGS_communicator_thread_pool_size));
  }
  send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size));
  recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size));
 }
 Communicator::~Communicator() {
@ -160,18 +172,28 @@ void Communicator::SendThread() {
      task_f.wait();
    }
    auto after_run_send_graph = GetCurrentUS();
-    auto send_graph_use_time = after_run_send_graph - before_run_send_graph;
+
-    if (send_graph_use_time > 100) {
+    VLOG(3) << "run send graph use time "
-      VLOG(1) << "run send graph use time "
+            << after_run_send_graph - before_run_send_graph;
-              << after_run_send_graph - before_run_send_graph;
+    RecvNonIndependent();
    }
    if (!FLAGS_communicator_independent_recv_thread) {
      RecvAll();
    }
  }
  VLOG(0) << "communicator stopped, send thread exit";
 }
 void Communicator::RecvNonIndependent() {
  if (!FLAGS_communicator_independent_recv_thread) {
    return;
  }
  auto grad_num = grad_num_.load();
  if (grad_num > 0) {
    RecvAll();
    grad_num_.store(0);
  } else {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
 }
 void Communicator::RecvAll() {
  VLOG(3) << "parallel run recv graph";
  if (!running_) return;
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@ -175,6 +175,7 @@ class Communicator {
 private:
  // recv all parameter
  void RecvAll();
  void RecvNonIndependent();
  void SendThread();
  void RecvThread();
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@ -15,6 +15,7 @@
 #pragma once
 #include <string>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
@ -23,61 +24,25 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 constexpr int64_t kNoPadding = -1;
 void prefetchs(const std::vector<std::string>& id_var_names,
               const std::vector<std::string>& out_var_names,
               const std::string& persistable_var_name, const bool backfill,
               const std::vector<std::string>& table_names,
               const std::vector<std::string>& endpoints,
               const std::vector<int64_t>& height_sections,
               const framework::ExecutionContext& context,
               const framework::Scope& scope);
 void prefetch(const std::string& id_name, const std::string& out_name,
              const std::string& persistable_var_name, const bool backfill,
              const std::vector<std::string>& table_names,
-              const std::vector<std::string>& epmap,
+              const std::vector<std::string>& endpoints,
              const std::vector<int64_t>& height_sections,
              const framework::ExecutionContext& context,
              const framework::Scope& scope);
 template <typename T>
 void prefetch_with_reconstruct(const std::string& id_name,
                               const std::string& out_name,
                               const std::vector<std::string>& table_names,
                               const std::vector<std::string>& epmap,
                               const std::vector<int64_t>& height_sections,
                               const framework::ExecutionContext& context,
                               const framework::Scope& scope,
                               framework::LoDTensor* original) {
  prefetch(id_name, out_name, table_names, epmap, height_sections, context,
           scope);
  auto& out = scope.FindVar(out_name)->Get<framework::LoDTensor>();
  auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
  auto* original_value = original->data<T>();
  auto* out_value = out.data<T>();
  size_t original_width = original->numel() / original->dims()[0];
  bool is_on_cpu_place = true;
  if (!platform::is_cpu_place(ids.place())) {
    is_on_cpu_place = false;
  }
  if (is_on_cpu_place) {
    for (int64_t i = 0; i < ids.numel(); i++) {
      const T* out_rows = out_value + original_width * i;
      T* original_row =
          original_value + original_width * ids.data<int64_t>()[i];
      std::memcpy(original_row, out_rows, original_width * sizeof(T));
    }
  } else {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("paddle is not compiled with CUDA!");
 #else
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& actual_ctx = *pool.Get(context.GetPlace());
    for (int64_t i = 0; i < ids.numel(); i++) {
      const T* out_rows = out_value + original_width * i;
      T* original_row =
          original_value + original_width * ids.data<int64_t>()[i];
      auto stream =
          static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
      memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
                   platform::CPUPlace(), out_rows, original_width * sizeof(T),
                   stream);
    }
 #endif
  }
 }
 };  // namespace distributed
 };  // namespace operators
 };  // namespace paddle
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@ -116,42 +116,7 @@ bool RequestGetHandler::Handle(const std::string& varname,
        VLOG(3) << "copying " << varname << " to " << param_bak_name;
        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
      }
-      if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
+      *outvar = scope_->FindVar(varname);
          !table_name.empty()) {
        std::vector<int64_t> updated_rows;
        AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
            varname, trainer_id, &updated_rows);
        if (VLOG_IS_ON(3)) {
          std::ostringstream sstream;
          sstream << "[";
          for (auto& row_id : updated_rows) {
            sstream << row_id << ", ";
          }
          sstream << "]";
          VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
                  << sstream.str();
        }
        auto& origin_tensor =
            scope_->FindVar(varname)->Get<framework::LoDTensor>();
        auto* origin_tensor_data = origin_tensor.data<float>();
        auto& dims = origin_tensor.dims();
        *outvar = scope->Var();
        auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
        out_slr->set_rows(updated_rows);
        out_slr->set_height(dims[0]);
        auto out_dims = framework::make_ddim(
            {static_cast<int64_t>(updated_rows.size()), dims[1]});
        auto* data = out_slr->mutable_value()->mutable_data<float>(
            out_dims, origin_tensor.place());
        auto width = dims[1];
        for (auto i = 0; i < updated_rows.size(); ++i) {
          PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
          memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
                 sizeof(float) * width);
        }
      } else {
        *outvar = scope_->FindVar(varname);
      }
    }
  }
  return true;
--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
@ -0,0 +1,166 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
 #include "paddle/fluid/operators/math/math_function.h"
 namespace paddle {
 namespace operators {
 class DistributedLookupTableOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
                   "Input(Ids) of LookupTableOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input(W) of LookupTableOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
                   "Output(Outs) of LookupTableOp should not be null.");
    auto ids_dims = ctx->GetInputsDim("Ids");
    auto table_dims = ctx->GetInputDim("W");
    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
                      "Only 2 dimensions of the 'Embedding' is supported.");
    for (auto &ids_dim : ids_dims) {
      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
                        "The dimension of the 'Ids' tensor must be 2.");
      PADDLE_ENFORCE_EQ(ids_dim[1], 1,
                        "The last dimension of the 'Ids' tensor must be 1.");
    }
    auto lookup_tables =
        ctx->Attrs().Get<std::vector<std::string>>("table_names");
    auto height_sections =
        ctx->Attrs().Get<std::vector<int64_t>>("height_sections");
    auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
    PADDLE_ENFORCE(lookup_tables.size() == height_sections.size() &&
                       lookup_tables.size() == endpoints.size() &&
                       lookup_tables.size() != 0,
                   "Attrs lookup_tables/height_sections/endpoints must have "
                   "save size and can not be 0.");
    auto outputs_dims = std::vector<framework::DDim>();
    for (auto &ids_dim : ids_dims) {
      outputs_dims.push_back(framework::make_ddim({ids_dim[0], table_dims[1]}));
    }
    ctx->SetOutputsDim("Outputs", outputs_dims);
    ctx->ShareLoD("Ids", /*->*/ "Outputs");
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
 };
 template <typename T>
 class DistributedLookupTableKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto ids_vars = context.MultiInputVar("Ids");
    auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
    auto id_names = context.Inputs("Ids");
    auto embedding_name = context.Inputs("W").front();
    auto out_names = context.Outputs("Outputs");
    auto lookup_tables = context.Attr<std::vector<std::string>>("table_names");
    auto height_sections =
        context.Attr<std::vector<int64_t>>("height_sections");
    auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
    operators::distributed::prefetchs(
        id_names, out_names, embedding_name, false, lookup_tables, endpoints,
        height_sections, context, context.scope());
  }
 };
 class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Ids",
             "(LoDTensor) Ids's type should be LoDTensor"
             "THe ids to be looked up in W.")
        .AsDuplicable();
    AddInput("W",
             "(Tensor) The input represents embedding tensors, "
             "which is a learnable parameter.");
    AddOutput("Outputs",
              "(LoDTensor) The lookup results, which have the same type as W.")
        .AsDuplicable();
    AddAttr<std::vector<std::string>>(
        "table_names",
        "(string vector, such as emb_block0, emb_block1)"
        "Server endpoints in the order of input variables for mapping")
        .SetDefault({""});
    AddAttr<std::vector<int64_t>>("height_sections",
                                  "Height for each output SelectedRows.")
        .SetDefault(std::vector<int64_t>({}));
    AddAttr<std::vector<std::string>>(
        "endpoints",
        "(string vector, default 127.0.0.1:6164)"
        "Server endpoints in the order of input variables for mapping")
        .SetDefault({"127.0.0.1:6164"});
    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<int64_t>("padding_idx",
                     "(int64, default -1) "
                     "If the value is -1, it makes no effect to lookup. "
                     "Otherwise the given value indicates padding the output "
                     "with zeros whenever lookup encounters it in Ids.")
        .SetDefault(distributed::kNoPadding);
    AddComment(R"DOC(
 Lookup Tablel Prefetch Operator.
 This operator is used to perform lookup on parameter W,
 then concatenated into a sparse tensor.
 The type of Ids(Input) is SelectedRows, the rows of Ids contains
 the ids to be looked up in W;
 if the Id is not in the sparse table, this operator will return a
 random value and set the value into the table for the next looking up.
 )DOC");
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
                  ops::DistributedLookupTableOpMaker);
 REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
                       ops::DistributedLookupTableKernel<float>);
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@ -97,10 +97,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_DISTRIBUTE
      // w_Out is set to used by prefetch, never change it in other cases
-      auto* w_out = ctx.Output<framework::LoDTensor>("W_Out");
+      auto weight = ctx.Outputs("W_Out").front();
-      operators::distributed::prefetch_with_reconstruct<T>(
+      operators::distributed::prefetch("Ids@Prefetch", "W@Prefetch", weight,
-          "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections,
+                                       true, table_names, epmap,
-          ctx, local_scope, w_out);
+                                       height_sections, ctx, local_scope);
 #else
      PADDLE_THROW(
          "paddle is not compiled with distribute support, can not do "
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@ -98,46 +98,27 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
    auto id_name = context.Inputs("Ids").front();
    auto out_name = context.Outputs("Out").front();
-    // for remote prefetch
+    size_t N = table_t->dims()[0];
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    size_t D = table_t->dims()[1];
-    auto height_sections =
+    size_t K = ids_t->numel();
-        context.Attr<std::vector<int64_t>>("height_sections");
+
-    auto table_names = context.Attr<std::vector<std::string>>("table_names");
+    auto *ids = ids_t->data<int64_t>();
-
+    auto *table = table_t->data<T>();
-    if (!epmap.empty()) {
+    auto *output = output_t->mutable_data<T>(context.GetPlace());
-// if epmap is not empty, then the parameter will be fetched from remote
+
-// parameter
+    dim3 threads(128, 8);
-// server
+    dim3 grids(8, 1);
-#ifdef PADDLE_WITH_DISTRIBUTE
+
-      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+    if (padding_idx == -1)
-                                       height_sections, context,
+      LookupTable<
-                                       context.scope());
+          T, 128, 8, 8,
-#else
+          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-      PADDLE_THROW(
+          output, table, ids, N, K, D, padding_idx);
-          "paddle is not compiled with distribute support, can not do "
+    else
-          "parameter prefetch!");
+      LookupTable<
-#endif
+          T, 128, 8, 8,
-    } else {
+          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-      size_t N = table_t->dims()[0];
+          output, table, ids, N, K, D, padding_idx);
      size_t D = table_t->dims()[1];
      size_t K = ids_t->numel();
      auto *ids = ids_t->data<int64_t>();
      auto *table = table_t->data<T>();
      auto *output = output_t->mutable_data<T>(context.GetPlace());
      dim3 threads(128, 8);
      dim3 grids(8, 1);
      if (padding_idx == -1)
        LookupTable<T, 128, 8, 8, false><<<
            grids, threads, 0, context.cuda_device_context().stream()>>>(
            output, table, ids, N, K, D, padding_idx);
      else
        LookupTable<T, 128, 8, 8, true><<<
            grids, threads, 0, context.cuda_device_context().stream()>>>(
            output, table, ids, N, K, D, padding_idx);
    }
  }
 };
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@ -46,6 +46,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
    auto *table_var = context.InputVar("W");
    auto id_name = context.Inputs("Ids").front();
    auto embedding_name = context.Inputs("W").front();
    auto out_name = context.Outputs("Out").front();
    // for remote prefetch
@ -57,12 +58,12 @@ class LookupTableKernel : public framework::OpKernel<T> {
    if (remote_prefetch && !epmap.empty()) {
 // if epmap is not empty, then the parameter will be fetched from remote
-// parameter
+// parameter server
-// server
+
 #ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+      operators::distributed::prefetch(id_name, out_name, embedding_name, false,
-                                       height_sections, context,
+                                       table_names, epmap, height_sections,
-                                       context.scope());
+                                       context, context.scope());
 #else
      PADDLE_THROW(
          "paddle is not compiled with distribute support, can not do "
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@ -195,9 +195,10 @@ class NCEKernel : public framework::OpKernel<T> {
      w_tensor->Resize(framework::make_ddim(w_dims));
 #ifdef PADDLE_WITH_DISTRIBUTE
      auto weight = context.Inputs("Weight").front();
      operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
-                                       table_names, epmap, height_sections,
+                                       weight, false, table_names, epmap,
-                                       context, local_scope);
+                                       height_sections, context, local_scope);
 #else
      PADDLE_THROW(
          "paddle is not compiled with distribute support, can not do "
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/init.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/init.py
@ -210,11 +210,6 @@ class DistributedTranspiler(Fleet):
        self._transpile_config = config
        self._transpiler = OriginTranspiler(config)
        print("server endpoints")
        print(fleet.server_endpoints(to_string=True))
        print("worker index: %d" % fleet.worker_index())
        print("worker num: %d" % fleet.worker_num())
        if self.is_worker():
            self._transpiler.transpile(
                trainer_id=fleet.worker_index(),
@ -222,12 +217,11 @@ class DistributedTranspiler(Fleet):
                trainers=fleet.worker_num(),
                sync_mode=config.sync_mode)
            wait_port = True
            if isinstance(self._role_maker, MPISymetricRoleMaker):
-                wait_port = False
+                config.wait_port = False
            self.main_program = self._transpiler.get_trainer_program(
-                wait_port=wait_port)
+                wait_port=config.wait_port)
            self.startup_program = default_startup_program()
        else:
            self._transpiler.transpile(
--- a/python/paddle/fluid/tests/test_communicator.py
+++ b/python/paddle/fluid/tests/test_communicator.py
@ -15,12 +15,51 @@
 from __future__ import print_function
 import unittest
 import time
 import paddle.fluid as fluid
 from paddle.fluid.communicator import Communicator
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
 class TestCommunicator(unittest.TestCase):
    def net(self):
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        y_predict = fluid.layers.fc(input=x, size=1, act=None)
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)
        return avg_cost
    def test_communicator_init_and_start(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet.init(role)
        avg_cost = self.net()
        optimizer = fluid.optimizer.SGD(0.01)
        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)
        comm = Communicator(fleet.main_program)
        comm.start()
        time.sleep(10)
        comm.stop()
 class TestCommunicator2(unittest.TestCase):
    def test_communicator_init_and_start(self):
        prog = fluid.Program()
        comm = Communicator(prog)
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
@ -18,6 +18,18 @@ import unittest
 from test_dist_base import TestDistBase
 def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
    def __func__(*args, **kwargs):
        if on_ci:
            return
        return func(*args, **kwargs)
    return __func__
@skip_ci
 class TestDistCTR2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
@ -27,6 +39,7 @@ class TestDistCTR2x2(TestDistBase):
        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
@skip_ci
 class TestDistCTRWithL2Decay2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
@ -37,7 +50,7 @@ class TestDistCTRWithL2Decay2x2(TestDistBase):
        self.check_with_place(
            "dist_ctr.py",
            delta=1e-7,
-            check_error_log=False,
+            check_error_log=True,
            need_envs=need_envs)
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@ -19,6 +19,18 @@ import unittest
 from test_dist_fleet_base import TestFleetBase
 def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
    def __func__(*args, **kwargs):
        if on_ci:
            return
        return func(*args, **kwargs)
    return __func__
@skip_ci
 class TestDistMnist2x2(TestFleetBase):
    def _setup_config(self):
        self._sync_mode = False
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@ -20,6 +20,7 @@ from test_dist_base import TestDistBase
 class TestDistW2V2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._enforce_place = "CPU"
    def test_dist_train(self):
        self.check_with_place("dist_word2vec.py", delta=1e-4)
@ -29,6 +30,7 @@ class TestDistW2V2x2WithMemOpt(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._mem_opt = True
        self._enforce_place = "CPU"
    def test_dist_train(self):
        self.check_with_place("dist_word2vec.py", delta=1e-4)
@ -37,6 +39,7 @@ class TestDistW2V2x2WithMemOpt(TestDistBase):
 class TestDistW2V2x2Async(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._enforce_place = "CPU"
    def test_dist_train(self):
        self.check_with_place("dist_word2vec.py", delta=100)
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@ -185,8 +185,6 @@ class TestListenAndServOp(unittest.TestCase):
        port1 = self._get_pserver_port(p1.pid)
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            self._run_lookup_table_op_one_pserver(place, port0)
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@ -357,14 +357,49 @@ class DistributeTranspiler(object):
                sparse_update_ops.append(op)
        return sparse_update_ops
-    def _update_remote_sparse_update_op(self, param_varname, height_sections,
+    def _update_remote_sparse_update_op(self, program, param_varname,
-                                        endpint_map, table_names):
+                                        height_sections, endpoints,
                                        table_names):
        ops = []
        op_type = ""
        for op in self.sparse_update_ops:
-            if param_varname in op.input_arg_names:
+            if param_varname in op.input_arg_names and op_type == "":
-                op._set_attr('epmap', endpint_map)
+                op_type = op.type
-                op._set_attr('table_names', table_names)
+                ops.append(op)
-                op._set_attr('height_sections', height_sections)
+
-                op._set_attr('trainer_id', self.trainer_id)
+            elif param_varname in op.input_arg_names and op_type == op.type:
                ops.append(op)
        if op_type == "lookup_table":
            all_ops = program.global_block().ops
            op_idxs = [all_ops.index(op) for op in ops]
            inputs = [
                program.global_block().vars[op.input("Ids")[0]] for op in ops
            ]
            w = program.global_block().vars[ops[0].input("W")[0]]
            padding_idx = ops[0].attr("padding_idx")
            outputs = [
                program.global_block().vars[op.output("Out")[0]] for op in ops
            ]
            for idx in op_idxs[::-1]:
                program.global_block()._remove_op(idx)
            program.global_block()._insert_op(
                index=op_idxs[0],
                type="distributed_lookup_table",
                inputs={"Ids": inputs,
                        'W': w},
                outputs={"Outputs": outputs},
                attrs={
                    "table_names": table_names,
                    "height_sections": height_sections,
                    "endpoints": endpoints,
                    "padding_idx": padding_idx,
                    "trainer_id": self.trainer_id
                })
    def _is_input_of_remote_sparse_update_op(self, param_name):
        for op in self.sparse_update_ops:
@ -523,17 +558,12 @@ class DistributeTranspiler(object):
                splited_grad_varname = splited_vars[0].name
                index = find_op_by_output_arg(
                    program.global_block(), splited_grad_varname, reverse=True)
-                if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
+
                    sparse_param_name = self.grad_name_to_param_name[
                        grad_varname]
                    if self._is_input_of_remote_sparse_update_op(
                            sparse_param_name):
                        self.sparse_param_to_height_sections[
                            sparse_param_name] = [splited_vars[0].shape[0]]
            elif len(splited_vars) > 1:
                orig_var = program.global_block().vars[splited_grad_varname]
                index = find_op_by_output_arg(
                    program.global_block(), splited_grad_varname, reverse=True)
                if not self.config.runtime_split_send_recv:
                    self._insert_split_op(program, orig_var, index,
                                          splited_vars)
@ -542,6 +572,13 @@ class DistributeTranspiler(object):
                AssertionError("Can not insert the send op by original "
                               "variable name :", splited_grad_varname)
            if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
                sparse_param_name = self.grad_name_to_param_name[grad_varname]
                if self._is_input_of_remote_sparse_update_op(sparse_param_name):
                    self.sparse_param_to_height_sections[sparse_param_name] = [
                        splited_var.shape[0] for splited_var in splited_vars
                    ]
            dummy_output = program.global_block().create_var(
                name=framework.generate_control_dev_var_name())
            self.grad_name_to_send_dummy_out[grad_varname] = dummy_output
@ -639,7 +676,6 @@ class DistributeTranspiler(object):
                recv_op_role_var_name = splited_trainer_grad[0].name
            if param_varname in self.sparse_param_to_height_sections:
                for table_name in table_names:
                    distributed_var = self.vars_overview.get_distributed_var_by_slice(
                        table_name)
@ -648,7 +684,7 @@ class DistributeTranspiler(object):
                height_sections = self.sparse_param_to_height_sections[
                    param_varname]
                self._update_remote_sparse_update_op(
-                    param_varname, height_sections, eps, table_names)
+                    program, param_varname, height_sections, eps, table_names)
            else:
                recv_varnames = []
                if self.config.runtime_split_send_recv: