feature/DC asgd (#12722)

* wip * add ref_by_trainer_id op * ready to test * fix ref inputs * refine rpc_op_handle * fix merge bug
7 years ago · 306236c2c0
parent c3cbf0b8ef
commit 306236c2c0
32 changed files with 469 additions and 58 deletions
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@ -29,22 +29,19 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
      place_(place) {}

 void RPCOpHandle::RunImpl() {
-  // TODO(wuyi): need further analysis whether wait VarDummyHandle.
-  // Wait input done
  for (auto *in : inputs_) {
    auto &p = static_cast<VarHandle *>(in)->place_;
-    // FIXME(Yancey1989): need a better solution instead of use DebugString()
-    if (ir::IsControlDepVar(*in->Node())) {  // HACK
+    if (ir::IsControlDepVar(*in->Node())) {
      continue;
    }
    if (in->GeneratedOp()) {
      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
    }
  }
-  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead
-  // lock.
-  op_->Run(*tmp_scope, place_);
+  this->RunAndRecordEvent([this] {
+    op_->Run(*local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(),
+             place_);
+  });
 }

 std::string RPCOpHandle::Name() const { return name_; }
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -85,8 +85,10 @@ Executor::Executor(const platform::Place& place) : place_(place) {}

 void Executor::Close() {
 #ifdef PADDLE_WITH_DISTRIBUTE
+  // TODO(typhoonzero): complete message will need to use real trainer_id,
+  // except 0.
  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::distributed::GRPCClient>()
+      ::paddle::operators::distributed::GRPCClient>(0)
      ->SendComplete();
 #endif
 }
--- a/paddle/fluid/operators/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
@ -38,9 +38,10 @@ class CheckpointNotifyOp : public framework::OperatorBase {
    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
    std::string dir = Attr<std::string>("dir");
    std::string lookup_table_name = Attr<std::string>("lookup_table");
+    int trainer_id = Attr<int>("trainer_id");

    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
    for (size_t i = 0; i < epmap.size(); i++) {
      auto lookup_table_save_dir =
          string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
@ -63,6 +64,7 @@ class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker {
        "dir", "(string, default '') indicate the folder checkpoint will use");
    AddAttr<std::string>("lookup_table",
                         "(string, default '') the lookup table name");
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddComment(R"DOC(
 CheckpointNotify operator

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@ -79,7 +79,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
    auto* var = p_scope->FindVar(var_name_val);

    ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);

    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";

@ -105,7 +105,10 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
 void ProcGetResponse(const VarHandle& var_h,
                     const ::grpc::ByteBuffer& ret_msg) {
  framework::Variable* outvar = nullptr;
-  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);
+  // get response's trainer_id is not used
+  int trainer_id;
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar,
+                            &trainer_id);
 }

 template <typename T>
@ -135,6 +138,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
    // prepare input
    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
+    req.set_trainer_id(trainer_id_);
    ::grpc::ByteBuffer buf;
    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);

--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@ -34,8 +34,8 @@ namespace distributed {

 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_name) {
+                           ::grpc::ByteBuffer* msg, const std::string& out_name,
+                           const int trainer_id) {
  platform::RecordRPCEvent record_event("serial", &ctx);
  // Default DestroyCallback does nothing, When using GPU
  // the CPU buffer need to be freed.
@ -45,6 +45,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
  size_t payload_size;

  request.set_varname(name);
+  request.set_trainer_id(trainer_id);
  // Note: normally the profiler is enabled in 1 trainer, hence only
  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
  // servers the trainer's profiling state so that PS can follow the
@ -147,11 +148,12 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,
                               const framework::Scope* scope,
-                               framework::Variable** var) {
+                               framework::Variable** var, int* trainer_id) {
  platform::RecordRPCEvent record_event("deserial", &ctx);
  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
  *var = resp.GetVar();
+  *trainer_id = resp.GetTrainerId();
 }

 }  // namespace distributed
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@ -38,12 +38,13 @@ typedef void (*DestroyCallback)(void*);
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string());
+                           const std::string& out_varname = std::string(),
+                           const int trainer_id = 0);

 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,
                               const framework::Scope* scope,
-                               framework::Variable** var);
+                               framework::Variable** var, int* trainer_id);

 }  // namespace distributed
 }  // namespace operators
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@ -102,9 +102,10 @@ class RequestSend final : public RequestBase {

    auto scope = request_->GetMutableLocalScope();
    auto invar = request_->GetVar();
+    int trainer_id = request_->GetTrainerId();
    framework::Variable* outvar = nullptr;

-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);
    Finish(reply_, &responder_);
  }

@ -133,13 +134,14 @@ class RequestGet final : public RequestBase {
  void Process() override {
    // proc request.
    std::string varname = request_.varname();
+    int trainer_id = request_.trainer_id();
    VLOG(4) << "RequestGet " << varname;

    auto scope = request_handler_->scope();
    auto invar = scope->FindVar(varname);
    framework::Variable* outvar = nullptr;

-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);

    if (outvar) {
      SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
@ -179,6 +181,7 @@ class RequestPrefetch final : public RequestBase {
    // prefetch process...
    std::string in_var_name = request_->Varname();
    std::string out_var_name = request_->OutVarname();
+    int trainer_id = request_->GetTrainerId();
    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
            << " out_var_name: " << out_var_name;

@ -187,7 +190,8 @@ class RequestPrefetch final : public RequestBase {
    // out var must be created in local scope!
    framework::Variable* outvar = scope->Var(out_var_name);

-    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
+                             out_var_name);

    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                          &reply_);
@ -225,12 +229,13 @@ class RequestCheckpointNotify final : public RequestBase {

    std::string checkpoint_notify = request_->Varname();
    std::string checkpoint_dir = request_->OutVarname();
+    int trainer_id = request_->GetTrainerId();

    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
            << ", dir: " << checkpoint_dir;

    request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
-                             checkpoint_dir);
+                             trainer_id, checkpoint_dir);
    Finish(reply_, &responder_);
  }

--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
@ -293,6 +293,14 @@ int GRPCVariableResponse::Parse(Source* source) {
        }
        break;
      }
+      case sendrecv::VariableMessage::kTrainerIdFieldNumber: {
+        uint64_t trainer_id = 0;
+        if (!input.ReadVarint64(&trainer_id)) {
+          return tag;
+        }
+        meta_.set_trainer_id(trainer_id);
+        break;
+      }
      default: {
        // Unknown tag, return unknown error.
        return -1;
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@ -190,6 +190,7 @@ class RequestHandler {
  //    }
  virtual bool Handle(const std::string& varname, framework::Scope* scope,
                      framework::Variable* var, framework::Variable** outvar,
+                      const int trainer_id,
                      const std::string& out_var_name = "") = 0;

 protected:
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@ -36,6 +36,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
                                framework::Variable** outvar,
+                                const int trainer_id,
                                const std::string& out_var_name) {
  VLOG(4) << "RequestSendHandler:" << varname;

@ -76,6 +77,7 @@ bool RequestGetHandler::Handle(const std::string& varname,
                               framework::Scope* scope,
                               framework::Variable* invar,
                               framework::Variable** outvar,
+                               const int trainer_id,
                               const std::string& out_var_name) {
  VLOG(4) << "RequestGetHandler:" << varname;
  if (sync_mode_) {
@ -88,6 +90,19 @@ bool RequestGetHandler::Handle(const std::string& varname,
    }
  } else {
    if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
+      if (enable_dc_asgd_) {
+        // NOTE: the format is determined by distributed_transpiler.py
+        std::string param_bak_name =
+            string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
+        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
+        auto var = scope_->FindVar(varname);
+        auto t_orig = var->Get<framework::LoDTensor>();
+        auto param_bak = scope_->Var(param_bak_name);
+        auto t = param_bak->GetMutable<framework::LoDTensor>();
+        t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
+        VLOG(3) << "copying " << varname << " to " << param_bak_name;
+        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
+      }
      *outvar = scope_->FindVar(varname);
    }
  }
@ -98,6 +113,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                    framework::Scope* scope,
                                    framework::Variable* invar,
                                    framework::Variable** outvar,
+                                    const int trainer_id,
                                    const std::string& out_var_name) {
  VLOG(4) << "RequestPrefetchHandler " << varname;

@ -113,6 +129,7 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
                                      framework::Scope* scope,
                                      framework::Variable* invar,
                                      framework::Variable** outvar,
+                                      const int trainer_id,
                                      const std::string& out_var_name) {
  PADDLE_ENFORCE(
      checkpoint_notify_id != -1,
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@ -36,20 +36,34 @@ namespace distributed {

 class RequestSendHandler final : public RequestHandler {
 public:
-  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  explicit RequestSendHandler(bool sync_mode, bool enable_dc_asgd = false)
+      : RequestHandler(sync_mode) {
+    enable_dc_asgd_ = enable_dc_asgd;
+  }
  virtual ~RequestSendHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
              const std::string& out_var_name = "") override;
+
+ private:
+  bool enable_dc_asgd_;
 };

 class RequestGetHandler final : public RequestHandler {
 public:
-  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  explicit RequestGetHandler(bool sync_mode, bool enable_dc_asgd = false)
+      : RequestHandler(sync_mode) {
+    enable_dc_asgd_ = enable_dc_asgd;
+  }
  virtual ~RequestGetHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
              const std::string& out_var_name = "") override;
+
+ private:
+  bool enable_dc_asgd_;
 };

 class RequestPrefetchHandler final : public RequestHandler {
@ -58,6 +72,7 @@ class RequestPrefetchHandler final : public RequestHandler {
  virtual ~RequestPrefetchHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
              const std::string& out_var_name = "") override;
 };

@ -70,6 +85,7 @@ class RequestCheckpointHandler final : public RequestHandler {
  virtual ~RequestCheckpointHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id,
              const std::string& out_var_name = "") override;

 private:
--- a/paddle/fluid/operators/distributed/rpc_client.cc
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@ -24,6 +24,7 @@ namespace distributed {

 std::once_flag RPCClient::init_flag_;
 std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+int RPCClient::trainer_id_ = 0;

 }  // namespace distributed
 }  // namespace operators
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@ -72,14 +72,15 @@ class RPCClient {
  virtual bool Wait() = 0;

  template <typename T>
-  static RPCClient* GetInstance() {
-    std::call_once(init_flag_, &RPCClient::Init<T>);
+  static RPCClient* GetInstance(int trainer_id) {
+    std::call_once(init_flag_, &RPCClient::Init<T>, trainer_id);
    return rpc_client_.get();
  }

  // Init is called by GetInstance.
  template <typename T>
-  static void Init() {
+  static void Init(int trainer_id) {
+    trainer_id_ = trainer_id;
    if (rpc_client_.get() == nullptr) {
      rpc_client_.reset(new T());
      rpc_client_->InitImpl();
@ -88,6 +89,8 @@ class RPCClient {

 protected:
  virtual void InitImpl() {}
+  // each trainer have exact one trainer id, it should be static
+  static int trainer_id_;

 private:
  static std::once_flag init_flag_;
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@ -125,7 +125,7 @@ TEST(PREFETCH, CPU) {
  g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
  distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);

  std::thread server_thread(StartServer, distributed::kRequestPrefetch);
  g_rpc_service->WaitServerReady();
@ -165,7 +165,7 @@ TEST(COMPLETE, CPU) {
  g_req_handler.reset(new distributed::RequestSendHandler(true));
  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
  distributed::RPCClient* client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
  PADDLE_ENFORCE(client != nullptr);
  std::thread server_thread(StartServer, distributed::kRequestSend);
  g_rpc_service->WaitServerReady();
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@ -79,6 +79,7 @@ message VariableMessage {
  // server stops profiling and generates a profile to /tmp/profile_ps_*
  // when profile switches from 1 to 2.
  int64 profile = 11;
+  int64 trainer_id = 12;
 }

 message VoidMessage {}
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@ -92,6 +92,8 @@ class VariableResponse {
    return scope_->FindVar(meta_.varname());
  }

+  int GetTrainerId() { return static_cast<int>(meta_.trainer_id()); }
+
 protected:
  bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
               const platform::DeviceContext& dev_ctx, platform::Place place,
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@ -37,7 +37,8 @@ class FetchBarrierOp : public framework::OperatorBase {
               const platform::Place& place) const override {
    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+            Attr<int>("trainer_id"));

    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");

@ -61,6 +62,7 @@ This operator will send a send barrier signal to list_and_serv op, so that
 the Parameter Server would knew all variables have been sent.
 )DOC");

+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<std::vector<std::string>>("endpoints",
                                      "(string vector, default 127.0.0.1:6164)"
                                      "Server endpoints to send variables to.")
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@ -61,7 +61,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
    std::vector<std::string> endpoint_list =
        Attr<std::vector<std::string>>("endpoint_list");
    distributed::RPCClient* client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);

    for (auto& ep : endpoint_list) {
      VLOG(3) << "sending nccl id to " << ep;
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@ -218,23 +218,26 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                   framework::ProgramDesc *program,
                                   framework::Scope *recv_scope) const {
  VLOG(2) << "RunAsyncLoop";
-  // grad name to block id
-  std::unordered_map<std::string, int32_t> grad_to_block_id;
-  std::unordered_map<int32_t, std::string> id_to_grad;
-
  auto grad_to_block_id_str =
      Attr<std::vector<std::string>>("grad_to_block_id");
-  for (const auto &grad_and_id : grad_to_block_id_str) {
+  DoubleFindMap<std::string, int32_t> grad_to_block_id;
+
+  auto append_block_maps = [](DoubleFindMap<std::string, int32_t> *out_map,
+                              const std::string &grad_and_id) {
    std::vector<std::string> pieces;
    split(grad_and_id, ':', &pieces);
-    VLOG(3) << "after split, grad = " << pieces[0] << ", id=" << pieces[1];
+    VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
    PADDLE_ENFORCE_EQ(pieces.size(), 2);
-    PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
+    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);

    int block_id = std::stoi(pieces[1]);
-    grad_to_block_id[pieces[0]] = block_id;
-    id_to_grad[block_id] = pieces[0];
+    (*out_map)[pieces[0]] = block_id;
+  };
+
+  for (const auto &grad_and_id : grad_to_block_id_str) {
+    append_block_maps(&grad_to_block_id, grad_and_id);
  }
+
  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");
@ -244,15 +247,22 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
    block_list.push_back(blkid);
  }
  auto optimize_prepared = executor->Prepare(*program, block_list);
-  // execute global block if needed
-  if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
+  // execute global block if needed, block id 1 in the program is global
+  // block if it's not bind to a grad var for it's update.
+  if (block_list[0] == 1 &&
+      grad_to_block_id.find_value(static_cast<int32_t>(1)) ==
+          grad_to_block_id.end()) {
    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
  }
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>
-      grad_to_prepared_ctx;
+      grad_to_prepared_ctx, param_to_prepared_ctx;
  for (size_t i = 0; i < block_list.size(); ++i) {
-    grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
+    auto blkid = block_list[i];
+    auto it = grad_to_block_id.find_value(blkid);
+    if (it != grad_to_block_id.end()) {
+      grad_to_prepared_ctx[it->first] = optimize_prepared[i];
+    }
  }

  request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
@ -315,6 +325,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  framework::Scope &recv_scope = scope.NewScope();

  bool sync_mode = Attr<bool>("sync_mode");
+  bool dc_sgd = Attr<bool>("dc_asgd");
  auto fan_in = Attr<int>("Fanin");
  auto inputs = Inputs("X");

@ -328,8 +339,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,

  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));

-  request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
-  request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
+  request_send_handler_.reset(
+      new distributed::RequestSendHandler(sync_mode, dc_sgd));
+  request_get_handler_.reset(
+      new distributed::RequestGetHandler(sync_mode, dc_sgd));
  request_prefetch_handler_.reset(
      new distributed::RequestPrefetchHandler(sync_mode));
  request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
@ -443,6 +456,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
        "a map from grad name to it's optimize block id")
        .SetDefault({});
    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
+    AddAttr<bool>("dc_asgd", "set to true will enable DC-ASGD training.")
+        .SetDefault(false);
    AddAttr<std::vector<framework::BlockDesc *>>(
        kOptimizeBlocks, "Optimize blocks to run on server side.")
        .SetDefault({});
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@ -18,6 +18,7 @@ limitations under the License. */
 #include <atomic>
 #include <set>
 #include <string>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/executor.h"
@ -37,6 +38,17 @@ constexpr char kCheckpointBlockId[] = "checkpint_block_id";

 void RunServer(std::shared_ptr<distributed::RPCServer> service);

+template <class TKey, class TValue>
+class DoubleFindMap : public std::unordered_map<TKey, TValue> {
+ public:
+  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
+    return std::find_if(this->begin(), this->end(),
+                        [&v](const std::pair<const std::string, int> p) {
+                          return p.second == v;
+                        });
+  }
+};
+
 class ListenAndServOp : public framework::OperatorBase {
 public:
  ListenAndServOp(const std::string& type,
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@ -42,7 +42,8 @@ class PrefetchOp : public framework::OperatorBase {
    auto& ctx = *pool.Get(place);

    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+            Attr<int>("trainer_id"));

    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < ins.size(); i++) {
@ -69,6 +70,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) result "
              "to be fetched from parameter server")
        .AsDuplicable();
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<std::vector<std::string>>(
        "epmap",
        "(string vector, default 127.0.0.1:6164)"
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@ -42,7 +42,8 @@ class RecvOp : public framework::OperatorBase {
    auto& ctx = *pool.Get(place);

    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+            Attr<int>("trainer_id"));

    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < outs.size(); i++) {
@ -73,6 +74,7 @@ This operator can get variables from server side.
                                      "Server endpoints in the order of input "
                                      "variables for mapping")
        .SetDefault({});
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
    AddAttr<int>("sync_mode",
                 "(int, default 0)"
                 "sync recv or async recv.")
--- a/paddle/fluid/operators/ref_by_trainer_id_op.cc
+++ b/paddle/fluid/operators/ref_by_trainer_id_op.cc
@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/ref_by_trainer_id_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class RefByTrainerIdOp : public framework::OperatorWithKernel {
+ public:
+  RefByTrainerIdOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"),
+                   "Input(X) of RefByTrainerIdOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TrainerId"),
+                   "Input(TrainerId) of RefByTrainerIdOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of RefByTrainerIdOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("TrainerId").size(), 1,
+                      "TrainerId should be a scalar.");
+    // Out's shape is determined at runtime.
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.MultiInput<framework::Tensor>("X")[0]->type()),
+        ctx.GetPlace());
+  }
+};
+
+class RefByTrainerIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor list.").AsDuplicable();
+    AddInput("TrainerId", "(Tensor) Scalar int, the trainer id runtime value.");
+    AddOutput("Out", "(Tensor) Return one tensor reference of X[trainer_id]");
+    AddComment(R"DOC(
+**RefByTrainerId operator**
+
+Return a reference of a tensor, using trainer_id as the index to find from the input.
+
+$$Out = X[TrainerId]$$
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(ref_by_trainer_id, ops::RefByTrainerIdOp,
+                             ops::RefByTrainerIdOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ref_by_trainer_id,
+    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::RefByTrainerIdKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/ref_by_trainer_id_op.cu.cc
+++ b/paddle/fluid/operators/ref_by_trainer_id_op.cu.cc
@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/ref_by_trainer_id_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    ref_by_trainer_id,
+    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
+                                            float>,
+    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
+                                            double>,
+    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
+                                            int>,
+    paddle::operators::RefByTrainerIdKernel<paddle::platform::CUDADeviceContext,
+                                            int64_t>);
--- a/paddle/fluid/operators/ref_by_trainer_id_op.h
+++ b/paddle/fluid/operators/ref_by_trainer_id_op.h
@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdio.h>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class RefByTrainerIdKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto in_list = context.MultiInput<framework::Tensor>("X");
+    auto* trainer_id_t = context.Input<framework::Tensor>("TrainerId");
+    int64_t trainer_id;
+    auto* trainer_id_data = trainer_id_t->data<int64_t>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      auto stream = context.cuda_device_context().stream();
+      memory::Copy<>(platform::CPUPlace(), &trainer_id,
+                     boost::get<platform::CUDAPlace>(context.GetPlace()),
+                     trainer_id_data, sizeof(int64_t), stream);
+#endif
+    } else {
+      trainer_id = *trainer_id_data;
+    }
+    printf("after get trainer_id %lu\n", trainer_id);
+    PADDLE_ENFORCE_LT(trainer_id, in_list.size());
+    out->mutable_data<T>(context.GetPlace());
+    out->ShareDataWith(*(in_list[trainer_id]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/Show More
+++ b/Show More