heter box (#29734)

* add heter box * add trainer, worker, wrapper... * format * for ci * format * remove boost get * boost & copyright * rename * rename * format * format * format Co-authored-by: yaoxuefeng6 <yaoxuefeng@baidu.com>
5 years ago · 09b6e71928
parent 1092da82b2
commit 09b6e71928
48 changed files with 4171 additions and 54 deletions
--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
@ -1,3 +1,6 @@
+if (WITH_PSLIB)
+    return()
+endif()
 if(NOT WITH_DISTRIBUTE)
    return()
 endif()
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -204,11 +204,11 @@ if(WITH_DISTRIBUTE)
    cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
    dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
    heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
    pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
    device_context scope framework_proto trainer_desc_proto glog fs shell
-    fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
+    fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
    lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
    graph_to_program_pass variable_helper data_feed_proto timer monitor
    heter_service_proto pslib_brpc)
@ -218,11 +218,11 @@ if(WITH_DISTRIBUTE)
    cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
    dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
    heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
    pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
    device_context scope framework_proto trainer_desc_proto glog fs shell
-    fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
+    fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
    lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
    graph_to_program_pass variable_helper data_feed_proto timer monitor
    heter_service_proto)
@ -233,11 +233,11 @@ elseif(WITH_PSLIB)
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
  heterxpu_trainer.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-  heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+  heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+  lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
  graph_to_program_pass variable_helper timer monitor pslib_brpc )
  # TODO: Fix these unit tests, which fail on Windows
  # These unit tests always fail; no CI currently runs them
@ -248,11 +248,11 @@ else()
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
  heterxpu_trainer.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-  heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+  heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+  lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
  graph_to_program_pass variable_helper timer monitor)
  # TODO: Fix these unit tests, which fail on Windows
  # These unit tests always fail; no CI currently runs them
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@ -968,7 +968,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
            if (fabs(feasign) < 1e-6 && !use_slots_is_dense_[i]) {
              continue;
            }
-            FeatureKey f;
+            FeatureFeasign f;
            f.float_feasign_ = feasign;
            instance->float_feasigns_.push_back(FeatureItem(f, idx));
          }
@ -980,7 +980,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
            if (feasign == 0 && !use_slots_is_dense_[i]) {
              continue;
            }
-            FeatureKey f;
+            FeatureFeasign f;
            f.uint64_feasign_ = feasign;
            instance->uint64_feasigns_.push_back(FeatureItem(f, idx));
          }
@ -1038,7 +1038,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
            if (fabs(feasign) < 1e-6) {
              continue;
            }
-            FeatureKey f;
+            FeatureFeasign f;
            f.float_feasign_ = feasign;
            instance->float_feasigns_.push_back(FeatureItem(f, idx));
          }
@ -1048,7 +1048,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
            if (feasign == 0) {
              continue;
            }
-            FeatureKey f;
+            FeatureFeasign f;
            f.uint64_feasign_ = feasign;
            instance->uint64_feasigns_.push_back(FeatureItem(f, idx));
          }
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@ -69,20 +69,23 @@ namespace framework {
 //   while (reader->Next()) {
 //      // trainer do something
 //   }
-union FeatureKey {
+union FeatureFeasign {
  uint64_t uint64_feasign_;
  float float_feasign_;
 };

 struct FeatureItem {
  FeatureItem() {}
-  FeatureItem(FeatureKey sign, uint16_t slot) {
+  FeatureItem(FeatureFeasign sign, uint16_t slot) {
    this->sign() = sign;
    this->slot() = slot;
  }
-  FeatureKey& sign() { return *(reinterpret_cast<FeatureKey*>(sign_buffer())); }
-  const FeatureKey& sign() const {
-    const FeatureKey* ret = reinterpret_cast<FeatureKey*>(sign_buffer());
+  FeatureFeasign& sign() {
+    return *(reinterpret_cast<FeatureFeasign*>(sign_buffer()));
+  }
+  const FeatureFeasign& sign() const {
+    const FeatureFeasign* ret =
+        reinterpret_cast<FeatureFeasign*>(sign_buffer());
    return *ret;
  }
  uint16_t& slot() { return slot_; }
@ -90,7 +93,7 @@ struct FeatureItem {

 private:
  char* sign_buffer() const { return const_cast<char*>(sign_); }
-  char sign_[sizeof(FeatureKey)];
+  char sign_[sizeof(FeatureFeasign)];
  uint16_t slot_;
 };

@ -514,7 +517,7 @@ paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,

 struct RecordCandidate {
  std::string ins_id_;
-  std::unordered_multimap<uint16_t, FeatureKey> feas_;
+  std::unordered_multimap<uint16_t, FeatureFeasign> feas_;
  size_t shadow_index_ = -1;  // Optimization for Reservoir Sample

  RecordCandidate() {}
@ -606,7 +609,7 @@ class RecordCandidateList {

 template <class AR>
 paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
-                                           const FeatureKey& fk) {
+                                           const FeatureFeasign& fk) {
  ar << fk.uint64_feasign_;
  ar << fk.float_feasign_;
  return ar;
@ -614,7 +617,7 @@ paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,

 template <class AR>
 paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
-                                           FeatureKey& fk) {
+                                           FeatureFeasign& fk) {
  ar >> fk.uint64_feasign_;
  ar >> fk.float_feasign_;
  return ar;
--- a/paddle/fluid/framework/data_set.h
+++ b/paddle/fluid/framework/data_set.h
@ -229,6 +229,20 @@ class DatasetImpl : public Dataset {
  virtual void DynamicAdjustReadersNum(int thread_num);
  virtual void SetFleetSendSleepSeconds(int seconds);

+  std::vector<paddle::framework::Channel<T>>& GetMultiOutputChannel() {  // Accessor: hands out a mutable reference to multi_output_channel_ (no copy is made).
+    return multi_output_channel_;
+  }
+
+  std::vector<paddle::framework::Channel<T>>& GetCurOutputChannel() {  // Returns a mutable reference to one of two channel vectors, selected by cur_channel_.
+    if (cur_channel_ == 0) {  // cur_channel_ == 0 selects the output channels
+      return multi_output_channel_;
+    } else {  // any other value selects the consume channels
+      return multi_consume_channel_;
+    }
+  }
+
+  Channel<T>& GetInputChannelRef() { return input_channel_; }  // Accessor: mutable reference to input_channel_.
+
 protected:
  virtual int ReceiveFromClient(int msg_type, int client_id,
                                const std::string& msg);
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@ -537,6 +537,102 @@ class HeterBoxWorker : public HogwildWorker {
 };
 #endif

+#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+class PSGPUWorker : public HogwildWorker {  // Device worker derived from HogwildWorker; most methods are only declared here and defined elsewhere.
+ public:
+  PSGPUWorker() {}  // empty body: members without an in-class initializer are left as default-initialized
+  virtual ~PSGPUWorker() {}
+  virtual void Initialize(const TrainerDesc& desc);
+  virtual void TrainFiles();
+  virtual void SetNeedDump(bool need_dump_field);
+  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
+  virtual void SetWorkerNum(int num) { worker_num_ = num; }  // stores num in worker_num_
+  virtual void CacheProgram(const ProgramDesc& main_program) {  // copies main_program into the program_ member
+    new (&program_) ProgramDesc(main_program);  // placement-new copy-construction into program_'s storage; NOTE(review): the existing program_ object is not destroyed first -- confirm this cannot leak
+  }
+  virtual void ProduceTasks() override;
+  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }  // stores the stream handle in copy_stream_
+  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }  // stores the event handle in event_
+  virtual void TrainFilesWithProfiler() {}  // intentionally empty: does nothing in this class
+  void ResetStat();
+
+ protected:
+  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
+  void PushGradients();
+  void DumpParam();
+  void CopySparseTable();
+  void CopyDenseTable();
+  void CopyDenseVars();
+
+ private:
+  int mpi_rank_;
+  std::mutex mutex_;
+  std::vector<std::string> send_var_list_;
+  int worker_num_;  // written by SetWorkerNum()
+  ProgramDesc program_;  // overwritten in place by CacheProgram()
+  HeterObjectPool<HeterTask> object_pool_;
+  bool need_dump_param_;
+  std::vector<std::string> dump_param_;
+  bool need_to_push_dense_;
+  bool need_dump_field_;
+  bool dump_slot_;
+  bool need_to_push_sparse_;
+  std::vector<std::string> dump_fields_;
+  ChannelWriter<std::string> writer_;
+  DownpourWorkerParameter param_;
+  float scale_datanorm_;
+  // just save the value in param_ for easy access
+  std::map<uint64_t, std::string> label_var_name_;
+  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
+  platform::Place root_place_;
+  // actually pushed feasign of each table
+  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
+
+  // skipped ops
+  std::vector<std::string> skip_ops_;
+
+  std::vector<::std::future<int32_t>> push_sparse_status_;
+  std::vector<::std::future<int32_t>> push_dense_status_;
+
+  // adjust ins weight
+  AdjustInsWeightConfig adjust_ins_weight_config_;
+  std::vector<float> nid_show_;
+  // check nan and inf during training
+  std::vector<std::string> check_nan_var_names_;
+  // copy table
+  CopyTableConfig copy_table_config_;
+  std::map<uint64_t, uint64_t> table_dependency_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
+  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
+  paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
+  paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
+  cudaEvent_t event_;  // written by SetEvent()
+  cudaStream_t copy_stream_;  // written by SetStream()
+  int batch_cnt_{0};  // starts at zero
+  std::atomic<int> done_cnt_{0};  // atomic counter, starts at zero
+
+  double total_time_;  // this and the fields below have no in-class initializer; presumably zeroed by ResetStat() -- TODO confirm
+  double read_time_;
+  double pack_time_;
+  double pull_sparse_local_time_;
+  double op_all_time_;
+  double xpu_op_time_;
+  double xpu_wait_time_;
+  double cpu_op_time_;
+  double collect_label_time_;
+  double fill_sparse_time_;
+  double push_sparse_time_;
+  double gpu_2_cpu_time_;
+  double cpu_2_gpu_time_;
+  uint64_t total_inst_;
+};
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 class SectionWorker : public DeviceWorker {
 public:
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@ -66,8 +66,16 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
 #ifdef PADDLE_WITH_PSLIB
 REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
+#endif
+
+#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
 #endif
+
+#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@ -1,7 +1,15 @@
 if(WITH_PSLIB)
    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib)
+    if(WITH_NCCL)
+        nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps)
+        add_subdirectory(heter_ps)
+    else()
+        cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
+    endif(WITH_NCCL)
 else()
    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
+    cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
 endif(WITH_PSLIB)

 if(WITH_NCCL)
@ -13,6 +21,7 @@ else()
    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
 endif(WITH_BOX_PS)

+
 if(WITH_GLOO)
    cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope gloo)
 else()
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@ -198,6 +198,7 @@ void FleetWrapper::HeterPullSparseVars(
  for (auto& t : fea_values) {
    pull_result_ptr.push_back(t.data());
  }
+  /*
  auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse(
      workerid, pull_result_ptr.data(), table_id, fea_keys.data(),
      fea_keys.size(), task->taskid_);
@ -211,6 +212,7 @@ void FleetWrapper::HeterPullSparseVars(
      exit(-1);
    }
  }
+  */
 }

 void FleetWrapper::HeterPushSparseVars(
@ -359,6 +361,7 @@ int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) {
  VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
  VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
  return pslib_ptr_->_worker_ptr->registe_heter_callback(handler);
+
 #else
  VLOG(0) << "FleetWrapper::RegisterHeterCallback"
          << " does nothing when no pslib";
@ -1222,13 +1225,6 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
 void FleetWrapper::LoadWithWhitelist(const uint64_t table_id,
                                     const std::string& path, const int mode) {
 #ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->load_with_whitelist(table_id, path,
-                                                          std::to_string(mode));
-  ret.wait();
-  if (ret.get() != 0) {
-    LOG(ERROR) << "load model of table id: " << table_id
-               << ", from path: " << path << " failed";
-  }
 #else
  VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib";
 #endif
@ -1353,16 +1349,7 @@ int32_t FleetWrapper::SaveWithWhitelist(int table_id, const std::string& path,
                                        const int mode,
                                        const std::string& whitelist_path) {
 #ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->save_with_whitelist(
-      table_id, path, std::to_string(mode), whitelist_path);
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "table save cache failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-  return feasign_cnt;
+  return 0;
 #else
  VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
  return -1;
--- a/paddle/fluid/framework/fleet/heter_context.h
+++ b/paddle/fluid/framework/fleet/heter_context.h
@ -0,0 +1,47 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "common_value.h"  // NOLINT
+#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+class HeterContext {  // Plain holder: a scope pointer plus three nested vectors (keys, value pointers, values); all members are public.
+ public:
+  Scope* scope_{nullptr};  // null until a caller assigns it
+  std::vector<std::vector<FeatureKey>> feature_keys_;  // size() sums the lengths of these inner vectors; NOTE(review): meaning of the outer index is not shown here -- confirm
+  std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>> value_ptr_;  // raw pointers; NOTE(review): ownership is not visible in this header -- confirm
+  std::vector<std::vector<FeatureValue>> feature_values_;
+  uint64_t size() {  // Returns the total number of keys across every inner vector of feature_keys_ (0 when it is empty).
+    uint64_t total_size = 0;
+    for (auto& keys : feature_keys_) {
+      total_size += keys.size();
+    }
+    return total_size;
+  }
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@ -0,0 +1,6 @@
+nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc
+heter_resource.h hashtable.h DEPS cub device_context)
+nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS
+heter_comm)
+
+nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/LICENSE
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018 NVIDIA Corporation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh
@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef HASH_FUNCTIONS_CUH
+#define HASH_FUNCTIONS_CUH
+
+// Result type shared by the hash functors declared in this header.
+using hash_value_type = uint32_t;
+
+// MurmurHash3_32 implementation from
+// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+template <typename Key>
+struct MurmurHash3_32 {
+  using argument_type = Key;
+  using result_type = hash_value_type;
+
+  // Constructs the hasher with a seed of 0.
+  __forceinline__ __host__ __device__ MurmurHash3_32() : m_seed(0) {}
+
+  // Rotates x left by r bits. Callers in this struct pass 13 or 15; r must
+  // stay strictly between 0 and 32 because the expression shifts by (32 - r).
+  __forceinline__ __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const {
+    return (x << r) | (x >> (32 - r));
+  }
+
+  // Final mixing step applied to the accumulated hash: alternating
+  // shift-xor and multiply rounds.
+  __forceinline__ __host__ __device__ uint32_t fmix32(uint32_t h) const {
+    h ^= h >> 16;
+    h *= 0x85ebca6b;
+    h ^= h >> 13;
+    h *= 0xc2b2ae35;
+    h ^= h >> 16;
+    return h;
+  }
+
+  /* --------------------------------------------------------------------------*/
+  /**
+   * @Synopsis  Combines two hash values into a new single hash value. Called
+   * repeatedly to create a hash value from several variables.
+   * Taken from the Boost hash_combine function
+   * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
+   *
+   * Declared const because it reads no member state, so it can be called
+   * through a const hasher just like operator().
+   *
+   * @Param lhs The first hash value to combine
+   * @Param rhs The second hash value to combine
+   *
+   * @Returns lhs ^ (rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2))
+   */
+  /* ----------------------------------------------------------------------------*/
+  __host__ __device__ result_type hash_combine(result_type lhs, result_type rhs) const {
+    result_type combined{lhs};
+
+    combined ^= rhs + 0x9e3779b9 + (combined << 6) + (combined >> 2);
+
+    return combined;
+  }
+
+  // Hashes the sizeof(Key) bytes of the object representation of key, starting
+  // from m_seed, and returns the 32-bit result.
+  __forceinline__ __host__ __device__ result_type operator()(const Key& key) const {
+    constexpr int len = sizeof(argument_type);
+    const uint8_t* const data = (const uint8_t*)&key;
+    constexpr int nblocks = len / 4;
+    uint32_t h1 = m_seed;
+    constexpr uint32_t c1 = 0xcc9e2d51;
+    constexpr uint32_t c2 = 0x1b873593;
+    //----------
+    // body: consume the key four bytes at a time. `blocks` points just past
+    // the last full block and is indexed with negative offsets up to zero.
+    const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4);
+    for (int i = -nblocks; i; i++) {
+      uint32_t k1 = blocks[i];  // getblock32(blocks,i);
+      k1 *= c1;
+      k1 = rotl32(k1, 15);
+      k1 *= c2;
+      h1 ^= k1;
+      h1 = rotl32(h1, 13);
+      h1 = h1 * 5 + 0xe6546b64;
+    }
+    //----------
+    // tail: fold in the 0-3 bytes left after the last full block. Each case
+    // deliberately falls through to the next one.
+    const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
+    uint32_t k1 = 0;
+    switch (len & 3) {
+      case 3:
+        k1 ^= tail[2] << 16;
+      // fall through
+      case 2:
+        k1 ^= tail[1] << 8;
+      // fall through
+      case 1:
+        k1 ^= tail[0];
+        k1 *= c1;
+        k1 = rotl32(k1, 15);
+        k1 *= c2;
+        h1 ^= k1;
+    }
+    //----------
+    // finalization
+    h1 ^= len;
+    h1 = fmix32(h1);
+    return h1;
+  }
+
+ private:
+  const uint32_t m_seed;
+};
+
+// Alias that names MurmurHash3_32<Key> as the default hash functor for Key.
+template <typename Key>
+using default_hash = MurmurHash3_32<Key>;
+
+#endif  // HASH_FUNCTIONS_CUH
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh
@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MANAGED_CUH
+#define MANAGED_CUH 
+
+#include <new>
+
+// Base type whose class-level operator new / delete obtain and release
+// storage through cudaMallocManaged / cudaFree.
+struct managed {
+  // Allocates n bytes with cudaMallocManaged. Throws std::bad_alloc when the
+  // runtime reports an error or hands back a null pointer.
+  static void *operator new(size_t n) {
+    void *mem = nullptr;
+    const cudaError_t status = cudaMallocManaged(&mem, n);
+    const bool ok = (status == cudaSuccess) && (mem != nullptr);
+    if (!ok) {
+      throw std::bad_alloc();
+    }
+    return mem;
+  }
+
+  // Releases storage with cudaFree; the returned status is not inspected.
+  static void operator delete(void *ptr) noexcept { cudaFree(ptr); }
+};
+
+#endif  // MANAGED_CUH
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh
@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MANAGED_ALLOCATOR_CUH 
+#define MANAGED_ALLOCATOR_CUH
+
+#include <new>
+
+// Minimal allocator that serves requests with cudaMallocManaged / cudaFree.
+// It has no data members, so all instances are interchangeable.
+template <class T>
+struct managed_allocator {
+  typedef T value_type;
+
+  managed_allocator() = default;
+
+  // Converting constructor so the allocator can be rebound to another type.
+  template <class U>
+  constexpr managed_allocator(const managed_allocator<U>&) noexcept {}
+
+  // Allocates storage for n objects of type T.
+  // Throws std::bad_alloc when the byte count would overflow std::size_t,
+  // when cudaMallocManaged fails, or when it returns a null pointer.
+  T* allocate(std::size_t n) const {
+    // Guard the multiplication below: a wrapped byte count would silently
+    // allocate a buffer that is far too small.
+    if (n > static_cast<std::size_t>(-1) / sizeof(T)) {
+      throw std::bad_alloc();
+    }
+    T* ptr = nullptr;
+    const cudaError_t result = cudaMallocManaged(&ptr, n * sizeof(T));
+    if (cudaSuccess != result || nullptr == ptr) {
+      std::cerr << "ERROR: CUDA Runtime call in line " << __LINE__ << " of file " << __FILE__
+                << " failed with " << cudaGetErrorString(result) << " (" << result << ") "
+                << " Attempted to allocate: " << n * sizeof(T) << " bytes.\n";
+      throw std::bad_alloc();
+    }
+    return ptr;
+  }
+
+  // Releases storage obtained from allocate(); the element count is unused.
+  void deallocate(T* p, std::size_t) const { cudaFree(p); }
+};
+
+// Equality between managed_allocator instances: always true, whatever the
+// element types are.
+template <class T, class U>
+bool operator==(const managed_allocator<T>& /*lhs*/,
+                const managed_allocator<U>& /*rhs*/) {
+  return true;
+}
+
+// Inequality is the negation of operator==, hence always false.
+template <class T, class U>
+bool operator!=(const managed_allocator<T>& lhs,
+                const managed_allocator<U>& rhs) {
+  return !(lhs == rhs);
+}
+
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
@ -0,0 +1,76 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_PSLIB
+
+#include <iostream>
+
+namespace paddle {
+namespace framework {
+// Length of FeaturePushValue::mf_g; FeatureValue::mf holds MF_DIM + 1 floats.
+#define MF_DIM 8
+
+// Key type used for features (64-bit unsigned integer).
+typedef uint64_t FeatureKey;
+
+// Value record stored per feature key. The layout is plain data; field
+// order is part of the layout and must not be changed.
+struct FeatureValue {
+  float delta_score;
+  float show;
+  float clk;
+  int slot;
+  float lr;
+  float lr_g2sum;
+  int mf_size;  // number of leading entries of mf that operator<< prints
+  float mf[MF_DIM + 1];
+
+  // Streams show, clk, slot, lr, mf_size and the first mf_size entries of mf.
+  // Takes the value by const reference so const objects and temporaries can
+  // be printed too. The number of printed mf entries is capped at the array
+  // length so a corrupt mf_size cannot read past the end of mf.
+  friend std::ostream& operator<<(std::ostream& out, const FeatureValue& val) {
+    out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot
+        << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:";
+    const int mf_capacity = MF_DIM + 1;
+    const int printed = val.mf_size < mf_capacity ? val.mf_size : mf_capacity;
+    for (int i = 0; i < printed; ++i) {
+      out << " " << val.mf[i];
+    }
+    return out;
+  }
+};
+
+// Record passed as d_grads to HeterPs::push_sparse. Its fields are the ones
+// CustomGradMerger (heter_comm.h) reads: show, clk, lr_g and every mf_g
+// element are summed, slot is copied from the first operand.
+struct FeaturePushValue {
+  float show;
+  float clk;
+  int slot;
+  float lr_g;  // presumably the gradient paired with FeatureValue::lr -- TODO confirm
+  float mf_g[MF_DIM];  // presumably gradients paired with FeatureValue::mf -- TODO confirm
+};
+// class DownpourFixedFeatureValue {
+//    public:
+//        DownpourFixedFeatureValue() {}
+//        ~DownpourFixedFeatureValue() {}
+//        float* data() {
+//            return _data.data();
+//        }
+//        size_t size() {
+//            return _data.size();
+//        }
+//        void resize(size_t size) {
+//            _data.resize(size);
+//        }
+//        void shrink_to_fit() {
+//            _data.shrink_to_fit();
+//        }
+//    private:
+//        std::vector<float> _data;
+//    };
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
@ -0,0 +1,64 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <limits>
+#include <memory>
+#include <vector>
+#include "thrust/pair.h"
+//#include "cudf/concurrent_unordered_map.cuh.h"
+#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+// concurrent_unordered_map specialisation whose third template argument is
+// the largest representable KeyType value (presumably the "unused slot"
+// marker -- TODO confirm against concurrent_unordered_map).
+template <typename KeyType, typename ValType>
+class TableContainer
+    : public concurrent_unordered_map<KeyType, ValType,
+                                      std::numeric_limits<KeyType>::max()> {
+  // Shorthand for the base class so the constructor does not repeat it.
+  using BaseMap =
+      concurrent_unordered_map<KeyType, ValType,
+                               std::numeric_limits<KeyType>::max()>;
+
+ public:
+  // Forwards capacity plus a value-initialised ValType to the base map.
+  TableContainer(size_t capacity) : BaseMap(capacity, ValType()) {}
+};
+
+// Host-side handle around a heap-allocated TableContainer. insert/get/update
+// each launch one kernel on the caller-supplied stream (definitions are in
+// hashtable.tpp). Copying is disabled.
+template <typename KeyType, typename ValType>
+class HashTable {
+ public:
+  // Creates the underlying TableContainer with the given capacity.
+  HashTable(size_t capacity);
+  virtual ~HashTable();
+  HashTable(const HashTable&) = delete;
+  HashTable& operator=(const HashTable&) = delete;
+  // Inserts the len pairs (d_keys[i], d_vals[i]), passing a ReplaceOp to the
+  // container's insert. Returns immediately when len == 0.
+  void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
+              cudaStream_t stream);
+  // For every d_keys[i] that is found, writes the stored value to d_vals[i];
+  // entries whose key is not found are left untouched. Returns immediately
+  // when len == 0.
+  void get(const KeyType* d_keys, ValType* d_vals, size_t len,
+           cudaStream_t stream);
+  // Calls print() on the container.
+  void show();
+
+  // For every d_keys[i] that is found, calls
+  // sgd.update_value(stored_value, d_grads[i]). Returns immediately when
+  // len == 0.
+  template <typename GradType, typename Sgd>
+  void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
+              Sgd sgd, cudaStream_t stream);
+
+ private:
+  TableContainer<KeyType, ValType>* container_;  // owned; deleted by the destructor
+  int BLOCK_SIZE_{256};  // threads per block used for every kernel launch
+  float LOAD_FACTOR{0.75f};  // NOTE(review): not read by any code visible in hashtable.tpp
+  size_t capacity_;  // NOTE(review): not read by any code visible in hashtable.tpp
+};
+}  // end namespace framework
+}  // end namespace paddle
+#include "hashtable.tpp"
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.tpp
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.tpp
@ -0,0 +1,126 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+// Binary functor handed to the table's insert(): it always picks the
+// incoming value and ignores the one already stored.
+template <typename value_type>
+struct ReplaceOp {
+  __host__ __device__ value_type operator()(value_type new_value,
+                                            value_type old_value) {
+    (void)old_value;  // the previous value never influences the result
+    return new_value;
+  }
+};
+
+// Inserts (keys[i], vals[i]) into *table, one element per thread.
+// Expects a 1-D launch with at least len threads in total; threads whose
+// global index is >= len do nothing. In builds where assert is active, a
+// failed insertion (iterator equal to table->end()) traps.
+template <typename Table>
+__global__ void insert_kernel(Table* table,
+                              const typename Table::key_type* const keys,
+                              const typename Table::mapped_type* const vals,
+                              size_t len) {
+  ReplaceOp<typename Table::mapped_type> op;
+  thrust::pair<typename Table::key_type, typename Table::mapped_type> kv;
+
+  // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
+  // in 32-bit unsigned arithmetic and wraps for very large launches.
+  const size_t i =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < len) {
+    kv.first = keys[i];
+    kv.second = vals[i];
+    auto it = table->insert(kv, op);
+    assert(it != table->end() && "error: insert fails: table is full");
+  }
+}
+
+// Looks up keys[i] in *table, one element per thread, and copies the stored
+// value to vals[i] when the key is present. vals[i] is left unmodified for
+// keys that are not found. Expects a 1-D launch with at least len threads.
+template <typename Table>
+__global__ void search_kernel(Table* table,
+                              const typename Table::key_type* const keys,
+                              typename Table::mapped_type* const vals,
+                              size_t len) {
+  // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
+  // in 32-bit unsigned arithmetic and wraps for very large launches.
+  const size_t i =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < len) {
+    auto it = table->find(keys[i]);
+    if (it != table->end()) {
+      vals[i] = it->second;
+    }
+  }
+}
+
+// For each keys[i] present in *table, calls
+// sgd.update_value(stored_value, grads[i]), one element per thread. Keys that
+// are not found are skipped. Expects a 1-D launch with at least len threads.
+template <typename Table, typename GradType, typename Sgd>
+__global__ void update_kernel(Table* table,
+                              const typename Table::key_type* const keys,
+                              const GradType* const grads, size_t len,
+                              Sgd sgd) {
+  // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
+  // in 32-bit unsigned arithmetic and wraps for very large launches.
+  const size_t i =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < len) {
+    auto it = table->find(keys[i]);
+    if (it != table->end()) {
+      sgd.update_value((it.getter())->second, grads[i]);
+    }
+  }
+}
+
+// Allocates the backing TableContainer and records the requested capacity.
+// capacity_ was previously left uninitialised; it is now set so any later
+// read of the member sees a defined value.
+template <typename KeyType, typename ValType>
+HashTable<KeyType, ValType>::HashTable(size_t capacity)
+    : container_(new TableContainer<KeyType, ValType>(capacity)),
+      capacity_(capacity) {}
+
+// Releases the TableContainer that the constructor allocated with new.
+template <typename KeyType, typename ValType>
+HashTable<KeyType, ValType>::~HashTable() {
+  delete container_;
+}
+
+// Delegates to the container's print() method.
+template <typename KeyType, typename ValType>
+void HashTable<KeyType, ValType>::show() {
+  container_->print();
+}
+
+// Launches search_kernel on `stream` with BLOCK_SIZE_ threads per block and
+// enough blocks to cover len elements. Does nothing for len == 0, which also
+// keeps (len - 1) from wrapping around.
+template <typename KeyType, typename ValType>
+void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
+                                      size_t len, cudaStream_t stream) {
+  if (len != 0) {
+    const int num_blocks = (len - 1) / BLOCK_SIZE_ + 1;
+    search_kernel<<<num_blocks, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
+                                                          d_vals, len);
+  }
+}
+
+// Launches insert_kernel on `stream` with BLOCK_SIZE_ threads per block and
+// enough blocks to cover len elements. Does nothing for len == 0, which also
+// keeps (len - 1) from wrapping around.
+template <typename KeyType, typename ValType>
+void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
+                                         const ValType* d_vals, size_t len,
+                                         cudaStream_t stream) {
+  if (len != 0) {
+    const int num_blocks = (len - 1) / BLOCK_SIZE_ + 1;
+    insert_kernel<<<num_blocks, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
+                                                          d_vals, len);
+  }
+}
+
+// Launches update_kernel on `stream` with BLOCK_SIZE_ threads per block and
+// enough blocks to cover len elements; sgd is passed to the kernel by value.
+// Does nothing for len == 0, which also keeps (len - 1) from wrapping around.
+template <typename KeyType, typename ValType>
+template <typename GradType, typename Sgd>
+void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
+                                         const GradType* d_grads, size_t len,
+                                         Sgd sgd, cudaStream_t stream) {
+  if (len != 0) {
+    const int num_blocks = (len - 1) / BLOCK_SIZE_ + 1;
+    update_kernel<<<num_blocks, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
+                                                          d_grads, len, sgd);
+  }
+}
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@ -0,0 +1,84 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "cub/cub.cuh"
+#include "hashtable.h"
+#include "heter_resource.h"
+#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/place.h"
+#include "thrust/pair.h"
+
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+// Binary functor that merges two gradient records of type T into one.
+// T must expose slot, show, clk, lr_g and an mf_g array of MF_DIM elements.
+struct CustomGradMerger {
+  template <typename T>
+  CUB_RUNTIME_FUNCTION __forceinline__ __device__ T
+  operator()(const T& a, const T& b) const {
+    T merged;
+    // Scalar statistics and the lr gradient are accumulated ...
+    merged.show = a.show + b.show;
+    merged.clk = a.clk + b.clk;
+    merged.lr_g = a.lr_g + b.lr_g;
+    // ... the slot is taken from the first operand only ...
+    merged.slot = a.slot;
+    // ... and the mf gradients are summed element by element.
+    for (int dim = 0; dim != MF_DIM; ++dim) {
+      merged.mf_g[dim] = a.mf_g[dim] + b.mf_g[dim];
+    }
+    return merged;
+  }
+};
+
+// Interface that HeterPs forwards its pull/push/build calls to. Holds a
+// vector of HashTable pointers and a shared HeterPsResource. The member
+// definitions live in heter_comm.tpp, which this header includes at the
+// bottom. Copying is disabled.
+template <typename KeyType, typename ValType, typename GradType>
+class HeterComm {
+ public:
+  HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
+  virtual ~HeterComm();
+  HeterComm(const HeterComm&) = delete;
+  HeterComm& operator=(const HeterComm&) = delete;
+
+  // NOTE(review): semantics of left/right/d_idx_ptr are defined in
+  // heter_comm.tpp, which is not visible here.
+  void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len,
+                            int* left, int* right, int gpu_num);
+  // uniq_len is an output parameter (taken by non-const reference).
+  void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
+                  int& uniq_len);
+  void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len);
+  void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len,
+                size_t chunk_size, int stream_num);
+  void dump();
+  void show_one_table(int gpu_num);
+  int get_index_by_devid(int devid);
+
+  // Sgd is the optimizer type; HeterPs passes its Optimizer member here.
+  template <typename Sgd>
+  void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len,
+                   Sgd& sgd);
+
+  int log2i(int x);
+
+ private:
+  using Table = HashTable<KeyType, ValType>;
+  int block_size_{256};
+  float load_factor_{0.75};
+  std::vector<Table*> tables_;  // raw pointers; ownership is decided in heter_comm.tpp
+  std::shared_ptr<HeterPsResource> resource_;
+  CustomGradMerger merger_;
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.tpp"
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.tpp
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.tpp
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
@ -0,0 +1,62 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
+
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+// Factory: heap-allocates a HeterPs and returns it as a HeterPsBase*. The
+// object comes from a plain `new`, so the caller is responsible for deleting
+// it.
+HeterPsBase* HeterPsBase::get_instance(
+    size_t capacity, std::shared_ptr<HeterPsResource> resource) {
+  return new HeterPs(capacity, resource);
+}
+
+// Builds the communication layer from capacity and resource, then resets the
+// optimizer member to a value-initialised instance.
+HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
+  using Comm = HeterComm<FeatureKey, FeatureValue, FeaturePushValue>;
+  using Opt = Optimizer<FeatureValue, FeaturePushValue>;
+
+  comm_ = std::make_shared<Comm>(capacity, resource);
+  opt_ = Opt();
+}
+
+// Nothing to release by hand: members clean up through their own destructors.
+HeterPs::~HeterPs() = default;
+
+// Forwards the call unchanged to the HeterComm instance.
+void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
+                          size_t len) {
+  comm_->pull_sparse(num, d_keys, d_vals, len);
+}
+
+// Forwards the call unchanged to the HeterComm instance.
+void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
+                       size_t len, size_t chunk_size, int stream_num) {
+  comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
+}
+
+// Returns whatever HeterComm::get_index_by_devid reports for devid.
+int HeterPs::get_index_by_devid(int devid) {
+  return comm_->get_index_by_devid(devid);
+}
+
+// Currently a no-op.
+// NOTE(review): HeterComm declares dump(), but this override does not call
+// it -- confirm whether forwarding to comm_ was intended.
+void HeterPs::dump() {}
+
+// Forwards the call unchanged to the HeterComm instance.
+void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
+
+// Forwards to HeterComm::push_sparse, supplying this object's optimizer
+// member as the Sgd argument.
+void HeterPs::push_sparse(int num, FeatureKey* d_keys,
+                          FeaturePushValue* d_grads, size_t len) {
+  comm_->push_sparse(num, d_keys, d_grads, len, opt_);
+}
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h
@ -0,0 +1,51 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
+#include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"
+#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh"
+
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+// Concrete HeterPsBase implementation. Every operation except dump() is
+// forwarded to a HeterComm instantiated for FeatureKey / FeatureValue /
+// FeaturePushValue (see heter_ps.cu). Copying is disabled.
+class HeterPs : public HeterPsBase {
+ public:
+  // Leaves comm_ empty; the object is only usable after construction through
+  // the two-argument constructor.
+  HeterPs() {}
+  HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource);
+  virtual ~HeterPs();
+  HeterPs(const HeterPs&) = delete;
+  HeterPs& operator=(const HeterPs&) = delete;
+
+  virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
+                           size_t len) override;
+  virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
+                        size_t len, size_t chunk_size, int stream_num) override;
+  virtual void dump() override;
+  virtual int get_index_by_devid(int devid) override;
+  virtual void show_one_table(int gpu_num) override;
+  virtual void push_sparse(int num, FeatureKey* d_keys,
+                           FeaturePushValue* d_grads, size_t len) override;
+
+ private:
+  // Communication layer that all forwarded calls go to.
+  std::shared_ptr<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>> comm_;
+  // Optimizer passed to HeterComm::push_sparse.
+  Optimizer<FeatureValue, FeaturePushValue> opt_;
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
@ -0,0 +1,47 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
+
+#ifdef PADDLE_WITH_PSLIB
+
+namespace paddle {
+namespace framework {
+
+// Abstract interface of the heterogeneous parameter server. Instances are
+// obtained through get_instance(); copying is disabled.
+// NOTE(review): this header uses std::shared_ptr and HeterPsResource but
+// includes only <vector> and feature_value.h, so it relies on the including
+// file to pull in <memory> and heter_resource.h first.
+class HeterPsBase {
+ public:
+  HeterPsBase(){};
+  // Both arguments are ignored by this base-class constructor.
+  HeterPsBase(size_t capacity, std::shared_ptr<HeterPsResource> resource){};
+  virtual ~HeterPsBase(){};
+  HeterPsBase(const HeterPsBase&) = delete;
+  HeterPsBase& operator=(const HeterPsBase&) = delete;
+
+  virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
+                           size_t len) = 0;
+  virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
+                        size_t len, size_t chunk_size, int stream_num) = 0;
+  virtual int get_index_by_devid(int devid) = 0;
+  virtual void dump() = 0;
+  virtual void show_one_table(int gpu_num) = 0;
+  virtual void push_sparse(int num, FeatureKey* d_keys,
+                           FeaturePushValue* d_grads, size_t len) = 0;
+  // Factory defined in heter_ps.cu; returns a heap-allocated HeterPs.
+  static HeterPsBase* get_instance(size_t capacity,
+                                   std::shared_ptr<HeterPsResource> resource);
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc
@ -0,0 +1,91 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_PSLIB
+#include "heter_resource.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+namespace paddle {
+namespace framework {
+
+// Records the device id and index, then creates the two non-blocking streams
+// (compute and copy) while a CUDADeviceGuard for dev_id is in scope.
+GPUResource::GPUResource(int dev_id, int index) {
+  dev_id_ = dev_id;
+  index_ = index;
+
+  platform::CUDADeviceGuard device_scope(dev_id_);
+
+  const unsigned int stream_flags = cudaStreamNonBlocking;
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaStreamCreateWithFlags(&stream_, stream_flags));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaStreamCreateWithFlags(&copy_stream_, stream_flags));
+}
+
+// Destroys both streams while a CUDADeviceGuard for dev_id_ is in scope.
+// Failures are logged rather than enforced: destructors are implicitly
+// noexcept, so an enforce macro that throws (as PADDLE_ENFORCE_CUDA_SUCCESS
+// presumably does) would terminate the process, and a failure on the first
+// stream would also skip destroying the second one.
+GPUResource::~GPUResource() {
+  platform::CUDADeviceGuard guard(dev_id_);
+
+  cudaError_t ret = cudaStreamDestroy(stream_);
+  if (ret != cudaSuccess) {
+    VLOG(0) << " Cuda error(" << ret << "), " << cudaGetErrorString(ret)
+            << " while destroying stream on device " << dev_id_ << ".";
+  }
+  ret = cudaStreamDestroy(copy_stream_);
+  if (ret != cudaSuccess) {
+    VLOG(0) << " Cuda error(" << ret << "), " << cudaGetErrorString(ret)
+            << " while destroying copy stream on device " << dev_id_ << ".";
+  }
+}
+
+// Enables peer access from every device in dev_ids_ to every other device
+// that cudaDeviceCanAccessPeer reports as reachable. A failure to enable
+// access is logged and otherwise tolerated; "already enabled" is treated as
+// success.
+void HeterPsResource::enable_p2p() {
+  for (size_t i = 0; i < dev_ids_.size(); ++i) {
+    // Peer access is enabled for the current device, so switch to device i.
+    platform::CUDADeviceGuard guard(dev_ids_[i]);
+    for (size_t j = 0; j < dev_ids_.size(); ++j) {
+      if (i == j) {
+        continue;
+      }
+      int p2p_flag = 0;
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaDeviceCanAccessPeer(&p2p_flag, dev_ids_[i], dev_ids_[j]));
+      if (p2p_flag != 1) {
+        continue;
+      }
+      const cudaError_t ret = cudaDeviceEnablePeerAccess(dev_ids_[j], 0);
+      if (ret != cudaSuccess && ret != cudaErrorPeerAccessAlreadyEnabled) {
+        VLOG(0) << " Cuda error(" << ret << "), " << cudaGetErrorString(ret)
+                << ".";
+      }
+      // Reset the runtime's last-error state in every case. Previously the
+      // logged failure path left the error set, so a later cudaGetLastError()
+      // check elsewhere could report this already-tolerated failure.
+      cudaGetLastError();
+    }
+  }
+}
+
+// Copies the device id list and, for each position, creates one GPUResource
+// and records the device-id -> position mapping.
+HeterPsResource::HeterPsResource(const std::vector<int>& dev_ids) {
+  dev_ids_ = dev_ids;
+  for (size_t pos = 0; pos < dev_ids_.size(); ++pos) {
+    resources_.push_back(std::make_shared<GPUResource>(dev_ids_[pos], pos));
+    devid_2_index_[dev_ids_[pos]] = pos;
+  }
+}
+
+// Returns the copy stream of the num-th GPUResource. num is used as an
+// unchecked index into resources_.
+cudaStream_t HeterPsResource::copy_stream(int num) {
+  return resources_[num]->copy_stream();
+}
+
+// Returns the stream of the num-th GPUResource. num is used as an unchecked
+// index into resources_.
+cudaStream_t HeterPsResource::stream(int num) {
+  return resources_[num]->stream();
+}
+
+// Returns the device id stored at position num; num is not range-checked.
+int HeterPsResource::dev_id(int num) { return dev_ids_[num]; }
+
+// Returns the position that the constructor recorded for devid.
+// NOTE(review): if devid_2_index_ is a std::map / std::unordered_map,
+// operator[] inserts a zero entry for an unknown devid and this returns 0 --
+// confirm that callers only pass device ids given to the constructor.
+int HeterPsResource::get_index_by_devid(int devid) {
+  return devid_2_index_[devid];
+}
+
+// Number of device ids this resource was constructed with.
+int HeterPsResource::total_gpu() {
+  return static_cast<int>(dev_ids_.size());
+}
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/Show More
+++ b/Show More