add heter ps mode (#25682)

* add heter ps mode

* code style
test=develop

* add with_pslib
test=develop

* unitest
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* test monitor
test=develop

* prepare trainer
test=develop

* code style
test=develop
Thunderbrook committed 0cb60c700d (parent c8d0d1419b)

@@ -27,6 +27,7 @@ add_subdirectory(fleet)
add_subdirectory(io)
#ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(heter_service_proto SRCS heter_service.proto)
proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
data_feed_proto)
@@ -195,20 +196,37 @@ cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc o
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
- data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+ heterxpu_trainer.cc
+ data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
- device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer
+ device_context scope framework_proto trainer_desc_proto glog fs shell
+ fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
- graph_to_program_pass variable_helper data_feed_proto timer monitor)
+ graph_to_program_pass variable_helper data_feed_proto timer monitor
+ heter_service_proto)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
elseif(WITH_PSLIB)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
heterxpu_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor pslib_brpc)
# TODO: Fix these unittest failed on Windows
if(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
else()
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
- data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+ heterxpu_trainer.cc
+ data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
- device_context scope framework_proto data_feed_proto trainer_desc_proto glog
+ device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
- lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
+ lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor)
# TODO: Fix these unittest failed on Windows
if(NOT WIN32)

@@ -27,6 +27,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
@@ -51,10 +52,23 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
class FleetWrapper;
#ifdef PADDLE_WITH_PSLIB
class HeterWrapper;
#endif
class PullDenseWorker {
public:
virtual ~PullDenseWorker() {}
virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
void AddPlace(const paddle::platform::Place place) {
places_.push_back(place);
}
void AddThreadScope(Scope* scope) { thread_scopes_.push_back(scope); }
#endif
int Start();
void Stop();
void SetRootScope(Scope* scope) { root_scope_ = scope; }
@@ -62,6 +76,7 @@ class PullDenseWorker {
void ResetThreadVersion(uint64_t table_id);
void Wait(std::vector<::std::future<int32_t>>* status_vec);
void PullDense(bool force_update = false);
void CreatePinVar();
int GetThreadIdByScope(const Scope* scope);
void SetThreadIdByScope(const Scope* scope, int tid);
static std::shared_ptr<PullDenseWorker> GetInstance() {
@@ -105,6 +120,12 @@ class PullDenseWorker {
std::mutex mutex_for_mean_scale_;
float total_batch_num_ = 0;
std::unordered_map<const Scope*, int> scope_to_thread_id_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_;
#endif
};
// should incorporate different type of device
@@ -126,6 +147,8 @@ class DeviceWorker {
virtual void BindingDataFeedMemory() = 0;
virtual void SetRootScope(Scope* root_scope);
virtual void SetDataFeed(DataFeed* data_feed);
virtual void SetWorkerNum(int num) {}
virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
@@ -161,6 +184,7 @@ class DeviceWorker {
FetchConfig fetch_config_;
bool use_cvm_;
bool no_cvm_;
TrainerDesc trainer_desc_;
// dump params or grads for debug
bool need_dump_param_;
@@ -306,6 +330,87 @@ class DownpourWorkerOpt : public DownpourWorker {
uint64_t async_tid_ = 0;
};
#ifdef PADDLE_WITH_PSLIB
class HeterCpuWorker : public HogwildWorker {
public:
HeterCpuWorker() {}
virtual ~HeterCpuWorker() {}
virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles();
virtual void TrainFilesWithProfiler();
virtual void SetNeedDump(bool need_dump_field);
virtual void SetChannelWriter(ChannelObject<std::string>* queue);
virtual void SetWorkerNum(int num) { worker_num_ = num; }
virtual void Schedule(int taskid);
virtual void JumpContext(std::shared_ptr<HeterTask> task);
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void GetXpuOpIndex();
protected:
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
void FillSparseValue(std::shared_ptr<HeterTask> task, size_t table_id);
void PushGradients();
void CollectLabelInfo(std::shared_ptr<HeterTask> task, size_t table_id);
void AdjustInsWeight(std::shared_ptr<HeterTask> task);
void DumpParam();
void CopySparseTable();
void CopyDenseTable();
void CopyDenseVars();
private:
int mpi_rank_;
int worker_num_;
int xpu_begin_op_index_;
int xpu_end_op_index_;
ProgramDesc program_;
HeterObjectPool<HeterTask> object_pool_;
HeterList<int, std::shared_ptr<HeterTask>> run_queue_;
HeterList<int, std::shared_ptr<HeterTask>> wait_queue_;
bool need_dump_param_;
std::vector<std::string> dump_param_;
bool need_to_push_dense_;
bool need_dump_field_;
bool dump_slot_;
bool need_to_push_sparse_;
std::vector<std::string> dump_fields_;
ChannelWriter<std::string> writer_;
DownpourWorkerParameter param_;
float scale_datanorm_;
// just save the value in param_ for easy access
std::map<uint64_t, std::string> label_var_name_;
std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
platform::Place root_place_;
// actually pushed feasign of each table
std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
// skipped ops
std::vector<std::string> skip_ops_;
std::vector<::std::future<int32_t>> push_sparse_status_;
std::vector<::std::future<int32_t>> push_dense_status_;
// adjust ins weight
AdjustInsWeightConfig adjust_ins_weight_config_;
std::vector<float> nid_show_;
// check nan and inf during training
std::vector<std::string> check_nan_var_names_;
// copy table
CopyTableConfig copy_table_config_;
std::map<uint64_t, uint64_t> table_dependency_;
std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
};
#endif
#if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker {
public:

@@ -62,6 +62,9 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
#ifdef PADDLE_WITH_PSLIB
REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
#endif
#if defined(PADDLE_WITH_NCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
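Registering the class here is what lets a trainer request it by name at runtime. Below is a minimal sketch of that lookup, assuming CreateDeviceWorker is a static factory taking the registered class name (its full signature is truncated in this hunk) and that the header path is as named; "HeterCpuWorker" only resolves in a PADDLE_WITH_PSLIB build.

#include <memory>
#include <string>
#include "paddle/fluid/framework/device_worker_factory.h"  // assumed header path

namespace paddle {
namespace framework {
// Hypothetical helper: resolve the device worker named in a TrainerDesc.
std::shared_ptr<DeviceWorker> MakeConfiguredWorker(const std::string& name) {
  // e.g. name == "HeterCpuWorker" (PSLIB builds) or the default "DownpourWorker"
  return DeviceWorkerFactory::CreateDeviceWorker(name);
}
}  // namespace framework
}  // namespace paddle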

@@ -35,7 +35,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
dump_file_num_ = trainer_desc.dump_file_num();
const std::vector<paddle::framework::DataFeed *> readers =
dataset->GetReaders();
RegisterHeterCallback();
thread_num_ = readers.size();
workers_.resize(thread_num_);
for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
@@ -55,6 +55,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
workers_[i]->SetDumpParamVector(dump_param_);
workers_[i]->InitRandomDumpConfig(trainer_desc);
workers_[i]->Initialize(trainer_desc);
workers_[i]->SetWorkerNum(thread_num_);
}
VLOG(3) << "going to initialize pull dense worker";
@@ -64,6 +65,13 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
SetDebug(trainer_desc.debug());
}
void DistMultiTrainer::RegisterHeterCallback() {
auto fleet_ptr = FleetWrapper::GetInstance();
fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) {
// workers_[worker]->Schedule(taskid);
});
}
void DistMultiTrainer::InitDumpEnv() {
queue_ = paddle::framework::MakeChannel<std::string>();
for (int i = 0; i < thread_num_; ++i) {
@@ -90,6 +98,9 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program);  // Program
workers_[i]->BindingDataFeedMemory();
#ifdef PADDLE_WITH_PSLIB
workers_[i]->CacheProgram(main_program);
#endif
}
// Scope* -> thread id, it will be used in push_dense op
for (int i = 0; i < thread_num_; ++i) {
@@ -104,6 +115,11 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
}
pull_dense_worker_->SetRootScope(root_scope_);
pull_dense_worker_->Start();
#ifdef PADDLE_WITH_PSLIB
for (int i = 0; i < thread_num_; ++i) {
workers_[i]->GetXpuOpIndex();
}
#endif
VLOG(3) << "init other env done.";
}

@@ -379,7 +379,7 @@ void DownpourWorker::CopyDenseTable() {
pull_dense_status.resize(0);
fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
dense_value_names_[dest_table],
- &pull_dense_status);
+ &pull_dense_status, true);
for (auto& t : pull_dense_status) {
t.wait();
auto status = t.get();

@@ -19,4 +19,6 @@ else()
cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
endif(WITH_GLOO)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)

File diff suppressed because it is too large.

@@ -28,6 +28,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
@@ -80,6 +81,24 @@ class FleetWrapper {
pull_local_thread_num_ = thread_num;
}
#ifdef PADDLE_WITH_PSLIB
void HeterPullSparseVars(int workerid, std::shared_ptr<HeterTask> task,
const uint64_t table_id,
const std::vector<std::string>& var_names,
int fea_dim,
const std::vector<std::string>& var_emb_names);
void HeterPushSparseVars(
std::shared_ptr<HeterTask> task, const uint64_t table_id,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<::std::future<int32_t>>* push_sparse_status,
const bool use_cvm, const bool dump_slot, const bool no_cvm);
#endif
typedef std::function<void(int, int)> HeterCallBackFunc;
int RegisterHeterCallback(HeterCallBackFunc handler);
// Pull sparse variables from server in sync mode
// Param<in>: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
// Param<out>: fea_values
@@ -118,15 +137,24 @@ class FleetWrapper {
void PullDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
- std::vector<::std::future<int32_t>>* pull_dense_status);
+ std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu);
// push dense parameters(not gradients) to server in sync mode
void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names);
// Push dense variables to server in async mode
// Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
// Param<out>: push_sparse_status
#ifdef PADDLE_WITH_CUDA
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place, cudaStream_t stream,
cudaEvent_t event);
#endif
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
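The new CUDA overload above pushes dense gradients straight from device memory, synchronized by the caller's stream and event. A hedged sketch of how a GPU-side worker might call it follows; the variable name, table id, and batch size are placeholders, and FleetWrapper::GetInstance() is the singleton accessor used elsewhere in this patch.

#ifdef PADDLE_WITH_CUDA
#include <future>
#include <string>
#include <vector>
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"

void PushDenseFromDevice(const paddle::framework::Scope& scope,
                         const paddle::platform::Place& place,
                         cudaStream_t stream, cudaEvent_t event) {
  auto fleet = paddle::framework::FleetWrapper::GetInstance();
  std::vector<::std::future<int32_t>> status;
  std::vector<std::string> var_names = {"fc_0.w_0@GRAD"};  // hypothetical grad var
  const uint64_t table_id = 0;      // placeholder dense table id
  const float scale_datanorm = -1;  // placeholder
  const int batch_size = 32;        // placeholder
  fleet->PushDenseVarsAsync(scope, table_id, var_names, &status,
                            scale_datanorm, batch_size, place, stream, event);
  for (auto& s : status) {
    s.wait();  // block until the async push completes
  }
}
#endif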

@@ -54,10 +54,10 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(tmp);
if (i == retry_times_) {
VLOG(0) << "fs_open_write failed, retry times reaches limit";
- PADDLE_THROW(platform::errors::PreconditionNotMet(
-     "fs_open_write failed, retry times reaches"
-     " limit ",
-     retry_times_));
+ // PADDLE_THROW(platform::errors::PreconditionNotMet(
+ //     "fs_open_write failed, retry times reaches"
+ //     " limit ",
+ //     retry_times_));
}
} else {
break;
@@ -143,9 +143,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
break;
}
}
- PADDLE_THROW(platform::errors::ExecutionTimeout(
-     "TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
-     last_check_rank));
+ // PADDLE_THROW(platform::errors::ExecutionTimeout(
+ VLOG(0) << "TIMEOUT self_rank = " << self_rank_
+         << " pair_rank = " << last_check_rank;
}
std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
}

File diff suppressed because it is too large.

@@ -0,0 +1,123 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef PADDLE_WITH_PSLIB
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
class HeterCpuWorker;
typedef std::function<void(void*)> HeterRpcCallbackFunc;
class OnHeterRpcDone : public google::protobuf::Closure {
public:
OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
virtual ~OnHeterRpcDone() {}
void Run() {
std::unique_ptr<OnHeterRpcDone> self_guard(this);
handler_(this);
}
HeterRpcCallbackFunc handler_;
HeterResponse response;
brpc::Controller cntl;
};
class HeterWrapper {
public:
virtual ~HeterWrapper() {
server_.Stop(1000);
server_.Join();
}
HeterWrapper() {}
static void HeterRpcCallBack(HeterResponse* response, brpc::Controller* cntl,
HeterCpuWorker* worker,
std::shared_ptr<HeterTask> task);
void CreateClient2XpuConnection();
void RegisterServiceHandler(int cmd, HeterServiceHandler func);
void StartXpuService(const std::string& ip, uint32_t port);
void CallRemoteXpu(std::shared_ptr<HeterTask> task, HeterCpuWorker* worker,
int mpi_rank, std::vector<std::string>& send_vars);
void CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars);
void StopXpuService(int num);
void EndPass(Scope* scope, int num);
void SerializeToReq(const std::string& varname, Scope* scope,
VariableMessage* req_var);
framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
#ifdef PADDLE_WITH_CUDA
void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
platform::Place place,
cudaStream_t stream = nullptr);
#else
void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
platform::Place place);
#endif
// HeterWrapper singleton
static std::shared_ptr<HeterWrapper> GetInstance() {
if (NULL == s_instance_) {
s_instance_.reset(new paddle::framework::HeterWrapper());
}
return s_instance_;
}
std::vector<std::string>& GetXpuList() { return xpu_list_; }
void SetXpuList(const std::vector<std::string>& xpu_list);
private:
static std::shared_ptr<HeterWrapper> s_instance_;
protected:
std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
brpc::Server server_;
HeterXpuService service_;
static bool is_initialized_;
DISABLE_COPY_AND_ASSIGN(HeterWrapper);
std::vector<std::string> xpu_list_;
};
} // end namespace framework
} // end namespace paddle
#endif
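For orientation, here is a minimal sketch of how the two sides might bring this wrapper up, using only the methods declared above and assuming a PADDLE_WITH_PSLIB build; the endpoints, port, and stop value are placeholders.

#include <string>
#include <vector>
#include "paddle/fluid/framework/fleet/heter_wrapper.h"

// CPU-trainer side: tell the singleton where the XPU services live and
// open one brpc channel per endpoint.
void SetUpHeterClient() {
  auto heter = paddle::framework::HeterWrapper::GetInstance();
  std::vector<std::string> xpu_endpoints = {"127.0.0.1:8500",
                                            "127.0.0.1:8501"};  // placeholders
  heter->SetXpuList(xpu_endpoints);
  heter->CreateClient2XpuConnection();
}

// XPU side: expose the heter service on a local ip/port, and stop it later.
void SetUpHeterServer() {
  auto heter = paddle::framework::HeterWrapper::GetInstance();
  heter->StartXpuService("127.0.0.1", 8500);
  // ... run training ...
  heter->StopXpuService(0);  // placeholder argument
}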

File diff suppressed because it is too large.

@@ -0,0 +1,69 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
option cc_generic_services = true;
// It can be: LoDTensor, SelectedRows or NCCL_ID
enum VarType {
LOD_TENSOR = 0;
SELECTED_ROWS = 1;
NCCL_ID = 2;
}
// VariableMessage is serialized paddle variable message.
// NOTICE(gongwb): don't modify this proto if you are not
// familiar with how we serialize in sendrecvop_utils.h
// and deserialize it in variable_response.h.
message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
optional string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
optional VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
optional Type data_type = 3;
repeated int64 dims = 4;
// lod details:
optional int64 lod_level = 5;
repeated LodData lod = 6;
// selected_rows height, aka. original dim0
optional int64 slr_height = 7;
// tensor data
optional bytes data = 8;
}
message HeterRequest {
required int32 cmd = 1;
optional int32 cur_batch = 2;
repeated VariableMessage vars = 3;
};
message HeterResponse {
// optional VariableMessage vars = 1;
repeated VariableMessage vars = 1;
};
service HeterService { rpc service(HeterRequest) returns (HeterResponse); };
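Since cc_generic_services is enabled, protoc emits a HeterService_Stub for this service, and the wrapper above drives it through brpc. A hedged sketch of a one-shot synchronous client call under those assumptions follows; the endpoint, cmd, and batch values are placeholders, and the generated header path is assumed.

#include <string>
#include <brpc/channel.h>
#include "paddle/fluid/framework/heter_service.pb.h"  // generated from this proto; path assumed

int CallHeterServiceOnce(const std::string& endpoint) {
  brpc::Channel channel;
  brpc::ChannelOptions options;
  if (channel.Init(endpoint.c_str(), &options) != 0) {
    return -1;  // failed to connect
  }
  paddle::framework::HeterService_Stub stub(&channel);
  paddle::framework::HeterRequest request;
  paddle::framework::HeterResponse response;
  brpc::Controller cntl;
  request.set_cmd(0);        // placeholder command id
  request.set_cur_batch(1);  // placeholder batch counter
  // Passing a null closure makes the brpc call synchronous.
  stub.service(&cntl, &request, &response, nullptr);
  return cntl.Failed() ? -1 : response.vars_size();
}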

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program);  // Program
workers_[i]->BindingDataFeedMemory();
workers_[i]->CacheProgram(main_program);
}
}

@@ -56,6 +56,34 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
current_version_[tid] = 0;
}
fleet_ptr_ = FleetWrapper::GetInstance();
#ifdef PADDLE_WITH_CUDA
copy_streams_.clear();
places_.clear();
thread_scopes_.clear();
#endif
}
void PullDenseWorker::CreatePinVar() {
#ifdef PADDLE_WITH_CUDA
// for (auto& v : dense_value_names_) {
//   for (auto& name : v.second) {
for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
++i) {
uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(i));
for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
auto& name = dense_value_names_[tid][j];
Variable* var = root_scope_->FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
auto* ptr = root_scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace());
}
}
#endif
}
void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
@@ -75,6 +103,31 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit(-1);
}
status_vec->resize(0);
#ifdef PADDLE_WITH_CUDA
for (size_t i = 0; i < places_.size(); ++i) {
// for (auto& v : dense_value_names_) {
//   for (auto& name : v.second) {
for (int x = 0; x < dwp_param_.program_config(0).pull_dense_table_id_size();
++x) {
uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(x));
for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
auto& name = dense_value_names_[tid][j];
Variable* pin_var = root_scope_->FindVar(name + "pin");
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_w = pin_tensor->data<float>();
Variable* var = thread_scopes_[i]->FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]);
}
}
}
#endif
}
void PullDenseWorker::Stop() {
@@ -91,8 +144,14 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(i));
if (force_update || CheckUpdateParam(tid)) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "pull dense " << force_update << " " << tid;
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
- &pull_dense_status_);
+ &pull_dense_status_, false);
#else
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, true);
#endif
ResetThreadVersion(tid);
}
}
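The pinned "<name>pin" variables created above are the staging buffers that Wait() copies onto each registered device. Below is a sketch of how a CUDA trainer might register its per-device resources through the accessors added to PullDenseWorker in this patch; the stream, place, and scope are placeholders owned by the caller.

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/device_worker.h"

void RegisterDeviceWithPullDenseWorker(paddle::framework::Scope* thread_scope,
                                       const paddle::platform::CUDAPlace& place,
                                       cudaStream_t copy_stream) {
  auto pull_dense = paddle::framework::PullDenseWorker::GetInstance();
  pull_dense->AddPlace(place);               // target GPU for the dense copy
  pull_dense->AddStream(copy_stream);        // stream used by memory::Copy above
  pull_dense->AddThreadScope(thread_scope);  // scope holding the device tensors
  pull_dense->CreatePinVar();                // allocate the pinned staging vars (typically once)
}
#endif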

@@ -21,9 +21,12 @@ limitations under the License. */
#include <thread>  // NOLINT
#include <vector>
#include <ctime>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
@@ -62,6 +65,7 @@ class TrainerBase {
Scope* root_scope_;
bool debug_;
Dataset* dataset_ptr_;
TrainerDesc trainer_desc_;
// For dump param or field
bool need_dump_field_ = false;
@@ -118,10 +122,86 @@ class DistMultiTrainer : public MultiTrainer {
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
virtual void InitDumpEnv();
virtual Scope* GetWorkerScope(int thread_id);
virtual void RegisterHeterCallback();
protected:
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
class HeterServiceContext {
public:
HeterServiceContext() {}
virtual ~HeterServiceContext() {
for (OperatorBase* op : ops_) {
delete op;
}
std::vector<OperatorBase*>().swap(ops_);
}
void Reset() { push_dense_status_.clear(); }
int place_num_;
Scope* scope_{nullptr};
cudaEvent_t event_;
std::vector<OperatorBase*> ops_;
std::vector<::std::future<int32_t>> push_dense_status_;
};
class HeterXpuTrainer : public TrainerBase {
public:
HeterXpuTrainer() {}
virtual ~HeterXpuTrainer() {
for (OperatorBase* op : ops_) {
delete op;
}
std::vector<OperatorBase*>().swap(ops_);
}
virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
virtual void InitTrainerEnv(const ProgramDesc& main_program,
const platform::Place& place);
virtual void InitOtherEnv(const ProgramDesc& main_program);
virtual void Run();
virtual void Finalize();
virtual void DumpWork(int tid);
virtual void RegisterServiceHandler();
virtual int RunTask(const HeterRequest* request, HeterResponse* response);
virtual Scope* GetWorkerScope(int thread_id);
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
template <typename T>
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream);
void CreateThreadParam(const ProgramDesc& program, int num);
template <typename T>
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
int EndPass(const HeterRequest* request, HeterResponse* response);
int StopService(const HeterRequest* request, HeterResponse* response);
protected:
DownpourWorkerParameter param_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
std::vector<std::string> need_merge_var_names_;
float scale_datanorm_;
int xpu_begin_op_index_;
int xpu_end_op_index_;
bool running_;
paddle::platform::Place place_;
std::mutex mutex_;
ProgramDesc program_;
std::condition_variable cond_;
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
std::vector<OperatorBase*> ops_;
std::vector<std::string> op_names_;
std::vector<Scope*> place_scopes_;
BtObjectPool<HeterServiceContext> object_pool_;
std::vector<cudaStream_t> copy_streams_;
std::vector<platform::Place> places_;
std::vector<cudaEvent_t> events_;
};
#endif
#if defined(PADDLE_WITH_NCCL)
class PipelineTrainer : public TrainerBase {

@@ -52,6 +52,12 @@ message TrainerDesc {
optional bool enable_random_dump = 24 [ default = false ];
optional bool random_with_lineid = 25 [ default = false ];
optional int32 dump_interval = 26 [ default = 10000 ];
repeated int32 worker_places = 27;
repeated string xpu_send_list = 28;
repeated string xpu_recv_list = 29;
optional int32 xpu_start_idx = 30;
optional int32 xpu_end_idx = 31;
// device worker parameters
optional HogwildWorkerParameter hogwild_param = 101;
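These fields are normally filled from the Python-side trainer configuration (see the device_worker.py change below), but for reference here is a small sketch of the generated C++ accessors, with placeholder values and an assumed generated-header path.

#include "paddle/fluid/framework/trainer_desc.pb.h"  // generated header; path assumed

void ConfigureHeterTrainer(paddle::framework::TrainerDesc* desc) {
  desc->add_worker_places(0);                  // run a worker on place id 0
  desc->add_worker_places(1);                  // and another on place id 1
  desc->add_xpu_send_list("concat_0.tmp_0");   // hypothetical var sent to the XPU side
  desc->add_xpu_recv_list("fc_9.tmp_1@GRAD");  // hypothetical var received back
  desc->set_xpu_start_idx(10);                 // placeholder op-range boundaries
  desc->set_xpu_end_idx(20);
}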

@@ -63,6 +63,9 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer);
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer);
#endif
#if defined(PADDLE_WITH_NCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif

@@ -1,7 +1,7 @@
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
- gloo_wrapper infer_io_utils)
+ gloo_wrapper infer_io_utils heter_wrapper)
if (WITH_NCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
@@ -31,6 +31,7 @@ set(PYBIND_SRCS
global_value_getter_setter.cc
reader_py.cc
fleet_wrapper_py.cc
heter_wrapper_py.cc
gloo_wrapper_py.cc
box_helper_py.cc
data_set_py.cc

@@ -0,0 +1,50 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include <string>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
#ifdef PADDLE_WITH_PSLIB
void BindHeterWrapper(py::module* m) {
py::class_<framework::HeterWrapper, std::shared_ptr<framework::HeterWrapper>>(
*m, "Heter")
.def(py::init([]() { return framework::HeterWrapper::GetInstance(); }))
.def("create_client2xpu_connection",
&framework::HeterWrapper::CreateClient2XpuConnection)
.def("set_xpu_list", &framework::HeterWrapper::SetXpuList)
.def("start_xpu_service", &framework::HeterWrapper::StartXpuService)
.def("end_pass", &framework::HeterWrapper::EndPass)
.def("stop_xpu_service", &framework::HeterWrapper::StopXpuService);
} // end HeterWrapper
#endif
} // end namespace pybind
} // end namespace paddle

@@ -0,0 +1,29 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
#ifdef PADDLE_WITH_PSLIB
void BindHeterWrapper(py::module* m);
#endif
} // namespace pybind
} // namespace paddle

@@ -66,6 +66,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
@@ -2479,6 +2480,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("device_count", &ParallelExecutor::DeviceCount);
BindFleetWrapper(&m);
#ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m);
#endif
BindGlooWrapper(&m);
BindBoxHelper(&m);
#ifdef PADDLE_WITH_BOX_PS

@@ -223,7 +223,8 @@ class DownpourSGD(DeviceWorker):
dense_table_set.add(i)
break
- trainer_desc.device_worker_name = "DownpourWorker"
+ trainer_desc.device_worker_name = opt_info.get("worker_class",
+                                                "DownpourWorker")
pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num
if opt_info.get("program_id_to_worker") is None:

Some files were not shown because too many files have changed in this diff.
