add ps_instance doc

revert-15207-remove_op_handle_lock_and_fix_var
Authored by heqiaozhi 6 years ago; committed by dongdaxiang
parent 35ce6ac2e6
commit bd1c1724aa

@@ -1,6 +1,6 @@
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
#windows treat symbolic file as a real file, which is different with unix
#We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
@@ -11,7 +11,7 @@ function(windows_symbolic TARGET)
message(FATAL " ${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
# only copy the xx.cu to .xx.cu when the content are modified
#only copy the xx.cu to.xx.cu when the content are modified
set(copy_flag 1)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
@@ -32,7 +32,7 @@ endfunction()
add_subdirectory(ir)
add_subdirectory(details)
# ddim lib
#ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto)
@@ -89,8 +89,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if(WITH_GPU)
if (WIN32)
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
#windows treat symbolic file as a real file, which is different with unix
#We create a hidden file and compile it instead of origin source file.
windows_symbolic(hidden_file SRCS data_type_transform.cu)
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
add_dependencies(data_type_transform hidden_file)
@@ -137,7 +137,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
#Generate an empty \
__init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
if (NOT WIN32)

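The CMakeLists comments above describe the mechanism these hunks touch: because Windows treats a symbolic link as an ordinary file, the build compiles a hidden copy .<src>.cu of each source instead of the symlinked .cu, and it only refreshes that hidden copy when the content of <src>.cc has changed. Below is a minimal sketch of that copy-if-modified pattern; it is not Paddle's actual windows_symbolic implementation, and the function name copy_hidden_cu and its local variables are illustrative only.

# Sketch only: illustrates the hidden-file copy pattern described above.
function(copy_hidden_cu src)
  set(origin ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc)
  set(hidden ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
  set(copy_flag 1)
  if(EXISTS ${hidden})
    # Compare the .cc source with the existing hidden .cu copy.
    file(READ ${origin} SOURCE_STR)
    file(READ ${hidden} TARGET_STR)
    if("${SOURCE_STR}" STREQUAL "${TARGET_STR}")
      set(copy_flag 0)  # content unchanged, keep the existing copy
    endif()
  endif()
  if(copy_flag)
    # COPYONLY copies the file without any variable substitution.
    configure_file(${origin} ${hidden} COPYONLY)
  endif()
endfunction()
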
@@ -30,7 +30,7 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
#ifdef PADDLE_WITH_PSLIB
#include "pslib.h"
#include <pslib.h>
#endif
namespace paddle {
@@ -70,50 +70,52 @@ void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
#ifdef PADDLE_WITH_PSLIB
void AsyncExecutor::InitServer(const std::string& dist_desc, int index) {
_pslib_ptr =
std::shared_ptr<paddle::distributed::PSlib>(
new paddle::distributed::PSlib());
_pslib_ptr->init_server(dist_desc, index);
InitParamConfig();
_pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
new paddle::distributed::PSlib());
_pslib_ptr->init_server(dist_desc, index);
InitParamConfig();
}
void AsyncExecutor::InitWorker(const std::string& dist_desc,
const std::vector<uint64_t>& host_sign_list,
int node_num, int index) {
_pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
new paddle::distributed::PSlib());
_pslib_ptr->init_worker(
dist_desc, (uint64_t*)(host_sign_list.data()), node_num, index);
_pslib_ptr = std::shared_ptr<paddle::distributed::PSlib>(
new paddle::distributed::PSlib());
_pslib_ptr->init_worker(dist_desc,
static_cast<uint64_t*>(host_sign_list.data()),
node_num, index);
InitParamConfig();
InitParamConfig();
}
uint64_t AsyncExecutor::StartServer() {
return _pslib_ptr->run_server();
}
uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); }
void AsyncExecutor::StopServer() {
_pslib_ptr->stop_server();
}
void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); }
void AsyncExecutor::GatherServers(
const std::vector<uint64_t>& host_sign_list, int node_num) {
_pslib_ptr->gather_servers((uint64_t*)(host_sign_list.data()), node_num);
void AsyncExecutor::GatherServers(const std::vector<uint64_t>& host_sign_list,
int node_num) {
_pslib_ptr->gather_servers(static_cast<uint64_t*>(host_sign_list.data()),
node_num);
}
void AsyncExecutor::InitParamConfig() {
for (int i = 0; i <
_pslib_ptr->get_param()->server_param(). \
downpour_server_param(). \
downpour_table_param_size();
for (int i = 0; i < _pslib_ptr->get_param()
->server_param()
.downpour_server_param()
.downpour_table_param_size();
++i) {
if (_pslib_ptr->get_param()->server_param(). \
downpour_server_param().downpour_table_param(i). \
table_class().find("SparseTable") != -1) {
_param_config.fea_dim = _pslib_ptr->get_param()->server_param(). \
downpour_server_param(). \
downpour_table_param(i). \
accessor().fea_dim();
if (_pslib_ptr->get_param()
->server_param()
.downpour_server_param()
.downpour_table_param(i)
.table_class()
.find("SparseTable") != -1) {
_param_config.fea_dim = _pslib_ptr->get_param()
->server_param()
.downpour_server_param()
.downpour_table_param(i)
.accessor()
.fea_dim();
break;
}
}
@@ -122,28 +124,24 @@ void AsyncExecutor::InitParamConfig() {
_pslib_ptr->get_param()->trainer_param().push_dense_per_batch());
_param_config.tmp_push_sparse_wait_times = static_cast<int32_t>(
_pslib_ptr->get_param()->trainer_param().push_sparse_per_batch());
for (auto t = 0u;
t < _pslib_ptr->get_param()->trainer_param().skip_op_size();
for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size();
++t) {
_param_config.skip_op.push_back(
_pslib_ptr->get_param()->trainer_param().skip_op(t));
}
for (auto t = 0u;
t < _pslib_ptr->get_param()->trainer_param().sparse_table_size();
++t) {
t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) {
auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t);
std::vector<std::string> tmp_sparse_variable_name;
for (int i = 0u; i < table.slot_value_size(); ++i) {
tmp_sparse_variable_name.push_back(table.slot_value(i));
_param_config.slot_alias_to_table[table.slot_key(i)] =
table.table_id();
_param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id();
}
std::vector<std::string> tmp_sparse_gradient_variable_name;
for (auto i = 0u; i < table.slot_gradient_size(); ++i) {
tmp_sparse_gradient_variable_name.push_back(
table.slot_gradient(i));
tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i));
}
_param_config.slot_input_vec[table.table_id()] =
std::move(tmp_sparse_variable_name);
@@ -151,10 +149,9 @@ void AsyncExecutor::InitParamConfig() {
std::move(tmp_sparse_gradient_variable_name);
_param_config.sparse_table_id.push_back(table.table_id());
}
for (auto t = 0u;
t < _pslib_ptr->get_param()->trainer_param().dense_table_size();
++t) {
t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) {
auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t);
std::vector<std::string> tmp_dense_variable_name;
for (int i = 0u; i < table.dense_variable_name_size(); ++i) {
@@ -181,26 +178,25 @@ void AsyncExecutor::InitModel() {
Variable* var = root_scope_->FindVar(t);
CHECK(var != nullptr) << "var[" << t << "] not found";
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* g = tensor->data<float>();
CHECK(g != nullptr) << "var[" << t << "] value not initialized";
float init_range = 0.2;
int rown = tensor->dims()[0];
init_range /= sqrt(rown);
std::normal_distribution<float> ndistr(0.0, 1.0);
for (auto i = 0u; i < tensor->numel(); ++i) {
g[i] = ndistr(local_random_engine()) * init_range;
}
paddle::ps::Region reg(g, tensor->numel());
regions.emplace_back(std::move(reg));
}
auto push_status =
_pslib_ptr->_worker_ptr->push_dense_param(
regions.data(), regions.size(), table_id);
auto push_status = _pslib_ptr->_worker_ptr->push_dense_param(
regions.data(), regions.size(), table_id);
push_status.wait();
auto status = push_status.get();
if (status != 0) {
@@ -225,14 +221,14 @@ void AsyncExecutor::SaveModel(const std::string& path) {
void AsyncExecutor::PrepareDenseThread(const std::string& mode) {
if (mode == "mpi") {
DensePullThreadParam param;
param.ps_client = _pslib_ptr->_worker_ptr;;
param.ps_client = _pslib_ptr->_worker_ptr;
param.threshold = 1;
param.training_thread_num = actual_thread_num;
param.root_scope = root_scope_;
param.dense_params = &_param_config.dense_variable_name;
_pull_dense_thread = std::shared_ptr<DensePullThread>(
new DensePullThread(param));
_pull_dense_thread =
std::shared_ptr<DensePullThread>(new DensePullThread(param));
_pull_dense_thread->start();
}
}
@@ -243,8 +239,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_var_names,
const std::string& mode,
const bool debug) {
const std::string& mode, const bool debug) {
std::vector<std::thread> threads;
auto& block = main_program.Block(0);
@@ -293,9 +288,9 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
for (auto& worker : workers) {
#ifdef PADDLE_WITH_PSLIB
if (mode == "mpi") {
worker.reset(new AsyncExecutorThreadWorker);
worker.reset(new AsyncExecutorThreadWorker);
} else {
worker.reset(new ExecutorThreadWorker);
worker.reset(new ExecutorThreadWorker);
}
#else
worker.reset(new ExecutorThreadWorker);
@@ -308,7 +303,6 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
fetch_var_names, root_scope_, thidx, debug);
}
// start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
threads.push_back(

File diff suppressed because it is too large.

@@ -26,7 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_PSLIB
#include "pslib.h"
#include <pslib.h>
#endif
namespace paddle {
@@ -34,75 +34,74 @@ namespace framework {
void CreateTensor(Variable* var, proto::VarType::Type var_type);
#ifdef PADDLE_WITH_PSLIB
const static uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100;
struct AsyncWorkerParamConfig {
int slot_dim;
int fea_dim;
int32_t tmp_push_dense_wait_times;
int32_t tmp_push_sparse_wait_times;
std::vector<std::string> skip_op;
std::map<uint64_t, std::vector<std::string>> dense_variable_name;
std::map<uint64_t, std::vector<std::string>> dense_gradient_variable_name;
std::vector<int> dense_table_id;
std::vector<int> dense_table_id;
// fea_dim for each dense table
std::vector<uint32_t> dense_table_size;
std::vector<int> sparse_table_id;
std::vector<uint32_t> dense_table_size;
std::vector<int> sparse_table_id;
std::map<uint64_t, std::vector<std::string>> slot_input_vec;
std::map<uint64_t, std::vector<std::string>> gradient_var;
std::map<std::string, uint64_t> slot_alias_to_table;
};
struct DensePullThreadParam {
std::shared_ptr<paddle::ps::PSClient> ps_client;
int threshold;
int training_thread_num;
Scope* root_scope;
std::map<uint64_t, std::vector<std::string>>* dense_params;
int sleep_time_ms = 2;
std::shared_ptr<paddle::ps::PSClient> ps_client;
int threshold;
int training_thread_num;
Scope* root_scope;
std::map<uint64_t, std::vector<std::string>>* dense_params;
int sleep_time_ms = 2;
};
class DensePullThread {
public:
explicit DensePullThread(const DensePullThreadParam& param) :
_running(false) {
explicit DensePullThread(const DensePullThreadParam& param)
: _running(false) {
_ps_client = param.ps_client;
_threshold = param.threshold;
_thread_num = param.training_thread_num;
_root_scope = param.root_scope;
_sleep_time_ms = param.sleep_time_ms;
for (auto& t : *param.dense_params) {
_dense_variable_name[t.first].insert(
_dense_variable_name[t.first].end(),
t.second.begin(), t.second.end());
_dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(),
t.second.begin(), t.second.end());
_training_versions[t.first].resize(_thread_num, 0);
_last_versions[t.first] = 0;
_current_version[t.first] = 0;
}
}
int start();
void stop() {
if (_running) {
_running = false;
_t.join();
}
}
void increase_thread_version(int thread_id, uint64_t table_id);
void reset_thread_version(uint64_t table_id);
std::future<int32_t> pull_dense(uint64_t table_id);
void pull_dense2(uint64_t table_id);
void wait_all();
private:
void run();
bool check_update_param(uint64_t table_id);
private:
std::shared_ptr<paddle::ps::PSClient> _ps_client;
int _thread_num;
@@ -113,33 +112,33 @@ class DensePullThread {
std::map<uint64_t, uint64_t> _last_versions;
std::map<uint64_t, uint64_t> _current_version;
std::mutex _mutex_for_version;
std::mutex _mutex_for_version;
std::map<uint64_t, std::vector<uint64_t>> _training_versions;
std::map<uint64_t, std::vector<std::string>> _dense_variable_name;
std::thread _t;
std::vector<::std::future<int32_t>> _pull_dense_status;
std::map<uint64_t, std::vector<paddle::ps::Region>> _regions;
uint32_t _pull_dense_fail_times = 0;
std::vector<float> _base_norm_param;
std::vector<float> _mean;
std::vector<float> _scale;
uint32_t _pull_dense_fail_times = 0;
std::vector<float> _base_norm_param;
std::vector<float> _mean;
std::vector<float> _scale;
float _squared_sum_epsilon = 1e-4;
std::mutex _mutex_for_mean_scale;
float _total_batch_num = 0;
};
#endif
class ExecutorThreadWorker {
public:
ExecutorThreadWorker()
: thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
ExecutorThreadWorker()
: thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
virtual ~ExecutorThreadWorker() {}
void CreateThreadResource(const framework::ProgramDesc& program,
const paddle::platform::Place& place);
void SetThreadId(int tid);
@@ -161,10 +160,8 @@ ExecutorThreadWorker()
#ifdef PADDLE_WITH_PSLIB
virtual void SetPSlibPtr(
std::shared_ptr<paddle::distributed::PSlib> pslib_ptr) {}
virtual void SetPullDenseThread(
std::shared_ptr<DensePullThread> dpt) {}
virtual void SetParamConfig(
AsyncWorkerParamConfig * param_config) {}
virtual void SetPullDenseThread(std::shared_ptr<DensePullThread> dpt) {}
virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {}
#endif
private:
@@ -195,7 +192,7 @@ ExecutorThreadWorker()
};
#ifdef PADDLE_WITH_PSLIB
class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
class AsyncExecutorThreadWorker : public ExecutorThreadWorker {
public:
AsyncExecutorThreadWorker() {}
virtual ~AsyncExecutorThreadWorker() {}
@@ -210,40 +207,35 @@ class AsyncExecutorThreadWorker: public ExecutorThreadWorker {
void FillSparse(int table_id);
void PushSparse(int table_id);
void PushDense(int table_id);
void check_pull_push_memory(
const std::vector<uint64_t>& features,
std::vector<float*>& push_g,
int dim);
void check_pull_push_memory(const std::vector<uint64_t>& features,
std::vector<std::vector<float>>& push_g,
int dim);
std::vector<float*>* push_g, int dim);
void check_pull_push_memory(const std::vector<uint64_t>& features,
std::vector<std::vector<float>>* push_g, int dim);
void collect_feasign_info(int table_id);
private:
struct FeasignInfo {
uint32_t slot;
uint32_t ins;
int64_t label;
};
std::map<uint64_t, std::vector<uint64_t>> _features;
std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
std::map<uint64_t, std::vector<uint64_t>> _features;
std::map<uint64_t, std::vector<FeasignInfo>> _fea_info;
std::map<uint64_t, std::vector<std::vector<float>>> _feature_value;
std::map<uint64_t, std::vector<std::vector<float>>> _feature_push_value;
std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
std::shared_ptr<DensePullThread> _pull_dense_thread;
std::vector<::std::future<int32_t>> _pull_sparse_status;
std::vector<::std::future<int32_t>> _pull_dense_status;
std::vector<::std::future<int32_t>> _push_sparse_status;
std::vector<::std::future<int32_t>> _push_dense_status;
AsyncWorkerParamConfig* _param_config;
std::shared_ptr<paddle::distributed::PSlib> _pslib_ptr;
std::shared_ptr<DensePullThread> _pull_dense_thread;
std::vector<::std::future<int32_t>> _pull_sparse_status;
std::vector<::std::future<int32_t>> _pull_dense_status;
std::vector<::std::future<int32_t>> _push_sparse_status;
std::vector<::std::future<int32_t>> _push_dense_status;
AsyncWorkerParamConfig* _param_config;
};
#endif
