add ps cache manager

pull/9746/head
lizhenyu 4 years ago
parent 1033166d8a
commit e3f7ae61db

@ -194,6 +194,14 @@ if (ENABLE_GPU)
)
endif ()
if (ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU))
install(
TARGETS ps_cache
DESTINATION ${INSTALL_LIB_DIR}
COMPONENT mindspore
)
endif()
if (ENABLE_SERVING OR ENABLE_TESTCASES)
file(GLOB_RECURSE LIBEVENT_LIB_LIST
${libevent_LIBPATH}/libevent*

@ -308,7 +308,7 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin")
else ()
if (ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU))
target_link_libraries(mindspore mindspore::pslite proto_input mindspore::protobuf mindspore::event mindspore::event_pthreads ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a)
target_link_libraries(mindspore -Wl,--no-as-needed mindspore::event_core)
target_link_libraries(mindspore -Wl,--no-as-needed mindspore::event_core ps_cache)
if (${ENABLE_IBVERBS} STREQUAL "ON")
target_link_libraries(mindspore ibverbs rdmacm)
endif()

@ -75,6 +75,9 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs
if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) {
node_so_ = CUST_AICPU_OPS_SO_NAME;
node_name_ = kCustRunApi;
} else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) {
node_so_ = AICPU_OPS_SO_NAME;
node_name_ = kCustRunApi;
} else {
node_so_ = AICPU_OPS_SO_NAME;
}
@ -161,6 +164,9 @@ std::vector<TaskInfoPtr> AicpuOpKernelMod::GenTask(const std::vector<AddressPtr>
if (kCustAiCpuKernelOps.find(node_name_) != kCustAiCpuKernelOps.end()) {
node_so_ = CUST_AICPU_OPS_SO_NAME;
node_name_ = kCustRunApi;
} else if (kCacheKernelOps.find(node_name_) != kCacheKernelOps.end()) {
node_so_ = AICPU_OPS_SO_NAME;
node_name_ = kCustRunApi;
} else {
node_so_ = AICPU_OPS_SO_NAME;
}
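Both hunks above apply the same dispatch rule: the kernel's node name is looked up in small std::set tables to decide which shared object and entry symbol to use, with cache kernels staying in the default .so but routed through the generic RunCpuKernel entry. A minimal standalone sketch of that pattern, with illustrative names rather than the MindSpore constants:

#include <set>
#include <string>
#include <utility>

// Illustrative stand-ins for AICPU_OPS_SO_NAME / CUST_AICPU_OPS_SO_NAME / kCustRunApi.
const std::string kDefaultSo = "libaicpu_kernels.so";
const std::string kCustomSo = "libcust_aicpu_kernels.so";
const std::string kRunApi = "RunCpuKernel";

const std::set<std::string> kCustomOps{"EditDistance", "Identity"};
const std::set<std::string> kCacheOps{"UpdateCache"};

// Returns {so_name, entry_name}: custom ops use the custom .so, cache ops keep the
// default .so but go through the generic RunCpuKernel entry point.
std::pair<std::string, std::string> ResolveKernel(const std::string &node_name) {
  if (kCustomOps.count(node_name) > 0) {
    return {kCustomSo, kRunApi};
  }
  if (kCacheOps.count(node_name) > 0) {
    return {kDefaultSo, kRunApi};
  }
  return {kDefaultSo, node_name};
}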

@ -49,6 +49,7 @@ constexpr auto kIdentity = "Identity";
constexpr auto kUpdateCache = "UpdateCache";
constexpr auto kCustRunApi = "RunCpuKernel";
const std::set<std::string> kCustAiCpuKernelOps{kEditDistance, kIdentity};
const std::set<std::string> kCacheKernelOps{kUpdateCache};
struct AicpuParamHead {
uint32_t length; // Total length: include custom message

@ -15,6 +15,7 @@
*/
#include "backend/kernel_compiler/cpu/ps/embedding_look_up_proxy_kernel.h"
#include <vector>
#include <algorithm>
#include "ps/worker.h"
namespace mindspore {
@ -38,10 +39,13 @@ void EmbeddingLookUpProxyKernel::InitKernel(const CNodePtr &kernel_node) {
key_ = AnfAlgo::GetNodeAttr<size_t>(kernel_node, kAttrPsKey);
}
std::vector<size_t> keys{key_, key_, key_};
std::vector<size_t> values;
values.insert(values.end(), input_shape.begin(), input_shape.end());
values.insert(values.end(), indices_shape.begin(), indices_shape.end());
values.insert(values.end(), output_shape.begin(), output_shape.end());
std::vector<float> values;
std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(values),
[](size_t dim) -> float { return SizeToFloat(dim); });
std::transform(indices_shape.begin(), indices_shape.end(), std::back_inserter(values),
[](size_t dim) -> float { return SizeToFloat(dim); });
std::transform(output_shape.begin(), output_shape.end(), std::back_inserter(values),
[](size_t dim) -> float { return SizeToFloat(dim); });
MS_LOG(INFO) << "Init embedding lookup proxy kernel, input shape:" << input_shape
<< ", indices_shape:" << indices_shape << ", output_shape:" << output_shape;
std::vector<int64_t> lens{SizeToLong(input_shape.size()), SizeToLong(indices_shape.size()),
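The shape metadata is now flattened into a std::vector<float> with std::transform instead of being copied as size_t, because the PS transport carries float values. A self-contained sketch of the conversion, assuming SizeToFloat amounts to a static_cast:

#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

// Append every dimension of `shape` to `values`, converted to float for transport.
void AppendShapeAsFloat(const std::vector<size_t> &shape, std::vector<float> *values) {
  std::transform(shape.begin(), shape.end(), std::back_inserter(*values),
                 [](size_t dim) -> float { return static_cast<float>(dim); });
}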

@ -72,6 +72,23 @@ bool EmbeddingLookUpPSKernel::Execute(const std::vector<AddressPtr> &inputs, con
return Launch(inputs, workspace, outputs);
}
void EmbeddingLookUpPSKernel::UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids,
const float *update_vals, size_t ids_size) {
size_t copy_lens = outer_dim_size_ * sizeof(float);
for (size_t i = 0; i < ids_size; ++i) {
int index = lookup_ids[i] - offset_;
if (index >= 0 && index < SizeToInt(first_dim_size_)) {
auto ret =
memcpy_s(embedding_table + index * outer_dim_size_, copy_lens, update_vals + i * outer_dim_size_, copy_lens);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "LookUpTable task memcpy failed.";
}
} else {
MS_LOG(EXCEPTION) << "UpdateEmbeddings index invalid.";
}
}
}
const std::vector<size_t> &EmbeddingLookUpPSKernel::input_sizes() const { return input_shape_; }
const std::vector<size_t> &EmbeddingLookUpPSKernel::output_sizes() const { return GetOutputSizeList(); }
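UpdateEmbeddings copies one row of outer_dim_size_ floats per id, after translating the global id by offset_ and bounds-checking it against the local shard. A minimal sketch of the same row-copy logic using plain std::memcpy and an exception in place of memcpy_s and MS_LOG(EXCEPTION):

#include <cstddef>
#include <cstring>
#include <stdexcept>

// Write `ids_size` rows of `row_width` floats from `update_vals` into `table`;
// row i goes to slot lookup_ids[i] - offset, which must lie in [0, first_dim).
void UpdateRows(float *table, const size_t *lookup_ids, const float *update_vals,
                size_t ids_size, size_t row_width, size_t offset, size_t first_dim) {
  const size_t copy_bytes = row_width * sizeof(float);
  for (size_t i = 0; i < ids_size; ++i) {
    if (lookup_ids[i] < offset || lookup_ids[i] - offset >= first_dim) {
      throw std::out_of_range("UpdateRows: id outside local shard");
    }
    size_t index = lookup_ids[i] - offset;
    std::memcpy(table + index * row_width, update_vals + i * row_width, copy_bytes);
  }
}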

@ -35,7 +35,8 @@ class EmbeddingLookUpPSKernel : public EmbeddingLookUpCPUKernel, public PServerK
bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals,
size_t ids_size) override;
const std::vector<size_t> &input_sizes() const override;
const std::vector<size_t> &output_sizes() const override;
const std::vector<size_t> &workspace_sizes() const override;

@ -38,7 +38,8 @@ class PServerKernel {
virtual void ReInit(const std::vector<std::vector<size_t>> &) {}
virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) = 0;
virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals,
size_t ids_size) {}
virtual const std::vector<size_t> &input_sizes() const = 0;
virtual const std::vector<size_t> &output_sizes() const = 0;
virtual const std::vector<size_t> &workspace_sizes() const = 0;

@ -56,6 +56,7 @@
#include "toolchain/adx_datadump_server.h"
#if ENABLE_CPU && ENABLE_D
#include "ps/util.h"
#include "ps/ps_cache/ps_cache_manager.h"
#endif
namespace mindspore {
@ -487,11 +488,7 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
// adjust kernel
AdjustKernel(root_graph);
#if ENABLE_CPU && ENABLE_D
if (ps::Util::IsParamServerMode()) {
CheckPSModeConsistence(root_graph);
// Assign parameter keys.
AssignParamKey(root_graph);
}
InitPsWorker(root_graph);
#endif
// assign stream
AssignStream(NOT_NULL(root_graph));
@ -568,6 +565,9 @@ void AscendSession::BuildGraphImpl(GraphId graph_id) {
}
// adjust execution order because merge child graph and other special operations
AdjustKernel(graph);
#if ENABLE_CPU && ENABLE_D
InitPsWorker(graph);
#endif
// Reorder optimizer order
auto execution_order = graph->execution_order();
Reorder(&execution_order);
@ -644,6 +644,10 @@ void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tens
#if ENABLE_CPU && ENABLE_D
// Initialize parameter server
InitPSParamAndOptim(kernel_graph, inputs);
std::string channel_name;
if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) {
ps::ps_cache_instance.IncreaseGraphStep(channel_name);
}
#endif
{
// run task on device

@ -21,6 +21,9 @@
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/comm_manager.h"
#include "utils/scoped_long_running.h"
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "ps/ps_cache/ps_cache_manager.h"
#endif
namespace mindspore {
namespace session {

@ -64,6 +64,7 @@
#include "utils/ms_context.h"
#if ENABLE_CPU && ENABLE_GPU
#include "ps/util.h"
#include "ps/ps_cache/ps_cache_manager.h"
#endif
namespace mindspore {
@ -237,6 +238,12 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
#if ENABLE_CPU && ENABLE_GPU
const std::string &param_name = input_node->fullname_with_scope();
if (ps::ps_cache_instance.IsHashTable(param_name)) {
continue;
}
#endif
auto pk_node = input_node->cast<ParameterPtr>();
auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
auto tensor_address = std::dynamic_pointer_cast<device::DeviceAddress>(tensor->device_address());
@ -300,16 +307,11 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
HardwareOptimize(graph);
// Graph kernel fusion optimization
GraphKernelOptimize(graph);
#if ENABLE_CPU && ENABLE_GPU
if (ps::Util::IsParamServerMode()) {
CheckPSModeConsistence(graph);
// Assign parameter keys.
AssignParamKey(graph);
}
#endif
// Start gpu kernel runtime
StartKernelRT();
#if ENABLE_CPU && ENABLE_GPU
InitPsWorker(graph);
#endif
// Assign CUDA streams
AssignStream(graph);
// Dump .pb graph before remove nop nodes
@ -374,6 +376,12 @@ void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor:
int kernel_num = kernel_graph->execution_order().size();
int64_t loopsize = (kernel_num > 1) ? ConfigManager::GetInstance().gpu_loopsink_size() : 1;
for (int64_t i = 0; i < loopsize; i++) {
#if ENABLE_CPU && ENABLE_GPU
std::string channel_name;
if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) {
ps::ps_cache_instance.IncreaseGraphStep(channel_name);
}
#endif
Execute(kernel_graph);
}
// In pynative mode, device addresses of tensors in value nodes need be clean.

@ -41,8 +41,10 @@
#include "utils/trace_base.h"
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "ps/worker.h"
#include "ps/ps_cache/ps_cache_manager.h"
#include "ps/common.h"
#include "ps/util.h"
#include "abstract/abstract_value.h"
#endif
namespace mindspore {
@ -1125,6 +1127,12 @@ void SessionBasic::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_grap
size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
}
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
const std::string &param_name = input_node->fullname_with_scope();
if (ps::ps_cache_instance.IsHashTable(param_name)) {
continue;
}
#endif
auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
MS_EXCEPTION_IF_NULL(device_address);
if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
@ -1715,8 +1723,64 @@ void SessionBasic::CleanUselessTensorsImpl(const std::shared_ptr<std::vector<ten
}
}
bool SessionBasic::IsGetNextGraph(const GraphId &graph_id, std::string *channel_name) {
auto kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph);
for (const auto &kernel_node : kernel_graph->execution_order()) {
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == kGetNextOpName) {
auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
*channel_name = GetValue<std::string>(prim->GetAttr("shared_name"));
return true;
}
}
return false;
}
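IsGetNextGraph walks the graph's execution order, finds the GetNext kernel, and reports the dataset channel via its shared_name attribute. The shape of that lookup, reduced to a plain struct (KernelInfo and its fields are illustrative, not MindSpore types):

#include <string>
#include <vector>

// Illustrative stand-in for a compiled kernel node.
struct KernelInfo {
  std::string name;         // e.g. "GetNext"
  std::string shared_name;  // dataset channel attribute
};

// Returns true and fills *channel_name if the graph contains a GetNext kernel.
bool FindGetNextChannel(const std::vector<KernelInfo> &execution_order, std::string *channel_name) {
  for (const auto &kernel : execution_order) {
    if (kernel.name == "GetNext") {
      *channel_name = kernel.shared_name;
      return true;
    }
  }
  return false;
}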
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
void SessionBasic::CheckPSModeConsistence(const KernelGraphPtr &kernel_graph) {
void SessionBasic::InitPsWorker(const KernelGraphPtr &kernel_graph) {
if (!ps::Util::IsRoleOfWorker()) {
return;
}
CheckPSModeConsistence(kernel_graph);
if (ps::PsDataPrefetch::GetInstance().cache_enable()) {
if (!ps::ps_cache_instance.initialized_ps_cache()) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto device_target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_target, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
auto context = runtime_instance->context();
const auto &kernels = kernel_graph->execution_order();
if (kernels.size() > 0 && AnfAlgo::GetCNodeName(kernels[0]) == "InitDataSetQueue") {
GetBatchElements(kernels[0]);
ps::ps_cache_instance.Initialize();
}
ps::ps_cache_instance.DoProcessData(device_id_, context);
}
} else {
// Assign parameter keys.
AssignParamKey(kernel_graph);
}
}
void SessionBasic::GetBatchElements(const AnfNodePtr &kernel_node) const {
auto shapes = AnfAlgo::GetNodeAttr<std::vector<std::vector<int64_t>>>(kernel_node, "shapes");
auto types = AnfAlgo::GetNodeAttr<std::vector<TypePtr>>(kernel_node, "types");
if (shapes.size() != types.size() || shapes.size() == 0 || types.size() == 0) {
MS_LOG(EXCEPTION) << "Invalid shapes of op[InitDataSetQueue]: shapes size " << shapes.size() << ", types size "
<< types.size();
}
size_t batch_elements = 1;
const auto &shape = shapes[0];
for (size_t i = 0; i < shape.size(); ++i) {
batch_elements *= shape[i];
}
ps::ps_cache_instance.set_batch_elements(batch_elements);
}
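GetBatchElements multiplies the dimensions of the first shape reported by InitDataSetQueue to obtain the per-batch element count that the cache prefetcher needs. The same computation with std::accumulate, under the assumption that only shapes[0] matters:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Number of elements in one batch = product of the dims of the first dataset shape.
size_t BatchElements(const std::vector<std::vector<int64_t>> &shapes) {
  if (shapes.empty() || shapes[0].empty()) {
    return 0;
  }
  return std::accumulate(shapes[0].begin(), shapes[0].end(), static_cast<size_t>(1),
                         std::multiplies<size_t>());
}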
void SessionBasic::CheckPSModeConsistence(const KernelGraphPtr &kernel_graph) const {
auto input_nodes = kernel_graph->inputs();
for (const auto &input_node : input_nodes) {
if (!input_node->isa<Parameter>()) {
@ -1725,8 +1789,9 @@ void SessionBasic::CheckPSModeConsistence(const KernelGraphPtr &kernel_graph) {
auto pk_node = input_node->cast<ParameterPtr>();
MS_EXCEPTION_IF_NULL(pk_node);
auto param_info_ptr = pk_node->param_info();
if (param_info_ptr != nullptr && param_info_ptr->init_in_server()) {
const std::string &param_name = pk_node->fullname_with_scope();
const std::string &param_name = pk_node->fullname_with_scope();
if (param_info_ptr != nullptr && param_info_ptr->init_in_server() &&
!ps::ps_cache_instance.IsHashTable(param_name)) {
MS_LOG(EXCEPTION) << "Can not initialize the parameter[" << param_name
<< "] in server, this parameter is used by kernel which executes in device";
}
@ -1734,10 +1799,6 @@ void SessionBasic::CheckPSModeConsistence(const KernelGraphPtr &kernel_graph) {
}
void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) {
if (!ps::Util::IsRoleOfWorker()) {
MS_LOG(INFO) << "Not parameter server mode.";
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return());
for (auto &node : node_list) {
@ -1775,16 +1836,8 @@ void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph,
return;
}
std::vector<tensor::TensorPtr> inputs(inputs_const);
size_t input_ctrl_size = 1;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto input_nodes = kernel_graph->inputs();
if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {

@ -99,9 +99,9 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
// get graph id in child graphs by ME front anf node pointer
virtual GraphId GetGraphIdByNode(const AnfNodePtr &) const;
virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
void CheckPSModeConsistence(const KernelGraphPtr &Kernel_graph);
void AssignParamKey(const KernelGraphPtr &kernel_graph);
void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);
bool IsGetNextGraph(const GraphId &graph_id, std::string *channel_name);
virtual bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs,
std::string *error_msg) const {
return true;
@ -195,6 +195,11 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list);
void UpdateGraphDynamicShapeAttr(const NotNull<KernelGraphPtr> &root_graph);
void UpdateAllGraphDynamicShapeAttr(const std::vector<KernelGraphPtr> &all_graphs);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
void CheckPSModeConsistence(const KernelGraphPtr &kernel_graph) const;
void GetBatchElements(const AnfNodePtr &kernel_node) const;
void InitPsWorker(const KernelGraphPtr &kernel_graph);
#endif
std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_;
std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_;
@ -207,6 +212,9 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
#if !defined(_WIN32) && !defined(_WIN64)
std::shared_ptr<Debugger> debugger_;
#endif
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
bool initialized_ps_cache_{false};
#endif
};
using SessionPtr = std::shared_ptr<session::SessionBasic>;

@ -24,6 +24,9 @@
#include "frontend/parallel/device_matrix.h"
#include "frontend/parallel/graph_util/generate_graph.h"
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#endif
namespace mindspore {
namespace parallel {
@ -514,6 +517,12 @@ Status GatherV2PInfo::InferBias() {
if (repeated_calc_num_ > 1) {
rank = rank / repeated_calc_num_;
}
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if (ps::PsDataPrefetch::GetInstance().cache_enable()) {
bias_ = 0;
return SUCCESS;
}
#endif
bias_ = rank / params_strategy.at(1) * slice_size_;
return SUCCESS;
}
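InferBias now short-circuits to bias_ = 0 when the PS cache is enabled, since the cached embedding shard is addressed through the cache rather than by rank offset; otherwise the bias stays rank / params_strategy[1] * slice_size_. A tiny sketch of that branch with illustrative parameter names:

#include <cstdint>

// Row offset of this rank's embedding shard; zero when the PS cache owns addressing.
int64_t InferShardBias(bool cache_enable, int64_t rank, int64_t params_strategy_dim1, int64_t slice_size) {
  if (cache_enable) {
    return 0;
  }
  return rank / params_strategy_dim1 * slice_size;
}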

@ -46,10 +46,18 @@
#include "ir/anf.h"
#include "ir/param_info.h"
#include "ir/tensor.h"
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "ps/util.h"
#endif
namespace mindspore {
namespace parallel {
bool StepAutoParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &) {
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if (ps::Util::IsRoleOfPServer() || ps::Util::IsRoleOfScheduler()) {
return false;
}
#endif
MS_EXCEPTION_IF_NULL(root);
MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance());
std::string parallel_mode = ParallelContext::GetInstance()->parallel_mode();

@ -44,6 +44,9 @@
#include "utils/comm_manager.h"
#include "utils/ms_context.h"
#include "utils/symbolic.h"
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "ps/util.h"
#endif
using mindspore::tensor::Tensor;
@ -3036,6 +3039,11 @@ static void HandleNoUsedParameter(const FuncGraphPtr &root) {
}
bool StepParallel(const FuncGraphPtr &root, const opt::OptimizerPtr &optimizer) {
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if (ps::Util::IsRoleOfPServer() || ps::Util::IsRoleOfScheduler()) {
return false;
}
#endif
MS_EXCEPTION_IF_NULL(root);
MS_EXCEPTION_IF_NULL(optimizer);
MS_EXCEPTION_IF_NULL(ParallelContext::GetInstance());

@ -201,6 +201,7 @@ else ()
if (${ENABLE_IBVERBS} STREQUAL "ON")
target_link_libraries(_c_dataengine PRIVATE ibverbs rdmacm)
endif ()
target_link_libraries(_c_dataengine PRIVATE ps_cache)
endif ()
endif ()

@ -322,6 +322,7 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, con
bool profiling, int32_t *push_time) {
std::vector<device::DataItemGpu> items;
double start_time;
bool ps_data_prefetch = false;
for (int i = 0; i < data_size.size(); i++) {
device::DataItemGpu data_item;
data_item.data_len_ = data_size[i];
@ -334,6 +335,11 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector<size_t> &data_size, con
if (profiling) {
start_time = ProfilingTime::GetCurMilliSecond();
}
// Data prefetch only when PS mode enables cache.
if ((!ps_data_prefetch) && (items.size() > 0)) {
ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, items[0].data_ptr_, items[0].data_len_);
ps_data_prefetch = true;
}
BlockQueueStatus_T ret = GpuBufferMgr::GetInstance().Push(handle, items, WAIT_TIME);
if (profiling) {
double end_time = ProfilingTime::GetCurMilliSecond();
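RetryPushGPUData hands only the first item of the batch to the PS data prefetcher, and only once per push, guarded by a local bool. The control flow, abstracted away from GpuBufferMgr (Prefetch is a no-op placeholder standing in for PsDataPrefetch::PrefetchData):

#include <cstddef>
#include <string>
#include <vector>

struct Item {
  void *data_ptr;
  size_t data_len;
};

// Placeholder for ps::PsDataPrefetch::GetInstance().PrefetchData(...).
void Prefetch(const std::string &channel, void *ptr, size_t len) {
  (void)channel; (void)ptr; (void)len;  // no-op stand-in
}

// Prefetch the first item exactly once, then push the batch on every retry attempt.
void PushWithPrefetch(const std::string &channel, const std::vector<Item> &items, int retries) {
  bool prefetched = false;
  for (int attempt = 0; attempt < retries; ++attempt) {
    if (!prefetched && !items.empty()) {
      Prefetch(channel, items[0].data_ptr, items[0].data_len);
      prefetched = true;
    }
    // ... push `items` to the device queue and break on success ...
  }
}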

@ -24,6 +24,7 @@
#include "minddata/dataset/engine/datasetops/pipeline_op.h"
#include "minddata/dataset/engine/datasetops/repeat_op.h"
#include "minddata/dataset/util/status.h"
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#ifdef ENABLE_TDTQUE
#include "minddata/dataset/util/queue.h"

@ -17,6 +17,8 @@
#include "utils/ms_utils.h"
#include "minddata/dataset/engine/perf/profiling.h"
#include "minddata/dataset/util/log_adapter.h"
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
namespace mindspore {
namespace dataset {
static std::shared_ptr<TdtPlugin> instance_ptr_ = nullptr;
@ -48,6 +50,10 @@ TdtStatus TdtPlugin::hostPush(TensorRow ts_row, bool is_wait, std::string channe
if (profiling) {
start_time = ProfilingTime::GetCurMilliSecond();
}
// Data prefetch only when PS mode enables cache.
if (items.size() > 0) {
ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name, items[0].dataPtr_.get(), items[0].dataLen_);
}
if (tdt::TdtHostPushData(channel_name, items) != 0) {
return FAILED;
}

@ -308,7 +308,14 @@ PYBIND11_MODULE(_c_expression, m) {
.def("is_role_worker", &PSContext::is_role_worker, "Get whether the role of this process is Worker.")
.def("is_role_pserver", &PSContext::is_role_pserver, "Get whether the role of this process is PServer.")
.def("is_role_sched", &PSContext::is_role_sched, "Get whether the role of this process is Scheduler.")
.def("ps_rank_id", &PSContext::ps_rank_id, "Get Worker and PServer rank id.");
.def("ps_rank_id", &PSContext::ps_rank_id, "Get Worker and PServer rank id.")
.def("insert_hash_table_size", &PSContext::InsertHashTableSize, "Insert hash table size.")
.def("reinsert_hash_table_size", &PSContext::ReInsertHashTableSize,
"Insert hash table size with new parameter name.")
.def("insert_weight_init_info", &PSContext::InsertWeightInitInfo, "Insert embedding table initialization seed.")
.def("insert_accumu_init_info", &PSContext::InsertAccumuInitInfo, "Insert accumulation initialization value.")
.def("clone_hash_table", &PSContext::CloneHashTable, "Clone a hash table.")
.def("set_cache_enable", &PSContext::set_cache_enable, "Set ps mode cache enable or not.");
(void)py::class_<OpInfoLoaderPy, std::shared_ptr<OpInfoLoaderPy>>(m, "OpInfoLoaderPy")
.def(py::init())
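The new PSContext bindings follow the usual pybind11 pattern of chaining .def calls on a py::class_. A minimal, self-contained module showing that shape for a made-up context class (DemoContext is illustrative, not the real PSContext):

#include <pybind11/pybind11.h>
#include <string>
#include <unordered_map>

namespace py = pybind11;

// Made-up stand-in for PSContext, just to show the binding shape.
class DemoContext {
 public:
  void InsertHashTableSize(const std::string &name, size_t size) { sizes_[name] = size; }
  void set_cache_enable(bool enable) { cache_enable_ = enable; }
  bool cache_enable() const { return cache_enable_; }

 private:
  std::unordered_map<std::string, size_t> sizes_;
  bool cache_enable_{false};
};

PYBIND11_MODULE(demo_ps, m) {
  py::class_<DemoContext>(m, "DemoContext")
      .def(py::init<>())
      .def("insert_hash_table_size", &DemoContext::InsertHashTableSize, "Insert hash table size.")
      .def("set_cache_enable", &DemoContext::set_cache_enable, "Enable or disable the PS cache.")
      .def("cache_enable", &DemoContext::cache_enable, "Get whether the PS cache is enabled.");
}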

@ -52,6 +52,7 @@
#include "ps/common.h"
#include "ps/util.h"
#include "ps/worker.h"
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#endif
#if (ENABLE_GE || ENABLE_D)
@ -921,6 +922,11 @@ bool InitExecDataset(const std::string &queue_name, int64_t iter_num, int64_t ba
bool InitExecDatasetVm(const std::string &queue_name, int64_t size, int64_t batch_size,
const std::vector<TypePtr> &types, const std::vector<std::vector<int64_t>> &shapes,
const std::vector<int64_t> &input_indexes, bool need_run) {
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if ((ps::Util::IsParamServerMode()) && (!ps::Util::IsRoleOfWorker())) {
return true;
}
#endif
MS_LOG(INFO) << "Start InitDataSet Entry";
ShapeVector int_input_indexes;
(void)std::transform(input_indexes.begin(), input_indexes.end(), std::back_inserter(int_input_indexes),
@ -966,7 +972,17 @@ bool InitExecDatasetVm(const std::string &queue_name, int64_t size, int64_t batc
if (MsContext::GetInstance()->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
backend->Link(runner.graph_id);
}
ConfigManager::GetInstance().set_iter_num(size);
// PS mode does not support loop sink.
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if (ps::Util::IsRoleOfWorker()) {
ps::PsDataPrefetch::GetInstance().CreateDataChannel(queue_name, LongToSize(size));
ConfigManager::GetInstance().set_iter_num(1);
} else {
#endif
ConfigManager::GetInstance().set_iter_num(size);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
}
#endif
if (!(*runner.run)) {
// empty function
@ -981,7 +997,7 @@ bool InitExecDatasetVm(const std::string &queue_name, int64_t size, int64_t batc
}
MS_LOG(DEBUG) << "InitDataSetVm End.";
return true;
}
} // namespace pipeline
void ResetOpId() { mindspore::id_generator::reset_id(); }
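The PS-mode branch in InitExecDatasetVm above creates the cache data channel on the worker and pins the loop-sink iteration count to 1, while every other role keeps the original sink size. That decision reduced to a small helper, with a placeholder standing in for PsDataPrefetch::CreateDataChannel:

#include <cstddef>
#include <cstdint>
#include <string>

// Placeholder for ps::PsDataPrefetch::GetInstance().CreateDataChannel(...).
void CreateDataChannel(const std::string &queue_name, size_t step_num) {
  (void)queue_name; (void)step_num;  // no-op stand-in
}

// PS workers own the cache data channel and disable loop sink; other roles keep `size`.
int64_t ConfigureDatasetIterations(bool is_ps_worker, const std::string &queue_name, int64_t size) {
  if (is_ps_worker) {
    CreateDataChannel(queue_name, static_cast<size_t>(size));
    return 1;
  }
  return size;
}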

@ -14,22 +14,20 @@ if (NOT (ENABLE_CPU AND (ENABLE_D OR ENABLE_GPU)))
list(REMOVE_ITEM _PS_SRC_FILES "core/cluster_config.cc")
list(REMOVE_ITEM _PS_SRC_FILES "core/node.cc")
list(REMOVE_ITEM _PS_SRC_FILES "core/node_manager.cc")
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/ps_cache_manager.cc")
endif ()
if (NOT ENABLE_D)
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/ascend/ascend_ps_cache.cc")
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/ps_cache_manager.cc")
endif()
if (NOT ENABLE_GPU)
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/gpu/gpu_ps_cache.cc")
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/ps_cache_manager.cc")
endif()
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/ps_data/ps_data_prefetch.cc")
list(REMOVE_ITEM _PS_SRC_FILES "ps_cache/ps_data/ps_data_channel.cc")
add_subdirectory(ps_cache)
set_property(SOURCE ${_PS_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PS)
add_library(_mindspore_ps_obj OBJECT ${_PS_SRC_FILES})

@ -64,6 +64,7 @@ constexpr int64_t kInitWeightToOptimIdCmd = 11;
constexpr int64_t kInitOptimInputsShapeCmd = 12;
constexpr int64_t kInitKeyToPushNodeIdCmd = 13;
constexpr int64_t kInitEmbeddingsCmd = 20;
constexpr int64_t kUpdateEmbeddingsCmd = 21;
constexpr int64_t kCheckReadyForPushCmd = 25;
constexpr int64_t kCheckReadyForPullCmd = 26;
constexpr int64_t kEmbeddingLookupCmd = 30;

@ -51,6 +51,8 @@
#include "backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h"
#include "backend/kernel_compiler/cpu/ps/apply_momentum_ps_kernel.h"
#include "backend/kernel_compiler/cpu/ps/embedding_look_up_ps_kernel.h"
#include "ps/ps_cache/ps_data/ps_data_prefetch.h"
#include "ps/random_normal/random_normal.h"
namespace mindspore {
namespace ps {
@ -100,6 +102,7 @@ class ParameterServer {
void HandleCheckReadyForPush(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleCheckReadyForPull(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleUpdateEmbeddings(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
void HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data, ::ps::KVPairs<T> *res);
ParameterServer *ps_;
@ -118,13 +121,15 @@ class ParameterServer {
void InitWeight(const Key &key, const WeightPtr &weight);
void InitGrad(const Key &key, const GradPtr &grad);
void InitEmbeddingTable(const Key &key,
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes);
const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes,
const ParamInitInfo &param_init_info);
bool HasWeight(const Key &key);
void Finalize();
void UpdateWeights();
void AccumGrad(const Keys &key, const Values &values, const Lengths &lengths);
WeightPtr weight(const Key &key);
void DoEmbeddingLookup(Key key, const LookupIds &lookup_ids, ::ps::KVPairs<T> *res);
void UpdateEmbeddings(const Key &key, const LookupIds &lookup_ids, const Values &vals);
bool ReadyForUpdateWeights();
bool ReadyForPush(const Key &key);
bool ReadyForPull(const Key &key);
@ -193,6 +198,7 @@ void ParameterServer<T>::ServerHandler::Init() {
handlers_[kCheckReadyForPushCmd] = &ServerHandler::HandleCheckReadyForPush;
handlers_[kCheckReadyForPullCmd] = &ServerHandler::HandleCheckReadyForPull;
handlers_[kEmbeddingLookupCmd] = &ServerHandler::HandleEmbeddingLookup;
handlers_[kUpdateEmbeddingsCmd] = &ServerHandler::HandleUpdateEmbeddings;
handlers_[kFinalizeCmd] = &ServerHandler::HandleFinalize;
}
@ -302,7 +308,17 @@ void ParameterServer<T>::ServerHandler::HandleInitEmbeddings(const ::ps::KVMeta
for (int64_t k = 0; k < lens[2]; k++) {
output_shape->push_back(static_cast<size_t>(req_data.vals[index++]));
}
ps_->InitEmbeddingTable(key, shapes);
ParamInitInfo param_init_info;
if (ps::PsDataPrefetch::GetInstance().cache_enable()) {
param_init_info.param_type_ = static_cast<ParamType>(lens[3]);
if (param_init_info.param_type_ == kWeight) {
param_init_info.global_seed_ = static_cast<size_t>(lens[4]);
param_init_info.op_seed_ = static_cast<size_t>(lens[5]);
} else if (param_init_info.param_type_ == kAccumulation) {
param_init_info.init_val_ = req_data.vals[index];
}
}
ps_->InitEmbeddingTable(key, shapes, param_init_info);
}
template <typename T>
@ -338,6 +354,18 @@ void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta
ps_->DoEmbeddingLookup(key, req_data.keys.segment(1, req_data.keys.size()), res);
}
template <typename T>
void ParameterServer<T>::ServerHandler::HandleUpdateEmbeddings(const ::ps::KVMeta &req_meta,
const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res) {
std::unique_lock<std::mutex> lock(ps_->mutex());
MS_EXCEPTION_IF_NULL(res);
const Key &key = req_data.keys[0];
const LookupIds &lookup_ids = req_data.keys.segment(1, req_data.keys.size());
const Values &update_vals = req_data.vals;
ps_->UpdateEmbeddings(key, lookup_ids, update_vals);
}
template <typename T>
void ParameterServer<T>::ServerHandler::HandleFinalize(const ::ps::KVMeta &req_meta, const ::ps::KVPairs<T> &req_data,
::ps::KVPairs<T> *res) {
@ -476,7 +504,8 @@ void ParameterServer<T>::InitGrad(const Key &key, const GradPtr &grad) {
template <typename T>
void ParameterServer<T>::InitEmbeddingTable(
const Key &key, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) {
const Key &key, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes,
const ParamInitInfo &param_init_info) {
MS_EXCEPTION_IF_NULL(shapes);
if (weights_.count(key) == 0) {
std::shared_ptr<PServerKernel> lookup =
@ -493,8 +522,18 @@ void ParameterServer<T>::InitEmbeddingTable(
T *embedding_data = embedding->data();
std::default_random_engine engine;
std::normal_distribution<float> random(0, 0.01);
for (size_t i = 0; i < total_dims; i++) {
embedding_data[i] = random(engine);
if (ps::PsDataPrefetch::GetInstance().cache_enable()) {
if (param_init_info.param_type_ == kWeight) {
InitRandomNormal(0, 0.01, input_shapes, param_init_info.global_seed_, param_init_info.op_seed_, embedding_data);
} else if (param_init_info.param_type_ == kAccumulation) {
for (size_t i = 0; i < total_dims; i++) {
embedding_data[i] = param_init_info.init_val_;
}
}
} else {
for (size_t i = 0; i < total_dims; i++) {
embedding_data[i] = random(engine);
}
}
weights_[key] = embedding;
tokens_[key] = 0;
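With the cache enabled, the embedding table is no longer filled from an unseeded std::default_random_engine: weights are initialized from the recorded global/op seeds (InitRandomNormal in the diff) and accumulators from a constant, so the server-side table matches what the worker would have produced. A sketch of that split, using <random> directly in place of InitRandomNormal:

#include <cstddef>
#include <random>

enum class ParamType { kUnknown, kWeight, kAccumulation };

// Fill data[0, total_dims): seeded N(0, 0.01) for weights, a constant for accumulators.
void InitEmbeddingData(float *data, size_t total_dims, ParamType type,
                       size_t global_seed, size_t op_seed, float init_val) {
  if (type == ParamType::kWeight) {
    std::seed_seq seq{global_seed, op_seed};
    std::mt19937 engine(seq);
    std::normal_distribution<float> dist(0.0f, 0.01f);
    for (size_t i = 0; i < total_dims; ++i) {
      data[i] = dist(engine);
    }
  } else if (type == ParamType::kAccumulation) {
    for (size_t i = 0; i < total_dims; ++i) {
      data[i] = init_val;
    }
  }
}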
@ -673,6 +712,23 @@ void ParameterServer<T>::DoEmbeddingLookup(Key key, const LookupIds &lookup_ids,
res->lens.push_back(res->vals.size());
}
template <typename T>
void ParameterServer<T>::UpdateEmbeddings(const Key &key, const LookupIds &lookup_ids, const Values &vals) {
if (weights_.count(key) == 0) {
MS_LOG(ERROR) << "Invalid embedding table key " << key;
return;
}
if (embedding_lookup_ops_.count(key) == 0) {
MS_LOG(ERROR) << "Invalid embedding lookup op key " << key;
return;
}
WeightPtr table_ptr = weights_[key];
MS_EXCEPTION_IF_NULL(table_ptr);
std::shared_ptr<PServerKernel> table_lookup_op = embedding_lookup_ops_[key];
MS_EXCEPTION_IF_NULL(table_lookup_op);
table_lookup_op->UpdateEmbeddings(table_ptr->data(), lookup_ids.data(), vals.data(), lookup_ids.size());
}
template <typename T>
inline bool ParameterServer<T>::ReadyForUpdateWeights() {
return grads_accum_counter_.size() > 0 && grad_accum_count_ == grads_accum_counter_.size();

Some files were not shown because too many files have changed in this diff.