!5413 Combine sparse embedding gradient

Merge pull request !5413 from chengang/combine_grad
pull/5413/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 2b86c92fec

@ -71,8 +71,8 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
const std::vector<size_t> &indices_shape = *(shape_vec[0]); const std::vector<size_t> &indices_shape = *(shape_vec[0]);
indices_size_ = indices_shape[0]; indices_size_ = indices_shape[0];
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_;
workspace_size_list_[1] = indices_size_ * sizeof(int); workspace_size_list_[1] = indices_size_ * sizeof(int) * worker_num_;
} }
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) { void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
@ -85,10 +85,6 @@ void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
bool SparseApplyAdamPSKernel::Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, bool SparseApplyAdamPSKernel::Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) { const std::vector<AddressPtr> &outputs) {
ReInit(inputs); ReInit(inputs);
int *indices = reinterpret_cast<int *>(inputs[10]->addr);
for (size_t i = 0; i < inputs[10]->size / sizeof(int); i++) {
indices[i] -= row_offset_;
}
return Launch(inputs, workspace, outputs); return Launch(inputs, workspace, outputs);
} }

@ -74,15 +74,15 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
std::vector<size_t> indices_shape = *(shape_vec[0]); std::vector<size_t> indices_shape = *(shape_vec[0]);
indices_size_ = indices_shape[0]; indices_size_ = indices_shape[0];
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_;
workspace_size_list_[1] = indices_size_ * sizeof(int); workspace_size_list_[1] = indices_size_ * sizeof(int) * worker_num_;
} }
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) { void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[4]; const auto &indices_addr = inputs[4];
indices_size_ = indices_addr->size / sizeof(int); indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_;
workspace_size_list_[1] = indices_size_ * sizeof(int); workspace_size_list_[1] = indices_size_ * sizeof(int) * worker_num_;
} }
bool SparseApplyFtrlPSKernel::Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, bool SparseApplyFtrlPSKernel::Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,

@ -71,15 +71,15 @@ void SparseApplyLazyAdamPSKernel::ReInit(
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
const std::vector<size_t> &indices_shape = *(shape_vec[0]); const std::vector<size_t> &indices_shape = *(shape_vec[0]);
indices_size_ = indices_shape[0]; indices_size_ = indices_shape[0];
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_;
workspace_size_list_[1] = indices_size_ * sizeof(int); workspace_size_list_[1] = indices_size_ * sizeof(int) * worker_num_;
} }
void SparseApplyLazyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) { void SparseApplyLazyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[10]; const auto &indices_addr = inputs[10];
indices_size_ = indices_addr->size / sizeof(int); indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_;
workspace_size_list_[1] = indices_size_ * sizeof(int); workspace_size_list_[1] = indices_size_ * sizeof(int) * worker_num_;
} }
bool SparseApplyLazyAdamPSKernel::Execute(const std::vector<AddressPtr> &inputs, bool SparseApplyLazyAdamPSKernel::Execute(const std::vector<AddressPtr> &inputs,

@ -16,6 +16,7 @@
#include "frontend/parallel/ps/optimizer_info.h" #include "frontend/parallel/ps/optimizer_info.h"
#include <memory> #include <memory>
#include "frontend/parallel/ps/util.h"
namespace mindspore { namespace mindspore {
namespace parallel { namespace parallel {
@ -30,6 +31,8 @@ const std::vector<AddressPtr> &OptimizerInfo::outputs() { return outputs_; }
bool OptimizerInfo::IsSparse() const { return false; } bool OptimizerInfo::IsSparse() const { return false; }
const size_t OptimizerInfo::indice_size() const { return 0; }
size_t OptimizerInfo::grad_index() { return 0; } size_t OptimizerInfo::grad_index() { return 0; }
size_t OptimizerInfo::indices_index() { return 0; } size_t OptimizerInfo::indices_index() { return 0; }
@ -57,7 +60,8 @@ void DenseOptimInfo::Accumulate(const Values &values, const Lengths &lengths) {
} }
} }
void DenseOptimInfo::ComputeMean(size_t n) { void DenseOptimInfo::ComputeMean(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &, size_t n,
size_t server_num, size_t rank_id) {
if (n > 1) { if (n > 1) {
float *accum_grad_data = reinterpret_cast<float *>(gradient()->addr); float *accum_grad_data = reinterpret_cast<float *>(gradient()->addr);
size_t size = gradient()->size / sizeof(float); size_t size = gradient()->size / sizeof(float);
@ -96,15 +100,88 @@ void SparseOptimInfo::Accumulate(const Values &values, const Lengths &lengths) {
for (size_t i = 0; i < indices_index; i++) { for (size_t i = 0; i < indices_index; i++) {
indice_offset += lengths[i]; indice_offset += lengths[i];
} }
int *incr_indice_data = reinterpret_cast<int *>(values.data() + indice_offset); float *incr_indice_data = values.data() + indice_offset;
size_t incr_indice_size = lengths[indices_index] * sizeof(float); size_t incr_indice_size = lengths[indices_index];
size_t incr_indice_data_size = incr_indice_size * sizeof(int);
int *converted_indices = new int[incr_indice_size];
for (size_t i = 0; i < incr_indice_size; i++) {
converted_indices[i] = static_cast<int>(incr_indice_data[i]);
}
auto ret2 = memcpy_s(accum_indices_data + indices_offset_, incr_indice_size, incr_indice_data, incr_indice_size); auto ret2 =
memcpy_s(accum_indices_data + indices_offset_, incr_indice_data_size, converted_indices, incr_indice_data_size);
if (ret2 != 0) { if (ret2 != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret2 << ")"; MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret2 << ")";
} }
delete[] converted_indices;
indices_offset_ += lengths[indices_index]; indices_offset_ += lengths[indices_index];
indices()->size += incr_indice_size; indices()->size += incr_indice_data_size;
}
void SparseOptimInfo::ComputeMean(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes,
size_t n, size_t server_num, size_t rank_id) {
size_t indices_size = static_cast<size_t>(indices()->size / sizeof(int));
int segment_size = gradient()->size / indices()->size;
float *new_grad = new float[indices_size * segment_size];
int *new_indices = new int[indices_size];
mindspore::kernel::SparseGradient<int> unique_sparse_grad({new_grad, new_indices, indices_size});
const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes;
if (shape_vec.size() < 2 || shape_vec[1] == nullptr) {
MS_LOG(EXCEPTION) << "No input shape found";
}
auto input_shapes = shape_vec.size() > 0 ? shape_vec[1] : nullptr;
MS_EXCEPTION_IF_NULL(input_shapes);
if (input_shapes->size() == 0) {
MS_LOG(EXCEPTION) << "Invalid input shapes";
}
int first_dim_size = input_shapes->front();
int outer_dim_size = segment_size;
if (first_dim_size == 0 || outer_dim_size == 0) {
MS_LOG(ERROR) << "Invalid first dim size";
}
float *grad_data = reinterpret_cast<float *>(gradient()->addr);
int *indices_data = reinterpret_cast<int *>(indices()->addr);
size_t original_row_count = input_shapes->front();
if (original_row_count > 0) {
size_t offset = 0;
if ((original_row_count % server_num) == 0) {
offset = original_row_count / server_num * rank_id;
} else {
offset = std::round((static_cast<float>(original_row_count)) / server_num) * rank_id;
}
for (size_t i = 0; i < indices_size; i++) {
indices_data[i] -= offset;
}
}
Util::ReduceSparseGradient(grad_data, indices_data, indices_size, segment_size, first_dim_size, outer_dim_size,
&unique_sparse_grad);
int reduced_grad_size = unique_sparse_grad.indices_size_ * segment_size * sizeof(float);
auto ret = memcpy_s(gradient()->addr, reduced_grad_size, unique_sparse_grad.value_, reduced_grad_size);
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
}
int reduced_indice_size = unique_sparse_grad.indices_size_ * sizeof(int);
ret = memcpy_s(indices()->addr, reduced_indice_size, unique_sparse_grad.indices_, reduced_indice_size);
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
}
gradient()->size = reduced_grad_size;
indices()->size = reduced_indice_size;
for (size_t i = 0; i < unique_sparse_grad.indices_size_ * segment_size; i++) {
grad_data[i] = grad_data[i] / n;
}
delete[] new_grad;
delete[] new_indices;
} }
void SparseOptimInfo::Reset() { void SparseOptimInfo::Reset() {
@ -135,6 +212,8 @@ void MomentumOptimInfo::Update(const Values &values, const Lengths &lens) {
} }
} }
const size_t SparseOptimInfo::indice_size() const { return indices_offset_; }
const AddressPtr &MomentumOptimInfo::gradient() { return inputs_[3]; } const AddressPtr &MomentumOptimInfo::gradient() { return inputs_[3]; }
const AddressPtr &MomentumOptimInfo::indices() { return inputs_[3]; } const AddressPtr &MomentumOptimInfo::indices() { return inputs_[3]; }

@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_PS_OPTIMIZER_INFO_H_ #define MINDSPORE_CCSRC_FRONTEND_PARALLEL_PS_OPTIMIZER_INFO_H_
#include <vector> #include <vector>
#include <memory>
#include "backend/kernel_compiler/kernel.h" #include "backend/kernel_compiler/kernel.h"
#include "frontend/parallel/ps/common.h" #include "frontend/parallel/ps/common.h"
@ -33,12 +34,14 @@ class OptimizerInfo {
virtual void Update(const Values &values, const Lengths &lengths) {} virtual void Update(const Values &values, const Lengths &lengths) {}
virtual void UpdateWeight(const WeightPtr &weight); virtual void UpdateWeight(const WeightPtr &weight);
virtual void Accumulate(const Values &values, const Lengths &lengths) = 0; virtual void Accumulate(const Values &values, const Lengths &lengths) = 0;
virtual void ComputeMean(size_t n) {} virtual void ComputeMean(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes, size_t n,
size_t server_num, size_t rank_id) {}
virtual void Reset() {} virtual void Reset() {}
void AddWorkspace(const AddressPtr &workspace); void AddWorkspace(const AddressPtr &workspace);
virtual const AddressPtr &gradient() = 0; virtual const AddressPtr &gradient() = 0;
virtual const AddressPtr &indices() = 0; virtual const AddressPtr &indices() = 0;
virtual const size_t indice_size() const;
const std::vector<AddressPtr> &inputs(); const std::vector<AddressPtr> &inputs();
const std::vector<AddressPtr> &workspaces(); const std::vector<AddressPtr> &workspaces();
const std::vector<AddressPtr> &outputs(); const std::vector<AddressPtr> &outputs();
@ -59,7 +62,8 @@ class DenseOptimInfo : public OptimizerInfo {
~DenseOptimInfo() override = default; ~DenseOptimInfo() override = default;
void Accumulate(const Values &values, const Lengths &lens) override; void Accumulate(const Values &values, const Lengths &lens) override;
void ComputeMean(size_t n) override; void ComputeMean(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes, size_t n,
size_t server_num, size_t rank_id) override;
void Reset() override; void Reset() override;
}; };
@ -69,7 +73,10 @@ class SparseOptimInfo : public OptimizerInfo {
~SparseOptimInfo() override = default; ~SparseOptimInfo() override = default;
void Accumulate(const Values &values, const Lengths &lens) override; void Accumulate(const Values &values, const Lengths &lens) override;
void ComputeMean(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes, size_t n,
size_t server_num, size_t rank_id) override;
void Reset() override; void Reset() override;
const size_t indice_size() const override;
protected: protected:
size_t grads_offset_{0}; size_t grads_offset_{0};

@ -136,15 +136,21 @@ OptimizerInfo *SparseAdamOptimInfoBuilder::BuildInputs(const WeightPtr &weight,
const std::shared_ptr<std::vector<size_t>> &indices_shape = (*inputs_shape)[10]; const std::shared_ptr<std::vector<size_t>> &indices_shape = (*inputs_shape)[10];
size_t total_indice_size = size_t total_indice_size =
std::accumulate((*indices_shape).begin(), (*indices_shape).end(), sizeof(float), std::multiplies<size_t>()); std::accumulate((*indices_shape).begin(), (*indices_shape).end(), sizeof(int), std::multiplies<size_t>());
AddressPtr indices = std::make_shared<kernel::Address>(); AddressPtr indices = std::make_shared<kernel::Address>();
indices->addr = new float[total_indice_size * worker_num]; indices->addr = new int[total_indice_size * worker_num];
ret = memcpy_s(indices->addr, lens[7] * sizeof(float), reinterpret_cast<float *>(epsilon->addr) + lens[5] + lens[6], int *converted_indices = new int[lens[7]];
lens[7] * sizeof(float)); size_t indices_data_size = lens[7] * sizeof(int);
float *indices_data = reinterpret_cast<float *>(epsilon->addr) + lens[5] + lens[6];
for (int i = 0; i < lens[7]; i++) {
converted_indices[i] = static_cast<int>(indices_data[i]);
}
ret = memcpy_s(indices->addr, indices_data_size, converted_indices, indices_data_size);
if (ret != 0) { if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
} }
indices->size = lens[7] * sizeof(int); indices->size = indices_data_size;
delete[] converted_indices;
return new SparseAdamOptimInfo(weight_addr, m, v, beta1_power, beta2_power, learning_rate, beta1, beta2, epsilon, return new SparseAdamOptimInfo(weight_addr, m, v, beta1_power, beta2_power, learning_rate, beta1, beta2, epsilon,
grad, indices); grad, indices);
@ -185,13 +191,19 @@ OptimizerInfo *SparseFtrlOptimInfoBuilder::BuildInputs(const WeightPtr &weight,
size_t total_indice_size = size_t total_indice_size =
std::accumulate((*indices_shape).begin(), (*indices_shape).end(), 1, std::multiplies<size_t>()); std::accumulate((*indices_shape).begin(), (*indices_shape).end(), 1, std::multiplies<size_t>());
AddressPtr indices = std::make_shared<kernel::Address>(); AddressPtr indices = std::make_shared<kernel::Address>();
indices->addr = new float[total_indice_size * worker_num]; indices->addr = new int[total_indice_size * worker_num];
ret = memcpy_s(indices->addr, lens[1] * sizeof(float), reinterpret_cast<float *>(values.data()) + lens[0], int *converted_indices = new int[lens[1]];
lens[1] * sizeof(float)); size_t indices_data_size = lens[1] * sizeof(int);
float *indices_data = reinterpret_cast<float *>(values.data()) + lens[0];
for (int i = 0; i < lens[1]; i++) {
converted_indices[i] = static_cast<int>(indices_data[i]);
}
ret = memcpy_s(indices->addr, indices_data_size, converted_indices, indices_data_size);
if (ret != 0) { if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")";
} }
indices->size = lens[1] * sizeof(int); indices->size = indices_data_size;
delete[] converted_indices;
return new SparseFtrlOptimInfo(weight_addr, accum, linear, grad, indices); return new SparseFtrlOptimInfo(weight_addr, accum, linear, grad, indices);
} }

@ -145,6 +145,7 @@ class ParameterServer {
std::unordered_map<Key, std::shared_ptr<PServerKernel>> optimizers_; std::unordered_map<Key, std::shared_ptr<PServerKernel>> optimizers_;
std::unordered_map<Key, InputsShapePtr> optim_inputs_shape_; std::unordered_map<Key, InputsShapePtr> optim_inputs_shape_;
std::unordered_map<Key, InputsShapePtr> original_optim_inputs_shape_;
std::unordered_map<Key, std::shared_ptr<OptimizerInfo>> optim_infos_; std::unordered_map<Key, std::shared_ptr<OptimizerInfo>> optim_infos_;
std::unordered_map<std::string, std::shared_ptr<OptimizerInfoBuilder>> optim_info_builders_; std::unordered_map<std::string, std::shared_ptr<OptimizerInfoBuilder>> optim_info_builders_;
std::unordered_map<Key, std::string> weight_key_to_optims_; std::unordered_map<Key, std::string> weight_key_to_optims_;
@ -366,19 +367,24 @@ void ParameterServer<T>::InitWeightKeyToOptims(const Key &key, const int &optim_
template <typename T> template <typename T>
void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &values, const Lengths &lengths) { void ParameterServer<T>::InitOptimInputsShape(const Keys &keys, const Values &values, const Lengths &lengths) {
InputsShapePtr inputs_shape = std::make_shared<InputsShape>(); InputsShapePtr inputs_shape = std::make_shared<InputsShape>();
InputsShapePtr original_inputs_shape = std::make_shared<InputsShape>();
int val_idx = 0; int val_idx = 0;
const Key &key = keys[0]; const Key &key = keys[0];
MS_LOG(INFO) << "Initializing optimizer inputs shape for key:" << key; MS_LOG(INFO) << "Initializing optimizer inputs shape for key:" << key;
if (optim_inputs_shape_.count(key) == 0) { if (optim_inputs_shape_.count(key) == 0) {
original_optim_inputs_shape_[key] = original_inputs_shape;
optim_inputs_shape_[key] = inputs_shape; optim_inputs_shape_[key] = inputs_shape;
} }
for (size_t i = 0; i < keys.size(); i++) { for (size_t i = 0; i < keys.size(); i++) {
auto shape = std::make_shared<std::vector<size_t>>(); auto shape = std::make_shared<std::vector<size_t>>();
auto original_shape = std::make_shared<std::vector<size_t>>();
inputs_shape->push_back(shape); inputs_shape->push_back(shape);
original_inputs_shape->push_back(original_shape);
int len = lengths[i]; int len = lengths[i];
for (int j = 0; j < len; j++) { for (int j = 0; j < len; j++) {
shape->push_back(values[val_idx++]); shape->push_back(values[val_idx]);
original_shape->push_back(values[val_idx++]);
} }
} }
if (weight_key_to_optims_.count(key) > 0) { if (weight_key_to_optims_.count(key) > 0) {
@ -512,7 +518,19 @@ void ParameterServer<T>::UpdateWeights() {
const std::vector<kernel::AddressPtr> &workspaces = optim_info->workspaces(); const std::vector<kernel::AddressPtr> &workspaces = optim_info->workspaces();
const std::vector<kernel::AddressPtr> &outputs = optim_info->outputs(); const std::vector<kernel::AddressPtr> &outputs = optim_info->outputs();
optim_info->ComputeMean(worker_num_); std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> shapes =
std::make_shared<std::vector<std::shared_ptr<std::vector<size_t>>>>();
std::shared_ptr<std::vector<size_t>> indices_shape = std::make_shared<std::vector<size_t>>();
indices_shape->emplace_back(optim_info->indice_size());
shapes->push_back(indices_shape);
if (original_optim_inputs_shape_.count(key) != 0) {
for (auto &input_shapes : *(original_optim_inputs_shape_[key])) {
shapes->push_back(input_shapes);
}
}
optimizer->ReInit(shapes);
optim_info->ComputeMean(shapes, worker_num_, pserver_num_, rank_id_);
optimizer->Execute(inputs, workspaces, outputs); optimizer->Execute(inputs, workspaces, outputs);
optim_info->Reset(); optim_info->Reset();
if (!is_embedding_[key]) { if (!is_embedding_[key]) {

@ -146,6 +146,32 @@ int Util::LocalShard(int first_dim, int rank_id, int server_num) {
void Util::SetRankId(int rank_id) { rank_id_ = rank_id; } void Util::SetRankId(int rank_id) { rank_id_ = rank_id; }
int Util::GetRankId() { return rank_id_; } int Util::GetRankId() { return rank_id_; }
void Util::ReduceSparseGradient(float *gradients, int *indices, const size_t indices_size, size_t segment_size,
const size_t first_dim_size, const size_t outer_dim_size,
mindspore::kernel::SparseGradient<int> *unique_sparse_grad) {
size_t slice_segment_size = indices_size * segment_size;
auto workspace_grad = new float[slice_segment_size];
auto workspace_indices = new int[indices_size];
MS_EXCEPTION_IF_NULL(gradients);
MS_EXCEPTION_IF_NULL(indices);
MS_EXCEPTION_IF_NULL(workspace_grad);
MS_EXCEPTION_IF_NULL(workspace_indices);
mindspore::kernel::SparseGradient<int> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size});
mindspore::kernel::SparseGradient<int> input_sparse_grad({gradients, indices, indices_size});
mindspore::kernel::ReduceSparseGradientParam<int> param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = unique_sparse_grad;
param.max_index_ = first_dim_size;
param.value_stride_ = outer_dim_size;
mindspore::kernel::SparseOptimizerCPUKernel::BucketReduceSparseGradient(param);
delete[] workspace_grad;
delete[] workspace_indices;
}
} // namespace ps } // namespace ps
} // namespace parallel } // namespace parallel
} // namespace mindspore } // namespace mindspore

@ -21,6 +21,8 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include "backend/session/anf_runtime_algorithm.h" #include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/cpu/sparse_optimizer_cpu_kernel.h"
namespace mindspore { namespace mindspore {
namespace parallel { namespace parallel {
@ -39,6 +41,9 @@ class Util {
static int LocalShard(int first_dim, int rank_id, int server_num); static int LocalShard(int first_dim, int rank_id, int server_num);
static void SetRankId(int rank_id); static void SetRankId(int rank_id);
static int GetRankId(); static int GetRankId();
static void ReduceSparseGradient(float *gradients, int *indices, const size_t indices_size, size_t segment_size,
const size_t first_dim_size, const size_t outer_dim_size,
mindspore::kernel::SparseGradient<int> *unique_sparse_grad);
private: private:
static std::unordered_map<std::string, int> optimizer_to_ids; static std::unordered_map<std::string, int> optimizer_to_ids;

@ -96,6 +96,32 @@ void Worker<T>::Run() {
template <typename T> template <typename T>
void Worker<T>::Push(const std::vector<size_t> &keys, std::vector<uintptr_t> addrs, const ShapeVector &sizes) { void Worker<T>::Push(const std::vector<size_t> &keys, std::vector<uintptr_t> addrs, const ShapeVector &sizes) {
if (keys.size() == 0) {
MS_LOG(EXCEPTION) << "key size should be greater than zero";
}
if (key_to_optimId_.count(keys[0]) == 0) {
MS_LOG(EXCEPTION) << "no optim id found for key" << keys[0];
}
Key key = keys[0];
int optim_id = key_to_optimId_[key];
bool is_sparse = false;
if (optim_id == 1 || optim_id == 2 || optim_id == 3) {
is_sparse = true;
}
int grad_index = -1;
int indice_index = -1;
// Sparse adam gradient
if (optim_id == 1 || optim_id == 2) {
grad_index = 6;
indice_index = 7;
// Sparse ftrl gradient
} else if (optim_id == 3) {
grad_index = 0;
indice_index = 1;
}
size_t total_size = 0; size_t total_size = 0;
for (auto size : sizes) { for (auto size : sizes) {
total_size += size; total_size += size;
@ -110,10 +136,22 @@ void Worker<T>::Push(const std::vector<size_t> &keys, std::vector<uintptr_t> add
} }
offset += sizes[i] * sizeof(T); offset += sizes[i] * sizeof(T);
} }
while (!kv_worker_->IsReadyForPush(keys[0])) { while (!kv_worker_->IsReadyForPush(keys[0])) {
continue; continue;
} }
if (!is_sparse) {
kv_worker_->PushData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray<int>(sizes)); kv_worker_->PushData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray<int>(sizes));
} else {
std::vector<int> &var_shape = key_to_optim_shapes_[key][0];
int first_dim_size = var_shape[0];
int outer_dim_size = 1;
for (size_t i = 1; i < var_shape.size(); ++i) {
outer_dim_size *= var_shape[i];
}
kv_worker_->PushSparseData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray<int>(sizes), grad_index,
indice_index, first_dim_size, outer_dim_size);
}
} }
template <typename T> template <typename T>

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save