From a719a9fe3a3606878ca739ae9542fff817a92435 Mon Sep 17 00:00:00 2001
From: kswang
Date: Tue, 7 Jul 2020 14:16:40 +0800
Subject: [PATCH] add two level reduce sparse gradient

---
 mindspore/ccsrc/kernel/common_utils.cc        | 69 +++++++++++++++++++
 mindspore/ccsrc/kernel/common_utils.h         |  5 ++
 .../cpu/sparse_apply_ftrl_cpu_kernel.cc       |  9 ++-
 .../cpu/sparse_apply_lazy_adam_cpu_kernel.cc  |  9 ++-
 .../cpu/sparse_apply_adam_cpu_kernel_test.cc  | 17 +++--
 .../cpu/sparse_apply_ftrl_cpu_kernel_test.cc  | 17 +++--
 .../sparse_apply_lazy_adam_cpu_kernel_test.cc | 17 +++--
 ..._apply_proximal_adagrad_cpu_kernel_test.cc | 17 +++--
 8 files changed, 140 insertions(+), 20 deletions(-)

diff --git a/mindspore/ccsrc/kernel/common_utils.cc b/mindspore/ccsrc/kernel/common_utils.cc
index ab4f59e549..526aca9a31 100644
--- a/mindspore/ccsrc/kernel/common_utils.cc
+++ b/mindspore/ccsrc/kernel/common_utils.cc
@@ -632,6 +632,75 @@ void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradie
   unique_grad->indices_size_ = slice_positions.size();
 }
 
+void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
+                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
+                               size_t outer_dim) {
+  if (unique_slice_grads.empty()) {
+    return;
+  }
+  size_t index_data_size = outer_dim * sizeof(float);
+  size_t unique_indices_size = 0;
+  for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
+    auto &slice_grad = unique_slice_grads[i];
+    auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
+                             (tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
+                             slice_grad->indices_size_ * index_data_size);
+    if (ret_code != EOK) {
+      MS_LOG(EXCEPTION) << "Failed to copy data!";
+    }
+    ret_code =
+      memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
+               slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
+    if (ret_code != EOK) {
+      MS_LOG(EXCEPTION) << "Failed to copy data!";
+    }
+    unique_indices_size += slice_grad->indices_size_;
+  }
+  tmp_grad->indices_size_ = unique_indices_size;
+  ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
+}
+
+void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
+                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
+  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
+  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
+  MS_EXCEPTION_IF_NULL(unique_grad);
+  MS_EXCEPTION_IF_NULL(unique_grad->value_);
+  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
+  MS_EXCEPTION_IF_NULL(tmp_grad);
+  MS_EXCEPTION_IF_NULL(tmp_grad->value_);
+  MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
+  size_t thread_num = 24;
+  if (origin_sparse_grad.indices_size_ < thread_num) {
+    thread_num = origin_sparse_grad.indices_size_;
+  }
+  size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
+  size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
+  for (size_t i = 0; i < thread_num; ++i) {
+    size_t indices_size = thread_indices_size;
+    if (i == thread_num - 1) {
+      indices_size = thread_indices_size + left_indices_size;
+    }
+    size_t value_offset = i * thread_indices_size * outer_dim;
+    size_t indices_offset = i * thread_indices_size;
+    auto slice_grad = SparseGradient(
+      {origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
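
Note on the algorithm: the existing ReduceSparseGradient deduplicates all rows on a single
thread. The two-level variant above splits the index range into at most 24 slices,
deduplicates each slice on its own thread (level one), concatenates the per-slice results
into tmp_grad, and runs one more single-threaded ReduceSparseGradient over that shorter
buffer to merge duplicates that straddle slice boundaries (level two). The remainder rows
go to the last thread: with indices_size_ = 100 and the cap of 24 threads, threads 0-22
take 4 rows each and thread 23 takes 8. The following is a minimal standalone sketch of
the same two-level idea, not the MindSpore implementation: it deduplicates with a hash map
instead of the in-place ReduceSparseGradient, and the Slice struct is a hypothetical
stand-in for SparseGradient.

    #include <cstddef>
    #include <thread>
    #include <unordered_map>
    #include <vector>

    struct Slice {           // hypothetical stand-in for SparseGradient
      const float *value;    // rows of outer_dim floats each
      const int *indices;    // one row index per row
      size_t rows;
    };

    // Level one: deduplicate a single slice (runs on its own thread).
    static void ReduceSlice(Slice in, size_t outer_dim,
                            std::unordered_map<int, std::vector<float>> *out) {
      for (size_t r = 0; r < in.rows; ++r) {
        auto &row = (*out)[in.indices[r]];
        row.resize(outer_dim, 0.0f);
        for (size_t c = 0; c < outer_dim; ++c) {
          row[c] += in.value[r * outer_dim + c];
        }
      }
    }

    // Level two: merge the per-slice maps, summing rows that share an index
    // across slice boundaries.
    std::unordered_map<int, std::vector<float>> TwoLevelReduce(const float *value, const int *indices,
                                                               size_t rows, size_t outer_dim,
                                                               size_t thread_num = 24) {
      std::unordered_map<int, std::vector<float>> merged;
      if (rows == 0) {
        return merged;
      }
      if (rows < thread_num) {
        thread_num = rows;
      }
      size_t per_thread = rows / thread_num;
      size_t left = rows % thread_num;
      std::vector<std::unordered_map<int, std::vector<float>>> parts(thread_num);
      std::vector<std::thread> threads;
      threads.reserve(thread_num);
      for (size_t i = 0; i < thread_num; ++i) {
        size_t n = per_thread + (i + 1 == thread_num ? left : 0);  // remainder -> last thread
        Slice slice{value + i * per_thread * outer_dim, indices + i * per_thread, n};
        threads.emplace_back(ReduceSlice, slice, outer_dim, &parts[i]);
      }
      for (auto &t : threads) {
        t.join();
      }
      for (auto &part : parts) {
        for (auto &kv : part) {
          auto &row = merged[kv.first];
          row.resize(outer_dim, 0.0f);
          for (size_t c = 0; c < outer_dim; ++c) {
            row[c] += kv.second[c];
          }
        }
      }
      return merged;
    }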
+    unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
+    unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
+    unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
+    unique_slice_grads[i]->indices_size_ = indices_size;
+    threads.emplace_back(
+      std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim));
+  }
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads[i].join();
+  }
+  ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
+}
+
 std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index) {
   MS_EXCEPTION_IF_NULL(anf_node);

diff --git a/mindspore/ccsrc/kernel/common_utils.h b/mindspore/ccsrc/kernel/common_utils.h
index e9d72848f6..13d36e2d53 100644
--- a/mindspore/ccsrc/kernel/common_utils.h
+++ b/mindspore/ccsrc/kernel/common_utils.h
@@ -130,6 +130,11 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<A
 void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                           size_t outer_dim);
+void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
+                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
+                               size_t outer_dim);
+void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
+                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
 }  // namespace kernel
 }  // namespace mindspore

diff --git a/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc
index 0537e746f3..03fb1d303f 100644
--- a/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/cpu/sparse_apply_ftrl_cpu_kernel.cc
@@ -66,6 +66,8 @@ void SparseApplyFtrlCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
   MS_EXCEPTION_IF_NULL(kernel_node);
   workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
   workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
+  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
+  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
 }
 
 void SparseApplyFtrlCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -130,9 +132,12 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
   auto indices = reinterpret_cast<int *>(inputs[4]->addr);
   auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
   auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
+  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
+  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
   SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
-  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
-                       var_outer_dim_size_);
+  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
+  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
+                               var_first_dim_size_, var_outer_dim_size_);
 
   MultiThreadComputeParams input_params;
   input_params.var_ = var;
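
The two extra workspace entries mirror the first two exactly: in the worst case every index
is unique, so both the concatenation buffer (tmp) and the final output buffer (unique) must
be able to hold all indices_size_ rows. A hypothetical caller-side helper, assuming
common_utils.h is on the include path so SparseGradient and TwoLevelReduceSparseGradient
are visible, could look like this sketch (the helper and its buffer names are illustrative
only, not part of the patch):

    #include <vector>
    #include "kernel/common_utils.h"

    // Returns the deduplicated row count; out_grad/out_indices receive the merged rows.
    size_t DedupGradient(float *grad, int *indices, size_t indices_size, size_t first_dim,
                         size_t outer_dim, std::vector<float> *out_grad, std::vector<int> *out_indices) {
      // Both levels are sized for the no-duplicates worst case, matching the
      // doubled workspace pushed by InitInputOutputSize above.
      std::vector<float> tmp_grad(indices_size * outer_dim);
      std::vector<int> tmp_indices(indices_size);
      out_grad->resize(indices_size * outer_dim);
      out_indices->resize(indices_size);
      mindspore::kernel::SparseGradient unique_sparse_grad({out_grad->data(), out_indices->data(), indices_size});
      mindspore::kernel::SparseGradient tmp_sparse_grad({tmp_grad.data(), tmp_indices.data(), indices_size});
      mindspore::kernel::TwoLevelReduceSparseGradient(
        mindspore::kernel::SparseGradient({grad, indices, indices_size}), &tmp_sparse_grad, &unique_sparse_grad,
        first_dim, outer_dim);
      return unique_sparse_grad.indices_size_;  // rows actually occupied after the merge
    }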
   workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
+  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
+  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
 }
 
 void SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -121,10 +123,13 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
   auto indices = reinterpret_cast<int *>(inputs[10]->addr);
   auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
   auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
+  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
+  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
   SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
-  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
-                       var_outer_dim_size_);
+  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
+  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
+                               var_first_dim_size_, var_outer_dim_size_);
 
   lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
 
   MultiThreadComputeParams input_params;

diff --git a/tests/ut/cpp/kernel/cpu/sparse_apply_adam_cpu_kernel_test.cc b/tests/ut/cpp/kernel/cpu/sparse_apply_adam_cpu_kernel_test.cc
index 2a6b80f9e7..dfd6147389 100644
--- a/tests/ut/cpp/kernel/cpu/sparse_apply_adam_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/sparse_apply_adam_cpu_kernel_test.cc
@@ -58,9 +58,12 @@ class SparseApplyAdamCpuKernelTest : public UT::Common {
     inputs_.push_back(CreateKernelAddress(indices.data()));
   }
 
-  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &m_t) {
+  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad,
+                              std::vector<int> &tmp_indices, std::vector<float> &m_t) {
     workspace_.push_back(CreateKernelAddress(new_grad.data()));
     workspace_.push_back(CreateKernelAddress(new_indices.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_grad.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_indices.data()));
     workspace_.push_back(CreateKernelAddress(m_t.data()));
   }
 
@@ -95,8 +98,10 @@ TEST_F(SparseApplyAdamCpuKernelTest, dense_test) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
   std::vector<float> m_t(3 * 3 * 3);
-  CreateWorkspaceAddress(new_grad, new_indices, m_t);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t);
   sparse_adam_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6);
@@ -120,8 +125,10 @@ TEST_F(SparseApplyAdamCpuKernelTest, sparse_test1) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
   std::vector<float> m_t(3 * 3 * 3);
-  CreateWorkspaceAddress(new_grad, new_indices, m_t);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t);
   sparse_adam_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6);
@@ -149,8 +156,10 @@ TEST_F(SparseApplyAdamCpuKernelTest, sparse_test2) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
   std::vector<float> m_t(3 * 3 * 3);
-  CreateWorkspaceAddress(new_grad, new_indices, m_t);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t);
   sparse_adam_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.999715) < 1e-6);
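
Every test now allocates the tmp pair with the same shapes as the unique pair and registers
them as workspace slots 2 and 3, matching the order InitInputOutputSize pushes them in.
A hypothetical snippet showing the pattern any further optimizer test would follow (the
trailing m_t slot only where the kernel needs it, as in the Adam test above):

    std::vector<float> new_grad(3 * 3 * 3);
    std::vector<int> new_indices(3);
    std::vector<float> tmp_grad(3 * 3 * 3);  // same shape as new_grad
    std::vector<int> tmp_indices(3);         // same shape as new_indices
    CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t);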
diff --git a/tests/ut/cpp/kernel/cpu/sparse_apply_ftrl_cpu_kernel_test.cc b/tests/ut/cpp/kernel/cpu/sparse_apply_ftrl_cpu_kernel_test.cc
index c5c2394538..a7df66cf9a 100644
--- a/tests/ut/cpp/kernel/cpu/sparse_apply_ftrl_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/sparse_apply_ftrl_cpu_kernel_test.cc
@@ -56,9 +56,12 @@ class SparseApplyFtrlCpuKernelTest : public UT::Common {
     inputs_.push_back(CreateKernelAddress(indices.data()));
   }
 
-  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices) {
+  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad,
+                              std::vector<int> &tmp_indices) {
     workspace_.push_back(CreateKernelAddress(new_grad.data()));
     workspace_.push_back(CreateKernelAddress(new_indices.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_grad.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_indices.data()));
   }
 
   std::vector<float> var_;
@@ -86,7 +89,9 @@ TEST_F(SparseApplyFtrlCpuKernelTest, dense_test) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_ftrl_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.291479) < 1e-6);
@@ -110,7 +115,9 @@ TEST_F(SparseApplyFtrlCpuKernelTest, sparse_test1) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_ftrl_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.291479) < 1e-6);
@@ -138,7 +145,9 @@ TEST_F(SparseApplyFtrlCpuKernelTest, sparse_test2) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_ftrl_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_EQ(var_[i], 1.0);
diff --git a/tests/ut/cpp/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel_test.cc b/tests/ut/cpp/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel_test.cc
index 1765ed896f..63e8706d1b 100644
--- a/tests/ut/cpp/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/sparse_apply_lazy_adam_cpu_kernel_test.cc
@@ -58,9 +58,12 @@ class SparseApplyLazyAdamCpuKernelTest : public UT::Common {
     inputs_.push_back(CreateKernelAddress(indices.data()));
   }
 
-  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices) {
+  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad,
+                              std::vector<int> &tmp_indices) {
     workspace_.push_back(CreateKernelAddress(new_grad.data()));
     workspace_.push_back(CreateKernelAddress(new_indices.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_grad.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_indices.data()));
   }
 
   std::vector<float> var_;
@@ -94,7 +97,9 @@ TEST_F(SparseApplyLazyAdamCpuKernelTest, dense_test) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_lazy_adam_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6);
@@ -118,7 +123,9 @@ TEST_F(SparseApplyLazyAdamCpuKernelTest, sparse_test1) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_lazy_adam_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6);
@@ -146,7 +153,9 @@ TEST_F(SparseApplyLazyAdamCpuKernelTest, sparse_test2) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_lazy_adam_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_EQ(var_[i], 1.0);
diff --git a/tests/ut/cpp/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel_test.cc b/tests/ut/cpp/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel_test.cc
index 23f66db58c..0d679d7e5c 100644
--- a/tests/ut/cpp/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/sparse_apply_proximal_adagrad_cpu_kernel_test.cc
@@ -54,9 +54,12 @@ class SparseApplyProximalAdagradCpuKernelTest : public UT::Common {
     inputs_.push_back(CreateKernelAddress(indices.data()));
   }
 
-  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices) {
+  void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad,
+                              std::vector<int> &tmp_indices) {
     workspace_.push_back(CreateKernelAddress(new_grad.data()));
     workspace_.push_back(CreateKernelAddress(new_indices.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_grad.data()));
+    workspace_.push_back(CreateKernelAddress(tmp_indices.data()));
   }
 
   std::vector<float> var_;
@@ -85,7 +88,9 @@ TEST_F(SparseApplyProximalAdagradCpuKernelTest, dense_test) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_proximal_adagrad_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.9929289) < 1e-6);
@@ -108,7 +113,9 @@ TEST_F(SparseApplyProximalAdagradCpuKernelTest, sparse_test1) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_proximal_adagrad_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_TRUE(std::fabs(var_[i] - 0.9929289) < 1e-6);
@@ -135,7 +142,9 @@ TEST_F(SparseApplyProximalAdagradCpuKernelTest, sparse_test2) {
   CreateInputAddress(indices);
   std::vector<float> new_grad(3 * 3 * 3);
   std::vector<int> new_indices(3);
-  CreateWorkspaceAddress(new_grad, new_indices);
+  std::vector<float> tmp_grad(3 * 3 * 3);
+  std::vector<int> tmp_indices(3);
+  CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices);
   sparse_proximal_adagrad_->Launch(inputs_, workspace_, outputs_);
   for (size_t i = 0; i < 3 * 3; ++i) {
     EXPECT_EQ(var_[i], 1.0);
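
As a final illustration of the reduction semantics the kernels rely on (rows sharing an
index are summed into one output row), here is a usage example of the TwoLevelReduce
sketch from the first note; it assumes that sketch is in the same translation unit:

    #include <cstdio>

    int main() {
      float value[] = {1, 1, 2, 2, 3, 3};  // 3 rows, outer_dim = 2
      int indices[] = {1, 0, 1};           // rows 0 and 2 both carry index 1
      auto merged = TwoLevelReduce(value, indices, 3, 2);
      std::printf("index 1 -> {%g, %g}\n", merged[1][0], merged[1][1]);  // prints {4, 4}
      return 0;
    }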