From 83e06bbba070edfedfda92c21cd690e86569cf3b Mon Sep 17 00:00:00 2001
From: TFbunny
Date: Tue, 29 Dec 2020 16:18:43 -0500
Subject: [PATCH] add return to sparseapplyftrl gpu

---
 .../gpu/nn/sparse_ftrl_gpu_kernel.h           | 26 +++++++++++++++----
 tests/st/ops/gpu/test_sparse_apply_ftrl_op.py | 24 +++++++++++++++++
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_ftrl_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_ftrl_gpu_kernel.h
index bc3dd2192b..5af74c2aea 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_ftrl_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/sparse_ftrl_gpu_kernel.h
@@ -33,19 +33,35 @@ class SparseFtrlGpuKernel : public GpuKernel {
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
   const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
-              void *stream_ptr) override {
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
     T *variable = GetDeviceAddress<T>(inputs, 0);
     T *accumulation = GetDeviceAddress<T>(inputs, 1);
     T *linear = GetDeviceAddress<T>(inputs, 2);
     T *gradient = GetDeviceAddress<T>(inputs, 3);
     S *indices = GetDeviceAddress<S>(inputs, 4);
+    T *variable_out = GetDeviceAddress<T>(outputs, 0);
+    T *accumulation_out = GetDeviceAddress<T>(outputs, 1);
+    T *linear_out = GetDeviceAddress<T>(outputs, 2);
     CalSparseApplyFtrl(gradient, indices, num_index_, n_stride_, lr_, l1_, l2_, lr_power_, use_locking_, variable,
                        accumulation, linear, reinterpret_cast<cudaStream_t>(stream_ptr));
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(&variable_out[0], &variable[0], variable_size_, cudaMemcpyDeviceToDevice,
+                                               reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "cudaMemcpyAsync output failed");
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(&accumulation_out[0], &accumulation[0], accumulation_size_,
+                                               cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "cudaMemcpyAsync output failed");
+    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
+                               cudaMemcpyAsync(&linear_out[0], &linear[0], linear_size_, cudaMemcpyDeviceToDevice,
+                                               reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "cudaMemcpyAsync output failed");
     return true;
   }
 
   bool Init(const CNodePtr &kernel_node) override {
+    kernel_node_ = kernel_node;
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     if (input_num != 5) {
       MS_LOG(ERROR) << "Input number is " << input_num << ", but sparse ftrl needs 5 inputs.";
@@ -104,9 +120,9 @@ class SparseFtrlGpuKernel : public GpuKernel {
     input_size_list_.push_back(linear_size_);
     input_size_list_.push_back(gradient_size_);
     input_size_list_.push_back(indices_size_);
-    output_size_list_.push_back(0);
-    output_size_list_.push_back(0);
-    output_size_list_.push_back(0);
+    output_size_list_.push_back(variable_size_);
+    output_size_list_.push_back(accumulation_size_);
+    output_size_list_.push_back(linear_size_);
   }
 
   void ResetResource() noexcept override {
diff --git a/tests/st/ops/gpu/test_sparse_apply_ftrl_op.py b/tests/st/ops/gpu/test_sparse_apply_ftrl_op.py
index 09789236c9..fbda364d9b 100644
--- a/tests/st/ops/gpu/test_sparse_apply_ftrl_op.py
+++ b/tests/st/ops/gpu/test_sparse_apply_ftrl_op.py
@@ -147,3 +147,27 @@ def test_ftrl_sparse_half_int64_ind():
     sparse_apply_ftrl = Net_half()
     sparse_apply_ftrl(gradient, indices)
     assert np.all(sparse_apply_ftrl.var.data.asnumpy() == expect_var)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_ftrl_half_return_output():
+    gradient = Tensor(np.ones([3, 3, 3]).astype(np.float16))
+    indices = Tensor([0, 1, 2], mstype.int32)
+    expect_var = np.array([[[0.291479, 0.291479, 0.291479],
+                            [0.291479, 0.291479, 0.291479],
+                            [0.291479, 0.291479, 0.291479]],
+                           [[0.291479, 0.291479, 0.291479],
+                            [0.291479, 0.291479, 0.291479],
+                            [0.291479, 0.291479, 0.291479]],
+                           [[0.291479, 0.291479, 0.291479],
+                            [0.291479, 0.291479, 0.291479],
+                            [0.291479, 0.291479, 0.291479]]]).astype(np.float16)
+    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
+    sparse_apply_ftrl = Net_half()
+    output = sparse_apply_ftrl(gradient, indices)
+    assert np.all(output[0].asnumpy() == expect_var)
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    sparse_apply_ftrl = Net_half()
+    sparse_apply_ftrl(gradient, indices)
+    assert np.all(output[0].asnumpy() == expect_var)