optimize cpu unique

5 years ago · 618c05d454
parent 7d250f2218
commit 618c05d454
7 changed files with 440 additions and 92 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.cc
@ -19,45 +19,67 @@

 namespace mindspore {
 namespace kernel {
+const size_t kUseBucketUniqueSize = 100000;
+const size_t kUniqueThreadNum = 23;
 void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  CheckParam(kernel_node);
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-  n_ = input_shape[0];
+  input_size_ = input_shape[0];  // extent of dimension 0 of the inferred shape of input 0
  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
 }

+void UniqueCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
+  CPUKernel::InitInputOutputSize(kernel_node);
+  // LaunchKernel reads workspace[0], workspace[1] and workspace[2]; register three scratch
+  // buffers, each large enough for input_size_ entries of sizeof(int64_t) bytes.
+  const size_t scratch_bytes = input_size_ * sizeof(int64_t);
+  for (int buffer = 0; buffer < 3; ++buffer) {
+    workspace_size_list_.emplace_back(scratch_bytes);
+  }
+}
+
 bool UniqueCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
-                             const std::vector<kernel::AddressPtr> & /*workspace*/,
+                             const std::vector<kernel::AddressPtr> &workspace,
                             const std::vector<kernel::AddressPtr> &outputs) {
+  // Dispatches on the input element type stored in dtype_ by InitKernel; the index type is
+  // always int. Returns true once the typed kernel has run. A dtype outside the three
+  // supported ones is reported through MS_LOG(EXCEPTION) instead of returning true with the
+  // outputs never written.
  if (dtype_ == kNumberTypeInt32) {
-    LaunchKernel<int>(inputs, outputs);
-  } else if (dtype_ == kNumberTypeFloat32) {
-    LaunchKernel<float>(inputs, outputs);
+    LaunchKernel<int, int>(inputs, workspace, outputs);
  } else if (dtype_ == kNumberTypeInt64) {
-    LaunchKernel<int64_t>(inputs, outputs);
+    LaunchKernel<int64_t, int>(inputs, workspace, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    LaunchKernel<float, int>(inputs, workspace, outputs);
+  } else {
+    MS_LOG(EXCEPTION) << "Unique only supports int32, int64 or float32 input dtype";
  }
  return true;
 }

-template <typename T>
-void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
-  auto x_addr = reinterpret_cast<T *>(inputs[0]->addr);
-  auto y_addr = reinterpret_cast<T *>(outputs[0]->addr);
-  auto idx_addr = reinterpret_cast<int64_t *>(outputs[1]->addr);
-
-  std::unordered_map<T, int64_t> uniq;
-  int n = SizeToInt(n_);
-  uniq.reserve(n * 2);
-  for (int i = 0, j = 0; i < n; ++i) {
-    auto it = uniq.emplace(x_addr[i], j);
-    idx_addr[i] = it.first->second;
-    if (it.second) {
-      ++j;
-    }
+template <typename DataType, typename IndexType>  // DataType: element type of the value buffers; IndexType: element type of the index buffers
+void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                                   const std::vector<AddressPtr> &outputs) {
+  if (input_size_ == 0) {  // empty input: return before touching any buffer; output_size_ is left unchanged
+    return;
+  }
+  if (inputs.size() < 1) {  // inputs[0] is read below
+    MS_LOG(EXCEPTION) << "Input size should be large than 0";
+  }
+  if (workspace.size() < 3) {  // workspace[0..2] are read below; their sizes are registered in InitInputOutputSize
+    MS_LOG(EXCEPTION) << "workspace size should be large than 2";
+  }
+  if (outputs.size() < 2) {  // outputs[0] receives values, outputs[1] receives indices
+    MS_LOG(EXCEPTION) << "Output size should be large than 1";
  }
-  for (const auto &it : uniq) {
-    y_addr[it.second] = it.first;
+  auto params = std::make_shared<UniqueParam<DataType, IndexType>>();  // bundles every buffer pointer and setting passed to Unique/BucketUnique
+  params->input_ = reinterpret_cast<DataType *>(inputs[0]->addr);
+  params->input_idx_ = reinterpret_cast<IndexType *>(workspace[0]->addr);
+  params->workspace_ = reinterpret_cast<DataType *>(workspace[1]->addr);
+  params->workspace_idx_ = reinterpret_cast<IndexType *>(workspace[2]->addr);
+  params->output_ = reinterpret_cast<DataType *>(outputs[0]->addr);
+  params->inverse_idx_ = reinterpret_cast<IndexType *>(outputs[1]->addr);
+  params->input_size_ = input_size_;
+  params->output_size_ = 0;  // read back after the call below
+  params->need_sort_ = true;
+  params->thread_num_ = kUniqueThreadNum;
+  if (input_size_ < kUseBucketUniqueSize) {  // inputs shorter than kUseBucketUniqueSize go through Unique(), all others through BucketUnique()
+    Unique(params);
+  } else {
+    BucketUnique(params);
  }
+  output_size_ = params->output_size_;  // kept on the kernel; UniqueWithPadCPUKernel::PadOutput starts padding at this index
 }

 void UniqueCPUKernel::CheckParam(const CNodePtr &kernel_node) {
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.cc
@ -19,49 +19,33 @@

 namespace mindspore {
 namespace kernel {
-void UniqueWithPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
-  CheckParam(kernel_node);
-  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-  n_ = SizeToLong(input_shape[0]);
-  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
-}
-
 bool UniqueWithPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
-                                    const std::vector<kernel::AddressPtr> & /*workspace*/,
+                                    const std::vector<kernel::AddressPtr> &workspace,
                                    const std::vector<kernel::AddressPtr> &outputs) {
+  // Runs the shared Unique computation first, then lets PadOutput overwrite the slots of
+  // outputs[0] from output_size_ up to input_size_ with the pad value taken from inputs[1].
+  // A false result from the base kernel is propagated, and an unsupported dtype is reported
+  // through MS_LOG(EXCEPTION) rather than silently skipping the padding step.
+  if (!UniqueCPUKernel::Launch(inputs, workspace, outputs)) {
+    return false;
+  }
  if (dtype_ == kNumberTypeInt32) {
-    LaunchKernel<int>(inputs, outputs);
+    PadOutput<int>(inputs, outputs);
  } else if (dtype_ == kNumberTypeInt64) {
-    LaunchKernel<int64_t>(inputs, outputs);
-  } else {
-    MS_LOG(EXCEPTION) << "Only unsupported int32 or int64 dtype";
+    PadOutput<int64_t>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    PadOutput<float>(inputs, outputs);
+  } else {
+    MS_LOG(EXCEPTION) << "UniqueWithPad only supports int32, int64 or float32 dtype";
  }
  return true;
 }

 template <typename T>
-void UniqueWithPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
-                                          const std::vector<AddressPtr> &outputs) {
-  T *a = reinterpret_cast<T *>(inputs[0]->addr);
+void UniqueWithPadCPUKernel::PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {  // fills the tail of outputs[0] with the pad value
+  if (inputs.size() < 2) {  // inputs[1] holds the pad value read below
+    MS_LOG(EXCEPTION) << "Input size should be large than 1";
+  }
+  if (outputs.size() < 1) {  // outputs[0] is the buffer being padded
+    MS_LOG(EXCEPTION) << "Output size should be large than 0";
+  }
  T pad_num = *reinterpret_cast<T *>(inputs[1]->addr);
  T *out = reinterpret_cast<T *>(outputs[0]->addr);
-  T *idx_vec = reinterpret_cast<T *>(outputs[1]->addr);
-
-  for (int64_t i = 0; i < n_; ++i) {
+  for (size_t i = output_size_; i < input_size_; ++i) {  // only slots [output_size_, input_size_) are overwritten; earlier slots are left untouched
    out[i] = pad_num;
  }
-  std::unordered_map<T, int> uniq;
-  uniq.reserve(n_);
-  for (int64_t i = 0, j = 0; i < n_; ++i) {
-    auto it = uniq.emplace(a[i], j);
-    idx_vec[i] = it.first->second;
-    if (it.second) {
-      ++j;
-    }
-  }
-  for (const auto &it : uniq) {
-    out[it.second] = it.first;
-  }
 }

 void UniqueWithPadCPUKernel::CheckParam(const CNodePtr &kernel_node) {
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.h
@ -16,31 +16,26 @@

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_
-#include <vector>
 #include <memory>
 #include <unordered_map>
+#include <vector>
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
+#include "backend/kernel_compiler/cpu/unique_cpu_kernel.h"

 namespace mindspore {
 namespace kernel {
-class UniqueWithPadCPUKernel : public CPUKernel {
+class UniqueWithPadCPUKernel : public UniqueCPUKernel {  // builds on UniqueCPUKernel; no longer declares its own InitKernel, n_ or dtype_
 public:
  UniqueWithPadCPUKernel() = default;
  ~UniqueWithPadCPUKernel() override = default;
-
-  void InitKernel(const CNodePtr &kernel_node) override;
-
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
-
  template <typename T>
-  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
+  void PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);  // writes the pad value from inputs[1] into outputs[0] from output_size_ up to input_size_

- private:
-  void CheckParam(const CNodePtr &kernel_node);
-  int64_t n_{0};
-  TypeId dtype_{kTypeUnknown};
+ protected:
+  void CheckParam(const CNodePtr &kernel_node) override;  // overrides the CheckParam declared by the base class
 };

 MS_REG_CPU_KERNEL(UniqueWithPad,
@ -56,7 +51,15 @@ MS_REG_CPU_KERNEL(UniqueWithPad,
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddOutputAttr(kNumberTypeInt64)
-                    .AddOutputAttr(kNumberTypeInt64),
+                    .AddOutputAttr(kNumberTypeInt32),
+                  UniqueWithPadCPUKernel);
+
+MS_REG_CPU_KERNEL(UniqueWithPad,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeInt32),
                  UniqueWithPadCPUKernel);
 }  // namespace kernel
 }  // namespace mindspore
--- a/tests/st/ops/cpu/test_unique_op.py
+++ b/tests/st/ops/cpu/test_unique_op.py
@ -33,7 +33,7 @@ class Net(nn.Cell):
        return self.uniq(x)


-def test_net():
+def test_net_fp32():
    x = Tensor(np.array([1, 2, 5, 2]), mstype.float32)
    uniq = Net()
    output = uniq(x)
@ -45,3 +45,31 @@ def test_net():

    assert (output[0].asnumpy() == expect_y_result).all()
    assert (output[1].asnumpy() == expect_idx_result).all()
+
+
+def test_net_int32():
+    """Unique on the int32 tensor [1, 2, 5, 2] must give y == [1, 2, 5] and idx == [0, 1, 2, 1]."""
+    expected_values = [1, 2, 5]
+    expected_indices = [0, 1, 2, 1]
+
+    data = Tensor(np.array([1, 2, 5, 2]), mstype.int32)
+    net = Net()
+    result = net(data)
+    print("x:\n", data)
+    print("y:\n", result[0])
+    print("idx:\n", result[1])
+
+    assert (result[0].asnumpy() == expected_values).all()
+    assert (result[1].asnumpy() == expected_indices).all()
+
+
+def test_net_int64():
+    """Unique on the int64 tensor [1, 2, 5, 2] must give y == [1, 2, 5] and idx == [0, 1, 2, 1]."""
+    expected_values = [1, 2, 5]
+    expected_indices = [0, 1, 2, 1]
+
+    data = Tensor(np.array([1, 2, 5, 2]), mstype.int64)
+    net = Net()
+    result = net(data)
+    print("x:\n", data)
+    print("y:\n", result[0])
+    print("idx:\n", result[1])
+
+    assert (result[0].asnumpy() == expected_values).all()
+    assert (result[1].asnumpy() == expected_indices).all()
--- a/tests/ut/cpp/kernel/cpu/unique_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/unique_cpu_kernel_test.cc
@ -29,7 +29,7 @@ class UniqueCpuKernelTest : public UT::Common {
  UniqueCpuKernelTest() : unique_(std::make_shared<UniqueCPUKernel>()) {}

  void SetUp() override {
-    unique_->n_ = 9;
+    unique_->input_size_ = 9;
    unique_->dtype_ = kNumberTypeFloat32;
    inputs_.clear();
    workspace_.clear();
@ -42,16 +42,19 @@ class UniqueCpuKernelTest : public UT::Common {
    return kernel_addr;
  }

-  void CreateInputAddress() { inputs_.push_back(CreateKernelAddress(x_.data())); }
-
-  void CreateOutputAddress() {
+  void CreateAddress() {
+    inputs_.push_back(CreateKernelAddress(x_.data()));
    outputs_.push_back(CreateKernelAddress(y_.data()));
    outputs_.push_back(CreateKernelAddress(idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
  }

  std::vector<float> x_;
  std::vector<float> y_;
-  std::vector<int64_t> idx_;
+  std::vector<int> idx_;
+  std::vector<int64_t> workspace_idx_;
  std::vector<AddressPtr> inputs_;
  std::vector<AddressPtr> workspace_;
  std::vector<AddressPtr> outputs_;
@ -62,13 +65,13 @@ TEST_F(UniqueCpuKernelTest, compute_test) {
  x_ = {1, 1, 2, 4, 4, 4, 7, 8, 8};
  y_ = {1, 1, 1, 1, 1};
  idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
-  CreateInputAddress();
-  CreateOutputAddress();
+  workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
+  CreateAddress();
  unique_->Launch(inputs_, workspace_, outputs_);

  // check compute result
  std::vector<float> expect_y{1, 2, 4, 7, 8};
-  std::vector<int64_t> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4};
+  std::vector<int> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4};
  EXPECT_TRUE(y_ == expect_y);
  EXPECT_TRUE(idx_ == expect_idx);
 }
--- a/tests/ut/cpp/kernel/cpu/unique_with_pad_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/unique_with_pad_cpu_kernel_test.cc
@ -29,7 +29,7 @@ class UniqueWithPadCpuKernelTest : public UT::Common {
  UniqueWithPadCpuKernelTest() : unique_with_pad_(std::make_shared<UniqueWithPadCPUKernel>()) {}

  void SetUp() override {
-    unique_with_pad_->n_ = 10;
+    unique_with_pad_->input_size_ = 10;
    unique_with_pad_->dtype_ = kNumberTypeInt64;
    inputs_.clear();
    workspace_.clear();
@ -42,21 +42,21 @@ class UniqueWithPadCpuKernelTest : public UT::Common {
    return kernel_addr;
  }

-  void CreateInputAddress() {
+  void CreateAddress() {
    inputs_.push_back(CreateKernelAddress(x_.data()));
    inputs_.push_back(CreateKernelAddress(&pad_dim_));
-    ;
-  }
-
-  void CreateOutputAddress() {
    outputs_.push_back(CreateKernelAddress(out_.data()));
    outputs_.push_back(CreateKernelAddress(idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
  }

  std::vector<int64_t> x_;
  int64_t pad_dim_;
  std::vector<int64_t> out_;
-  std::vector<int64_t> idx_;
+  std::vector<int> idx_;
+  std::vector<int64_t> workspace_idx_;
  std::vector<AddressPtr> inputs_;
  std::vector<AddressPtr> workspace_;
  std::vector<AddressPtr> outputs_;
@ -68,13 +68,13 @@ TEST_F(UniqueWithPadCpuKernelTest, compute_test) {
  pad_dim_ = 8;
  out_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-  CreateInputAddress();
-  CreateOutputAddress();
+  // Scratch storage handed to the kernel as workspace. It needs one int64_t slot per input
+  // element (SetUp sets input_size_ to 10), matching the size the kernel registers for each
+  // workspace buffer; nine slots would leave the buffer one element short.
+  workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  CreateAddress();
  unique_with_pad_->Launch(inputs_, workspace_, outputs_);

  // check compute result
-  std::vector<int64_t> expect_out{1, 5, 4, 3, 2, 8, 8, 8, 8, 8};
-  std::vector<int64_t> expect_idx{0, 0, 1, 1, 2, 2, 3, 3, 4, 4};
+  std::vector<int64_t> expect_out{1, 2, 3, 4, 5, 8, 8, 8, 8, 8};
+  std::vector<int> expect_idx{0, 0, 4, 4, 3, 3, 2, 2, 1, 1};
  EXPECT_TRUE(out_ == expect_out);
  EXPECT_TRUE(idx_ == expect_idx);
 }