From 618c05d45490c7398ce3a14cf2bdb216eb47a704 Mon Sep 17 00:00:00 2001
From: kswang <wangkaisheng2@huawei.com>
Date: Thu, 29 Oct 2020 21:02:39 +0800
Subject: [PATCH] optimize cpu unique

---
 .../kernel_compiler/cpu/unique_cpu_kernel.cc  |  68 ++--
 .../kernel_compiler/cpu/unique_cpu_kernel.h   | 322 +++++++++++++++++-
 .../cpu/unique_with_pad_cpu_kernel.cc         |  44 +--
 .../cpu/unique_with_pad_cpu_kernel.h          |  27 +-
 tests/st/ops/cpu/test_unique_op.py            |  30 +-
 .../cpp/kernel/cpu/unique_cpu_kernel_test.cc  |  19 +-
 .../cpu/unique_with_pad_cpu_kernel_test.cc    |  22 +-
 7 files changed, 440 insertions(+), 92 deletions(-)
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.cc
index 05e90f6a92..e367bbba3d 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.cc
@@ -19,45 +19,67 @@
 
 namespace mindspore {
 namespace kernel {
+const size_t kUseBucketUniqueSize = 100000;
+const size_t kUniqueThreadNum = 23;
 void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   CheckParam(kernel_node);
   auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-  n_ = input_shape[0];
+  input_size_ = input_shape[0];
   dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
 }
 
+void UniqueCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
+  CPUKernel::InitInputOutputSize(kernel_node);
+  workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t));
+  workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t));
+  workspace_size_list_.emplace_back(input_size_ * sizeof(int64_t));
+}
+
 bool UniqueCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
-                             const std::vector<kernel::AddressPtr> & /*workspace*/,
+                             const std::vector<kernel::AddressPtr> &workspace,
                              const std::vector<kernel::AddressPtr> &outputs) {
   if (dtype_ == kNumberTypeInt32) {
-    LaunchKernel<int>(inputs, outputs);
-  } else if (dtype_ == kNumberTypeFloat32) {
-    LaunchKernel<float>(inputs, outputs);
+    LaunchKernel<int, int>(inputs, workspace, outputs);
   } else if (dtype_ == kNumberTypeInt64) {
-    LaunchKernel<int64_t>(inputs, outputs);
+    LaunchKernel<int64_t, int>(inputs, workspace, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    LaunchKernel<float, int>(inputs, workspace, outputs);
   }
   return true;
 }
 
-template <typename T>
-void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
-  auto x_addr = reinterpret_cast<T *>(inputs[0]->addr);
-  auto y_addr = reinterpret_cast<T *>(outputs[0]->addr);
-  auto idx_addr = reinterpret_cast<int64_t *>(outputs[1]->addr);
-
-  std::unordered_map<T, int64_t> uniq;
-  int n = SizeToInt(n_);
-  uniq.reserve(n * 2);
-  for (int i = 0, j = 0; i < n; ++i) {
-    auto it = uniq.emplace(x_addr[i], j);
-    idx_addr[i] = it.first->second;
-    if (it.second) {
-      ++j;
-    }
+template <typename DataType, typename IndexType>
+void UniqueCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                                   const std::vector<AddressPtr> &outputs) {
+  if (input_size_ == 0) {
+    return;
+  }
+  if (inputs.size() < 1) {
+    MS_LOG(EXCEPTION) << "Input size should be large than 0";
+  }
+  if (workspace.size() < 3) {
+    MS_LOG(EXCEPTION) << "workspace size should be large than 2";
+  }
+  if (outputs.size() < 2) {
+    MS_LOG(EXCEPTION) << "Output size should be large than 1";
   }
-  for (const auto &it : uniq) {
-    y_addr[it.second] = it.first;
+  auto params = std::make_shared<UniqueParam<DataType, IndexType>>();
+  params->input_ = reinterpret_cast<DataType *>(inputs[0]->addr);
+  params->input_idx_ = reinterpret_cast<IndexType *>(workspace[0]->addr);
+  params->workspace_ = reinterpret_cast<DataType *>(workspace[1]->addr);
+  params->workspace_idx_ = reinterpret_cast<IndexType *>(workspace[2]->addr);
+  params->output_ = reinterpret_cast<DataType *>(outputs[0]->addr);
+  params->inverse_idx_ = reinterpret_cast<IndexType *>(outputs[1]->addr);
+  params->input_size_ = input_size_;
+  params->output_size_ = 0;
+  params->need_sort_ = true;
+  params->thread_num_ = kUniqueThreadNum;
+  if (input_size_ < kUseBucketUniqueSize) {
+    Unique(params);
+  } else {
+    BucketUnique(params);
   }
+  output_size_ = params->output_size_;
 }
 
 void UniqueCPUKernel::CheckParam(const CNodePtr &kernel_node) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h
index 1fb5b299aa..339c87131b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_cpu_kernel.h
@@ -16,31 +16,339 @@
 
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_CPU_KERNEL_H_
-#include <vector>
+#include <algorithm>
 #include <memory>
+#include <thread>
 #include <unordered_map>
+#include <vector>
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
 
 namespace mindspore {
 namespace kernel {
+template <typename DataType, typename IndexType>
+struct UniqueParam {
+  DataType *input_{nullptr};
+  IndexType *input_idx_{nullptr};
+  DataType *output_{nullptr};
+  IndexType *inverse_idx_{nullptr};
+  DataType *workspace_{nullptr};
+  IndexType *workspace_idx_{nullptr};
+  IndexType input_size_{0};
+  IndexType output_size_{0};
+  size_t thread_num_{0};
+  bool need_sort_{true};
+};
+
 class UniqueCPUKernel : public CPUKernel {
  public:
   UniqueCPUKernel() = default;
   ~UniqueCPUKernel() override = default;
 
   void InitKernel(const CNodePtr &kernel_node) override;
-
+  void InitInputOutputSize(const CNodePtr &kernel_node) override;
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs) override;
 
-  template <typename T>
-  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
+  template <typename DataType, typename IndexType>
+  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                    const std::vector<AddressPtr> &outputs);
 
- private:
-  void CheckParam(const CNodePtr &kernel_node);
-  size_t n_{0};
+ protected:
+  virtual void CheckParam(const CNodePtr &kernel_node);
+  size_t input_size_{0};
   TypeId dtype_{kTypeUnknown};
+  size_t output_size_{0};
+
+  template <typename DataType>
+  static size_t BucketId(DataType data, size_t bucket_num) {
+    return static_cast<size_t>(data) % bucket_num;
+  }
+
+  template <typename DataType, typename IndexType>
+  static void CalculateEachBucketSize(const std::shared_ptr<UniqueParam<DataType, IndexType>> &params,
+                                      std::vector<IndexType> *each_bucket_size) {
+    MS_EXCEPTION_IF_NULL(params);
+    MS_EXCEPTION_IF_NULL(params->input_);
+    MS_EXCEPTION_IF_NULL(each_bucket_size);
+    size_t bucket_num = each_bucket_size->size();
+    for (IndexType i = 0; i < params->input_size_; ++i) {
+      auto bucket_id = BucketId(params->input_[i], bucket_num);
+      each_bucket_size->at(bucket_id)++;
+    }
+  }
+
+  template <typename DataType, typename IndexType>
+  static void SplitAndCalculateBucketSize(
+    const std::shared_ptr<UniqueParam<DataType, IndexType>> &params,
+    std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr,
+    std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr) {
+    MS_EXCEPTION_IF_NULL(params);
+    MS_EXCEPTION_IF_NULL(params->input_);
+    MS_EXCEPTION_IF_NULL(segments_ptr);
+    MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
+    auto &segments = *segments_ptr;
+    auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
+
+    IndexType input_size = params->input_size_;
+    size_t thread_num = params->thread_num_;
+    if (thread_num < 1) {
+      MS_LOG(EXCEPTION) << "Thread num must > 0 !";
+    }
+    IndexType thread_data_size = input_size / thread_num;
+    size_t left_data_size = input_size % thread_num;
+    std::vector<std::thread> threads;
+    threads.reserve(thread_num);
+    segments.reserve(thread_num);
+    segment_bucket_sizes.reserve(thread_num);
+    IndexType current_offset = 0;
+    for (size_t i = 0; i < thread_num; ++i) {
+      segment_bucket_sizes.emplace_back(std::make_shared<std::vector<IndexType>>(thread_num, 0));
+      IndexType data_size = thread_data_size;
+      if (i < left_data_size) {
+        data_size += 1;
+      }
+      segments.emplace_back(std::make_shared<UniqueParam<DataType, IndexType>>());
+      segments[i]->input_ = params->input_ + current_offset;
+      segments[i]->input_size_ = data_size;
+      segments[i]->thread_num_ = thread_num;
+      threads.emplace_back(
+        std::thread(CalculateEachBucketSize<DataType, IndexType>, segments[i], segment_bucket_sizes[i].get()));
+      current_offset += data_size;
+    }
+    for (size_t i = 0; i < params->thread_num_; ++i) {
+      threads[i].join();
+    }
+  }
+
+  template <typename DataType, typename IndexType>
+  static void SegmentToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> &segment,
+                               IndexType segment_offset,
+                               const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) {
+    MS_LOG(DEBUG) << "Start";
+    MS_EXCEPTION_IF_NULL(segment);
+    MS_EXCEPTION_IF_NULL(segment->input_);
+    std::vector<IndexType> bucket_data_num(segment->thread_num_, 0);
+    auto bucket_size = buckets.size();
+    for (IndexType i = 0; i < segment->input_size_; ++i) {
+      DataType data = segment->input_[i];
+      auto bucket_id = BucketId(data, segment->thread_num_);
+      auto bucket_index = bucket_data_num[bucket_id];
+      if (bucket_id >= bucket_size) {
+        MS_LOG(ERROR) << "Error bucket id!";
+        continue;
+      }
+      auto &bucket = buckets[bucket_id];
+      MS_EXCEPTION_IF_NULL(bucket);
+      if (bucket_index >= bucket->input_size_) {
+        MS_LOG(ERROR) << "Error bucket index!";
+        continue;
+      }
+      bucket->input_[bucket_index] = data;
+      bucket->workspace_idx_[bucket_index] = segment_offset + i;
+      bucket_data_num[bucket_id]++;
+    }
+    MS_LOG(DEBUG) << "End";
+  }
+
+  template <typename DataType, typename IndexType>
+  static void GatherSegmentsToBuckets(const std::shared_ptr<UniqueParam<DataType, IndexType>> &params,
+                                      std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *segments_ptr,
+                                      std::vector<std::shared_ptr<std::vector<IndexType>>> *segment_bucket_sizes_ptr,
+                                      std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> *buckets_ptr) {
+    MS_LOG(DEBUG) << "Start";
+    MS_EXCEPTION_IF_NULL(params);
+    MS_EXCEPTION_IF_NULL(params->workspace_);
+    MS_EXCEPTION_IF_NULL(params->inverse_idx_);
+    MS_EXCEPTION_IF_NULL(params->workspace_idx_);
+    MS_EXCEPTION_IF_NULL(params->output_);
+    MS_EXCEPTION_IF_NULL(params->input_idx_);
+    MS_EXCEPTION_IF_NULL(segments_ptr);
+    MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
+    MS_EXCEPTION_IF_NULL(buckets_ptr);
+    auto &segments = *segments_ptr;
+    auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
+    auto &buckets = *buckets_ptr;
+    auto thread_num = segments.size();
+    buckets.reserve(thread_num);
+    std::vector<IndexType> bucket_data_size(thread_num, 0);
+    for (size_t i = 0; i < thread_num; ++i) {
+      for (size_t j = 0; j < thread_num; ++j) {
+        bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
+      }
+    }
+
+    IndexType current_offset = 0;
+    for (size_t i = 0; i < thread_num; ++i) {
+      auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>();
+      bucket->input_ = params->output_ + current_offset;
+      bucket->input_idx_ = params->inverse_idx_ + current_offset;
+      bucket->workspace_idx_ = params->workspace_idx_ + current_offset;
+      bucket->output_ = params->workspace_ + current_offset;
+      bucket->inverse_idx_ = params->input_idx_ + current_offset;
+      bucket->input_size_ = bucket_data_size[i];
+      current_offset += bucket_data_size[i];
+      buckets.emplace_back(bucket);
+    }
+    std::vector<IndexType> tmp_bucket_data_size(thread_num, 0);
+    std::vector<std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>>> thread_buckets;
+    for (size_t i = 0; i < thread_num; ++i) {
+      std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> local_buckets;
+      for (size_t j = 0; j < thread_num; ++j) {
+        auto bucket = std::make_shared<UniqueParam<DataType, IndexType>>();
+        bucket->input_ = buckets[j]->input_ + tmp_bucket_data_size[j];
+        bucket->input_size_ = buckets[j]->input_size_ - tmp_bucket_data_size[j];
+        bucket->workspace_idx_ = buckets[j]->workspace_idx_ + tmp_bucket_data_size[j];
+        local_buckets.emplace_back(bucket);
+        tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
+      }
+      thread_buckets.emplace_back(local_buckets);
+    }
+    std::vector<std::thread> threads;
+    threads.reserve(thread_num);
+    current_offset = 0;
+    for (size_t i = 0; i < thread_num; ++i) {
+      MS_EXCEPTION_IF_NULL(segments[i]);
+      threads.emplace_back(
+        std::thread(SegmentToBuckets<DataType, IndexType>, segments[i], current_offset, thread_buckets[i]));
+      current_offset += segments[i]->input_size_;
+    }
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads[i].join();
+    }
+    MS_LOG(DEBUG) << "End";
+  }
+
+  template <typename DataType, typename IndexType>
+  static void Unique(const std::shared_ptr<UniqueParam<DataType, IndexType>> &params) {
+    MS_LOG(DEBUG) << "Start";
+    MS_EXCEPTION_IF_NULL(params);
+    DataType *input = params->input_;
+    IndexType *input_idx = params->input_idx_;
+    DataType *output = params->output_;
+    IndexType *inverse_idx = params->inverse_idx_;
+    MS_EXCEPTION_IF_NULL(input);
+    MS_EXCEPTION_IF_NULL(input_idx);
+    MS_EXCEPTION_IF_NULL(output);
+    MS_EXCEPTION_IF_NULL(inverse_idx);
+    IndexType j = 0;
+    if (params->need_sort_) {
+      for (IndexType i = 0; i < params->input_size_; ++i) {
+        input_idx[i] = i;
+      }
+      std::sort(input_idx, input_idx + params->input_size_,
+                [&](IndexType left, IndexType right) { return input[left] < input[right]; });
+      DataType last = input[0];
+      for (IndexType i = 0; i < params->input_size_; ++i) {
+        auto curr = input[input_idx[i]];
+        if (i == 0 || curr != last) {
+          if (i != 0) {
+            j++;
+          }
+          output[j] = curr;
+          inverse_idx[input_idx[i]] = j;
+          last = curr;
+        } else {
+          inverse_idx[input_idx[i]] = j;
+        }
+      }
+      params->output_size_ = j + 1;
+    } else {
+      std::unordered_map<DataType, IndexType> uniq;
+      uniq.reserve(params->input_size_);
+      for (IndexType i = 0; i < params->input_size_; ++i) {
+        auto it = uniq.emplace(input[i], j);
+        inverse_idx[i] = it.first->second;
+        if (it.second) {
+          ++j;
+        }
+      }
+      for (const auto &it : uniq) {
+        output[it.second] = it.first;
+      }
+      params->output_size_ = j;
+    }
+    MS_LOG(DEBUG) << "End";
+  }
+
+  template <typename DataType, typename IndexType>
+  static void UniqueEachBucket(const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets) {
+    MS_LOG(DEBUG) << "Start";
+    size_t thread_num = buckets.size();
+    std::vector<std::thread> threads;
+    threads.reserve(thread_num);
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads.emplace_back(std::thread(Unique<DataType, IndexType>, buckets[i]));
+    }
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads[i].join();
+    }
+    MS_LOG(DEBUG) << "End";
+  }
+
+  template <typename DataType, typename IndexType>
+  static void TransformBucketReverseIndices(const std::shared_ptr<UniqueParam<DataType, IndexType>> &bucket,
+                                            const std::shared_ptr<UniqueParam<DataType, IndexType>> &result,
+                                            IndexType offset) {
+    MS_EXCEPTION_IF_NULL(bucket);
+    MS_EXCEPTION_IF_NULL(bucket->inverse_idx_);
+    MS_EXCEPTION_IF_NULL(bucket->workspace_idx_);
+    MS_EXCEPTION_IF_NULL(result);
+    MS_EXCEPTION_IF_NULL(result->inverse_idx_);
+    for (IndexType i = 0; i < bucket->input_size_; ++i) {
+      auto origin_idx = bucket->workspace_idx_[i];
+      if (origin_idx >= 0 && origin_idx < result->input_size_) {
+        result->inverse_idx_[origin_idx] = bucket->inverse_idx_[i] + offset;
+      }
+    }
+  }
+
+  template <typename DataType, typename IndexType>
+  static void MergeBuckets(const std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> &buckets,
+                           const std::shared_ptr<UniqueParam<DataType, IndexType>> &result) {
+    MS_LOG(DEBUG) << "Start";
+    MS_EXCEPTION_IF_NULL(result);
+    MS_EXCEPTION_IF_NULL(result->output_);
+    size_t thread_num = buckets.size();
+    std::vector<IndexType> bucket_offsets(thread_num);
+    IndexType current_size = 0;
+    for (size_t i = 0; i < thread_num; ++i) {
+      auto bucket = buckets[i];
+      MS_EXCEPTION_IF_NULL(bucket);
+      MS_EXCEPTION_IF_NULL(bucket->output_);
+      bucket_offsets[i] = current_size;
+      auto ret_code = memcpy_s(result->output_ + current_size, (result->input_size_ - current_size) * sizeof(DataType),
+                               bucket->output_, bucket->output_size_ * sizeof(DataType));
+      if (ret_code != EOK) {
+        MS_LOG(EXCEPTION) << "Failed to copy data!";
+      }
+      current_size += bucket->output_size_;
+    }
+    result->output_size_ = current_size;
+
+    std::vector<std::thread> threads;
+    threads.reserve(thread_num);
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads.emplace_back(
+        std::thread(TransformBucketReverseIndices<DataType, IndexType>, buckets[i], result, bucket_offsets[i]));
+    }
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads[i].join();
+    }
+    MS_LOG(DEBUG) << "End";
+  }
+
+  template <typename DataType, typename IndexType>
+  static void BucketUnique(const std::shared_ptr<UniqueParam<DataType, IndexType>> &params) {
+    MS_EXCEPTION_IF_NULL(params);
+    std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> segments;
+    std::vector<std::shared_ptr<UniqueParam<DataType, IndexType>>> buckets;
+    std::vector<std::shared_ptr<std::vector<IndexType>>> segment_bucket_sizes;
+    SplitAndCalculateBucketSize(params, &segments, &segment_bucket_sizes);
+    GatherSegmentsToBuckets(params, &segments, &segment_bucket_sizes, &buckets);
+    UniqueEachBucket(buckets);
+    MergeBuckets(buckets, params);
+  }
 };
 
 MS_REG_CPU_KERNEL(
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.cc
index 31e1e3195a..dbefe6ed90 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.cc
@@ -19,49 +19,33 @@
 
 namespace mindspore {
 namespace kernel {
-void UniqueWithPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
-  CheckParam(kernel_node);
-  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-  n_ = SizeToLong(input_shape[0]);
-  dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
-}
-
 bool UniqueWithPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
-                                    const std::vector<kernel::AddressPtr> & /*workspace*/,
+                                    const std::vector<kernel::AddressPtr> &workspace,
                                     const std::vector<kernel::AddressPtr> &outputs) {
+  UniqueCPUKernel::Launch(inputs, workspace, outputs);
   if (dtype_ == kNumberTypeInt32) {
-    LaunchKernel<int>(inputs, outputs);
+    PadOutput<int>(inputs, outputs);
   } else if (dtype_ == kNumberTypeInt64) {
-    LaunchKernel<int64_t>(inputs, outputs);
-  } else {
-    MS_LOG(EXCEPTION) << "Only unsupported int32 or int64 dtype";
+    PadOutput<int64_t>(inputs, outputs);
+  } else if (dtype_ == kNumberTypeFloat32) {
+    PadOutput<float>(inputs, outputs);
   }
   return true;
 }
 
 template <typename T>
-void UniqueWithPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
-                                          const std::vector<AddressPtr> &outputs) {
-  T *a = reinterpret_cast<T *>(inputs[0]->addr);
+void UniqueWithPadCPUKernel::PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
+  if (inputs.size() < 2) {
+    MS_LOG(EXCEPTION) << "Input size should be large than 1";
+  }
+  if (outputs.size() < 1) {
+    MS_LOG(EXCEPTION) << "Output size should be large than 0";
+  }
   T pad_num = *reinterpret_cast<T *>(inputs[1]->addr);
   T *out = reinterpret_cast<T *>(outputs[0]->addr);
-  T *idx_vec = reinterpret_cast<T *>(outputs[1]->addr);
-
-  for (int64_t i = 0; i < n_; ++i) {
+  for (size_t i = output_size_; i < input_size_; ++i) {
     out[i] = pad_num;
   }
-  std::unordered_map<T, int> uniq;
-  uniq.reserve(n_);
-  for (int64_t i = 0, j = 0; i < n_; ++i) {
-    auto it = uniq.emplace(a[i], j);
-    idx_vec[i] = it.first->second;
-    if (it.second) {
-      ++j;
-    }
-  }
-  for (const auto &it : uniq) {
-    out[it.second] = it.first;
-  }
 }
 
 void UniqueWithPadCPUKernel::CheckParam(const CNodePtr &kernel_node) {
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.h
index 23683a9c4d..759ef8b4df 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/unique_with_pad_cpu_kernel.h
@@ -16,31 +16,26 @@
 
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UNIQUE_WITH_PAD_CPU_KERNEL_H_
-#include <vector>
 #include <memory>
 #include <unordered_map>
+#include <vector>
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
+#include "backend/kernel_compiler/cpu/unique_cpu_kernel.h"
 
 namespace mindspore {
 namespace kernel {
-class UniqueWithPadCPUKernel : public CPUKernel {
+class UniqueWithPadCPUKernel : public UniqueCPUKernel {
  public:
   UniqueWithPadCPUKernel() = default;
   ~UniqueWithPadCPUKernel() override = default;
-
-  void InitKernel(const CNodePtr &kernel_node) override;
-
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs) override;
-
   template <typename T>
-  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
+  void PadOutput(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
 
- private:
-  void CheckParam(const CNodePtr &kernel_node);
-  int64_t n_{0};
-  TypeId dtype_{kTypeUnknown};
+ protected:
+  void CheckParam(const CNodePtr &kernel_node) override;
 };
 
 MS_REG_CPU_KERNEL(UniqueWithPad,
@@ -56,7 +51,15 @@ MS_REG_CPU_KERNEL(UniqueWithPad,
                     .AddInputAttr(kNumberTypeInt64)
                     .AddInputAttr(kNumberTypeInt64)
                     .AddOutputAttr(kNumberTypeInt64)
-                    .AddOutputAttr(kNumberTypeInt64),
+                    .AddOutputAttr(kNumberTypeInt32),
+                  UniqueWithPadCPUKernel);
+
+MS_REG_CPU_KERNEL(UniqueWithPad,
+                  KernelAttr()
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddInputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeFloat32)
+                    .AddOutputAttr(kNumberTypeInt32),
                   UniqueWithPadCPUKernel);
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/tests/st/ops/cpu/test_unique_op.py b/tests/st/ops/cpu/test_unique_op.py
index 0ad55b7808..4bf888bb11 100644
--- a/tests/st/ops/cpu/test_unique_op.py
+++ b/tests/st/ops/cpu/test_unique_op.py
@@ -33,7 +33,7 @@ class Net(nn.Cell):
         return self.uniq(x)
 
 
-def test_net():
+def test_net_fp32():
     x = Tensor(np.array([1, 2, 5, 2]), mstype.float32)
     uniq = Net()
     output = uniq(x)
@@ -45,3 +45,31 @@ def test_net():
 
     assert (output[0].asnumpy() == expect_y_result).all()
     assert (output[1].asnumpy() == expect_idx_result).all()
+
+
+def test_net_int32():
+    x = Tensor(np.array([1, 2, 5, 2]), mstype.int32)
+    uniq = Net()
+    output = uniq(x)
+    print("x:\n", x)
+    print("y:\n", output[0])
+    print("idx:\n", output[1])
+    expect_y_result = [1, 2, 5]
+    expect_idx_result = [0, 1, 2, 1]
+
+    assert (output[0].asnumpy() == expect_y_result).all()
+    assert (output[1].asnumpy() == expect_idx_result).all()
+
+
+def test_net_int64():
+    x = Tensor(np.array([1, 2, 5, 2]), mstype.int64)
+    uniq = Net()
+    output = uniq(x)
+    print("x:\n", x)
+    print("y:\n", output[0])
+    print("idx:\n", output[1])
+    expect_y_result = [1, 2, 5]
+    expect_idx_result = [0, 1, 2, 1]
+
+    assert (output[0].asnumpy() == expect_y_result).all()
+    assert (output[1].asnumpy() == expect_idx_result).all()
diff --git a/tests/ut/cpp/kernel/cpu/unique_cpu_kernel_test.cc b/tests/ut/cpp/kernel/cpu/unique_cpu_kernel_test.cc
index 943a1ace96..e835b24a80 100644
--- a/tests/ut/cpp/kernel/cpu/unique_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/unique_cpu_kernel_test.cc
@@ -29,7 +29,7 @@ class UniqueCpuKernelTest : public UT::Common {
   UniqueCpuKernelTest() : unique_(std::make_shared<UniqueCPUKernel>()) {}
 
   void SetUp() override {
-    unique_->n_ = 9;
+    unique_->input_size_ = 9;
     unique_->dtype_ = kNumberTypeFloat32;
     inputs_.clear();
     workspace_.clear();
@@ -42,16 +42,19 @@ class UniqueCpuKernelTest : public UT::Common {
     return kernel_addr;
   }
 
-  void CreateInputAddress() { inputs_.push_back(CreateKernelAddress(x_.data())); }
-
-  void CreateOutputAddress() {
+  void CreateAddress() {
+    inputs_.push_back(CreateKernelAddress(x_.data()));
     outputs_.push_back(CreateKernelAddress(y_.data()));
     outputs_.push_back(CreateKernelAddress(idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
   }
 
   std::vector<float> x_;
   std::vector<float> y_;
-  std::vector<int64_t> idx_;
+  std::vector<int> idx_;
+  std::vector<int64_t> workspace_idx_;
   std::vector<AddressPtr> inputs_;
   std::vector<AddressPtr> workspace_;
   std::vector<AddressPtr> outputs_;
@@ -62,13 +65,13 @@ TEST_F(UniqueCpuKernelTest, compute_test) {
   x_ = {1, 1, 2, 4, 4, 4, 7, 8, 8};
   y_ = {1, 1, 1, 1, 1};
   idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
-  CreateInputAddress();
-  CreateOutputAddress();
+  workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
+  CreateAddress();
   unique_->Launch(inputs_, workspace_, outputs_);
 
   // check compute result
   std::vector<float> expect_y{1, 2, 4, 7, 8};
-  std::vector<int64_t> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4};
+  std::vector<int> expect_idx{0, 0, 1, 2, 2, 2, 3, 4, 4};
   EXPECT_TRUE(y_ == expect_y);
   EXPECT_TRUE(idx_ == expect_idx);
 }
diff --git a/tests/ut/cpp/kernel/cpu/unique_with_pad_cpu_kernel_test.cc b/tests/ut/cpp/kernel/cpu/unique_with_pad_cpu_kernel_test.cc
index c9d1177f57..25d5727cf2 100644
--- a/tests/ut/cpp/kernel/cpu/unique_with_pad_cpu_kernel_test.cc
+++ b/tests/ut/cpp/kernel/cpu/unique_with_pad_cpu_kernel_test.cc
@@ -29,7 +29,7 @@ class UniqueWithPadCpuKernelTest : public UT::Common {
   UniqueWithPadCpuKernelTest() : unique_with_pad_(std::make_shared<UniqueWithPadCPUKernel>()) {}
 
   void SetUp() override {
-    unique_with_pad_->n_ = 10;
+    unique_with_pad_->input_size_ = 10;
     unique_with_pad_->dtype_ = kNumberTypeInt64;
     inputs_.clear();
     workspace_.clear();
@@ -42,21 +42,21 @@ class UniqueWithPadCpuKernelTest : public UT::Common {
     return kernel_addr;
   }
 
-  void CreateInputAddress() {
+  void CreateAddress() {
     inputs_.push_back(CreateKernelAddress(x_.data()));
     inputs_.push_back(CreateKernelAddress(&pad_dim_));
-    ;
-  }
-
-  void CreateOutputAddress() {
     outputs_.push_back(CreateKernelAddress(out_.data()));
     outputs_.push_back(CreateKernelAddress(idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
+    workspace_.push_back(CreateKernelAddress(workspace_idx_.data()));
   }
 
   std::vector<int64_t> x_;
   int64_t pad_dim_;
   std::vector<int64_t> out_;
-  std::vector<int64_t> idx_;
+  std::vector<int> idx_;
+  std::vector<int64_t> workspace_idx_;
   std::vector<AddressPtr> inputs_;
   std::vector<AddressPtr> workspace_;
   std::vector<AddressPtr> outputs_;
@@ -68,13 +68,13 @@ TEST_F(UniqueWithPadCpuKernelTest, compute_test) {
   pad_dim_ = 8;
   out_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
   idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-  CreateInputAddress();
-  CreateOutputAddress();
+  workspace_idx_ = {1, 1, 1, 1, 1, 1, 1, 1, 1};
+  CreateAddress();
   unique_with_pad_->Launch(inputs_, workspace_, outputs_);
 
   // check compute result
-  std::vector<int64_t> expect_out{1, 5, 4, 3, 2, 8, 8, 8, 8, 8};
-  std::vector<int64_t> expect_idx{0, 0, 1, 1, 2, 2, 3, 3, 4, 4};
+  std::vector<int64_t> expect_out{1, 2, 3, 4, 5, 8, 8, 8, 8, 8};
+  std::vector<int> expect_idx{0, 0, 4, 4, 3, 3, 2, 2, 1, 1};
   EXPECT_TRUE(out_ == expect_out);
   EXPECT_TRUE(idx_ == expect_idx);
 }