add ParallelFor for cpu kernel

pull/10368/head
wuxuejian 4 years ago
parent 7e04fab748
commit ab8b2ab826

@@ -79,5 +79,24 @@ void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std
  }
  std::reverse(element_num->begin(), element_num->end());
}

void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  auto max_thread_num = std::thread::hardware_concurrency();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    threads.emplace_back(std::thread(task, start, end));
    start += once_compute_size;
  }
  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i].join();
  }
}
}  // namespace kernel
}  // namespace mindspore
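For reference, a minimal standalone sketch of the same chunking scheme (not MindSpore code): count is split into blocks of roughly 128 elements, capped at std::thread::hardware_concurrency(), and each thread runs the task over its own [start, end) slice. The elementwise-add driver and the guard that keeps thread_num at least 1 are illustrative additions, not part of the patch above.

#include <algorithm>
#include <cmath>
#include <functional>
#include <thread>
#include <vector>

// Same splitting scheme as CPUKernelUtils::ParallelFor above:
// aim for ~128 elements per thread, capped at hardware_concurrency().
void ParallelFor(const std::function<void(size_t, size_t)> &task, size_t count) {
  size_t max_thread_num = std::thread::hardware_concurrency();
  const float block_size = 128.0;
  size_t thread_num =
    count < block_size * max_thread_num ? static_cast<size_t>(std::ceil(count / block_size)) : max_thread_num;
  thread_num = std::max(thread_num, static_cast<size_t>(1));  // illustrative guard, avoids dividing by zero below
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  for (size_t start = 0; start < count; start += once_compute_size) {
    size_t end = std::min(start + once_compute_size, count);
    threads.emplace_back(task, start, end);
  }
  for (auto &t : threads) {
    t.join();
  }
}

int main() {
  std::vector<float> a(1000, 1.0f), b(1000, 2.0f), out(1000);
  // Each thread fills its own disjoint [start, end) slice, so no locking is needed.
  auto task = [&](size_t start, size_t end) {
    for (size_t i = start; i < end; ++i) {
      out[i] = a[i] + b[i];
    }
  };
  ParallelFor(task, out.size());
  return 0;
}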

@@ -19,6 +19,7 @@
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/anf_runtime_algorithm.h"
@@ -26,6 +27,7 @@
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
using CTask = std::function<void(size_t, size_t)>;
namespace mindspore {
namespace kernel {
const char KSIZE[] = "ksize";
@@ -106,6 +108,7 @@ class CPUKernelUtils {
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  static void ParallelFor(const CTask &task, size_t count);
};
}  // namespace kernel
}  // namespace mindspore

@@ -117,12 +117,15 @@ bool MaxPoolingGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu
  size_t src_wh = src_shape_[2] * src_shape_[3];
  size_t dst_wh = dst_shape_[2] * dst_shape_[3];
  for (size_t n = 0; n < src_shape_[0]; ++n) {
    for (size_t c = 0; c < src_shape_[1]; ++c) {
      ChannelPoolingGrad(input, diff, output);
      input = input + src_wh;
      output = output + src_wh;
      diff = diff + dst_wh;
    }
    auto task = [&](size_t start, size_t end) {
      for (size_t c = start; c < end; ++c) {
        ChannelPoolingGrad(input, diff, output);
        input = input + src_wh;
        output = output + src_wh;
        diff = diff + dst_wh;
      }
    };
    CPUKernelUtils::ParallelFor(task, src_shape_[1]);
  }
  return true;
}
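One thing to watch in this hunk: the task lambda that replaces the serial per-channel loop captures input, output, and diff by reference and advances them inside the task, so once ParallelFor spawns more than one thread, every chunk starts from the same shared pointers and the increments race. Below is a sketch of one way to keep the chunks independent, reusing the identifiers from the hunk and deriving each channel's offset from c instead of mutating the base pointers; this is a suggestion, not part of the commit as shown.

  for (size_t n = 0; n < src_shape_[0]; ++n) {
    // Each chunk touches only its own channels; nothing shared is written inside the task.
    auto task = [&](size_t start, size_t end) {
      for (size_t c = start; c < end; ++c) {
        ChannelPoolingGrad(input + c * src_wh, diff + c * dst_wh, output + c * src_wh);
      }
    };
    CPUKernelUtils::ParallelFor(task, src_shape_[1]);
    // Advance the base pointers once per batch element, after all worker threads have joined.
    input += src_shape_[1] * src_wh;
    output += src_shape_[1] * src_wh;
    diff += src_shape_[1] * dst_wh;
  }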

@@ -164,27 +164,30 @@ void ReduceCPUKernel::ConvertDataToOutput(const float *new_input, float *output)
void ReduceCPUKernel::Transpose(const int size, const float *input, const std::vector<size_t> &input_shape,
                                const std::vector<size_t> &input_axis, const int shape_size, float *output) {
  int pos_array[kMaxDim];
  int size_offset[kMaxDim];
  size_offset[0] = size / SizeToInt(input_shape[0]);
  for (int i = 1; i < shape_size; ++i) {
    size_offset[i] = size_offset[i - 1] / SizeToInt(input_shape[i]);
  }
  for (int position = 0; position < size; position += 1) {
    int temp_position = position;
    pos_array[0] = temp_position / size_offset[0];
    for (int i = 1; i < shape_size; ++i) {
      temp_position -= pos_array[i - 1] * size_offset[i - 1];
      pos_array[i] = temp_position / size_offset[i];
    }
    int new_position = pos_array[SizeToInt(input_axis[shape_size - 1])];
    int new_position_size = 1;
    for (int j = shape_size - 2; j >= 0; j--) {
      new_position_size *= SizeToInt(input_shape[SizeToInt(input_axis[j + 1])]);
      new_position += pos_array[SizeToInt(input_axis[j])] * new_position_size;
  auto task = [&](size_t start, size_t end) {
    int pos_array[kMaxDim];
    for (size_t position = start; position < end; position += 1) {
      size_t temp_position = position;
      pos_array[0] = temp_position / size_offset[0];
      for (int i = 1; i < shape_size; ++i) {
        temp_position -= pos_array[i - 1] * size_offset[i - 1];
        pos_array[i] = temp_position / size_offset[i];
      }
      size_t new_position = pos_array[SizeToInt(input_axis[shape_size - 1])];
      size_t new_position_size = 1;
      for (int j = shape_size - 2; j >= 0; j--) {
        new_position_size *= SizeToInt(input_shape[SizeToInt(input_axis[j + 1])]);
        new_position += pos_array[SizeToInt(input_axis[j])] * new_position_size;
      }
      output[new_position] = input[position];
    }
    output[new_position] = input[position];
  }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return;
}
}  // namespace kernel
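The task body here runs the same flat-index arithmetic as the loop it replaces, restricted to a [start, end) sub-range; each input position maps to a distinct output element, so the chunks never write the same location. To make the decompose/recombine arithmetic concrete, here is a small standalone walk-through for a hypothetical 2x3 tensor transposed with axis order {1, 0}; the shape and values are made up for illustration, and MindSpore's SizeToInt and kMaxDim helpers are replaced with plain casts and fixed-size arrays.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical 2x3 input, transposed with axis order {1, 0} -> 3x2 output.
  std::vector<size_t> input_shape = {2, 3};
  std::vector<size_t> input_axis = {1, 0};
  std::vector<float> input = {0, 1, 2, 3, 4, 5};
  std::vector<float> output(input.size());
  const int shape_size = 2;
  const int size = static_cast<int>(input.size());

  // size_offset[i] is the number of elements covered by one step along axis i of the input.
  int size_offset[2];
  size_offset[0] = size / static_cast<int>(input_shape[0]);
  for (int i = 1; i < shape_size; ++i) {
    size_offset[i] = size_offset[i - 1] / static_cast<int>(input_shape[i]);
  }

  for (size_t position = 0; position < input.size(); ++position) {
    // Decompose the flat input index into per-axis coordinates.
    int pos_array[2];
    size_t temp_position = position;
    pos_array[0] = temp_position / size_offset[0];
    for (int i = 1; i < shape_size; ++i) {
      temp_position -= pos_array[i - 1] * size_offset[i - 1];
      pos_array[i] = temp_position / size_offset[i];
    }
    // Recombine the coordinates in the permuted axis order to get the output index.
    size_t new_position = pos_array[input_axis[shape_size - 1]];
    size_t new_position_size = 1;
    for (int j = shape_size - 2; j >= 0; j--) {
      new_position_size *= input_shape[input_axis[j + 1]];
      new_position += pos_array[input_axis[j]] * new_position_size;
    }
    output[new_position] = input[position];
  }

  // Prints 0 3 1 4 2 5, i.e. the 3x2 transpose in row-major order.
  for (float v : output) {
    printf("%g ", v);
  }
  printf("\n");
  return 0;
}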
