add ParallelFor for cpu kernel

pull/10368/head
wuxuejian 4 years ago
parent 7e04fab748
commit ab8b2ab826

@@ -79,5 +79,24 @@ void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std
  }
  std::reverse(element_num->begin(), element_num->end());
}

void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  auto max_thread_num = std::thread::hardware_concurrency();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    threads.emplace_back(std::thread(task, start, end));
    start += once_compute_size;
  }
  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i].join();
  }
}
}  // namespace kernel
}  // namespace mindspore
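For reference, a minimal standalone sketch of the same chunking scheme (not MindSpore code): count is split into blocks of roughly 128 elements, capped at std::thread::hardware_concurrency(), and each thread runs the task over its own [start, end) slice. The elementwise-add driver and the guard that keeps thread_num at least 1 are illustrative additions, not part of the patch above.

#include <algorithm>
#include <cmath>
#include <functional>
#include <thread>
#include <vector>

// Same splitting scheme as CPUKernelUtils::ParallelFor above:
// aim for ~128 elements per thread, capped at hardware_concurrency().
void ParallelFor(const std::function<void(size_t, size_t)> &task, size_t count) {
  size_t max_thread_num = std::thread::hardware_concurrency();
  const float block_size = 128.0;
  size_t thread_num =
    count < block_size * max_thread_num ? static_cast<size_t>(std::ceil(count / block_size)) : max_thread_num;
  thread_num = std::max(thread_num, static_cast<size_t>(1));  // illustrative guard, avoids dividing by zero below
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  for (size_t start = 0; start < count; start += once_compute_size) {
    size_t end = std::min(start + once_compute_size, count);
    threads.emplace_back(task, start, end);
  }
  for (auto &t : threads) {
    t.join();
  }
}

int main() {
  std::vector<float> a(1000, 1.0f), b(1000, 2.0f), out(1000);
  // Each thread fills its own disjoint [start, end) slice, so no locking is needed.
  auto task = [&](size_t start, size_t end) {
    for (size_t i = start; i < end; ++i) {
      out[i] = a[i] + b[i];
    }
  };
  ParallelFor(task, out.size());
  return 0;
}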

@@ -19,6 +19,7 @@
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/anf_runtime_algorithm.h"
@@ -26,6 +27,7 @@
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
using CTask = std::function<void(size_t, size_t)>;
namespace mindspore {
namespace kernel {
const char KSIZE[] = "ksize";
@@ -106,6 +108,7 @@ class CPUKernelUtils {
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  static void ParallelFor(const CTask &task, size_t count);
};
}  // namespace kernel
}  // namespace mindspore

@@ -117,12 +117,15 @@ bool MaxPoolingGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu
  size_t src_wh = src_shape_[2] * src_shape_[3];
  size_t dst_wh = dst_shape_[2] * dst_shape_[3];
  for (size_t n = 0; n < src_shape_[0]; ++n) {
    for (size_t c = 0; c < src_shape_[1]; ++c) {
      ChannelPoolingGrad(input, diff, output);
      input = input + src_wh;
      output = output + src_wh;
      diff = diff + dst_wh;
    }
    auto task = [&](size_t start, size_t end) {
      for (size_t c = start; c < end; ++c) {
        ChannelPoolingGrad(input, diff, output);
        input = input + src_wh;
        output = output + src_wh;
        diff = diff + dst_wh;
      }
    };
    CPUKernelUtils::ParallelFor(task, src_shape_[1]);
  }
  return true;
}
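One thing to watch in this hunk: the task lambda that replaces the serial per-channel loop captures input, output, and diff by reference and advances them inside the task, so once ParallelFor spawns more than one thread, every chunk starts from the same shared pointers and the increments race. Below is a sketch of one way to keep the chunks independent, reusing the identifiers from the hunk and deriving each channel's offset from c instead of mutating the base pointers; this is a suggestion, not part of the commit as shown.

  for (size_t n = 0; n < src_shape_[0]; ++n) {
    // Each chunk touches only its own channels; nothing shared is written inside the task.
    auto task = [&](size_t start, size_t end) {
      for (size_t c = start; c < end; ++c) {
        ChannelPoolingGrad(input + c * src_wh, diff + c * dst_wh, output + c * src_wh);
      }
    };
    CPUKernelUtils::ParallelFor(task, src_shape_[1]);
    // Advance the base pointers once per batch element, after all worker threads have joined.
    input += src_shape_[1] * src_wh;
    output += src_shape_[1] * src_wh;
    diff += src_shape_[1] * dst_wh;
  }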

@@ -164,27 +164,30 @@ void ReduceCPUKernel::ConvertDataToOutput(const float *new_input, float *output)
void ReduceCPUKernel::Transpose(const int size, const float *input, const std::vector<size_t> &input_shape,
                                const std::vector<size_t> &input_axis, const int shape_size, float *output) {
  int pos_array[kMaxDim];
  int size_offset[kMaxDim];
  size_offset[0] = size / SizeToInt(input_shape[0]);
  for (int i = 1; i < shape_size; ++i) {
    size_offset[i] = size_offset[i - 1] / SizeToInt(input_shape[i]);
  }
  for (int position = 0; position < size; position += 1) {
    int temp_position = position;
    pos_array[0] = temp_position / size_offset[0];
    for (int i = 1; i < shape_size; ++i) {
      temp_position -= pos_array[i - 1] * size_offset[i - 1];
      pos_array[i] = temp_position / size_offset[i];
    }
    int new_position = pos_array[SizeToInt(input_axis[shape_size - 1])];
    int new_position_size = 1;
    for (int j = shape_size - 2; j >= 0; j--) {
      new_position_size *= SizeToInt(input_shape[SizeToInt(input_axis[j + 1])]);
      new_position += pos_array[SizeToInt(input_axis[j])] * new_position_size;
  auto task = [&](size_t start, size_t end) {
    int pos_array[kMaxDim];
    for (size_t position = start; position < end; position += 1) {
      size_t temp_position = position;
      pos_array[0] = temp_position / size_offset[0];
      for (int i = 1; i < shape_size; ++i) {
        temp_position -= pos_array[i - 1] * size_offset[i - 1];
        pos_array[i] = temp_position / size_offset[i];
      }
      size_t new_position = pos_array[SizeToInt(input_axis[shape_size - 1])];
      size_t new_position_size = 1;
      for (int j = shape_size - 2; j >= 0; j--) {
        new_position_size *= SizeToInt(input_shape[SizeToInt(input_axis[j + 1])]);
        new_position += pos_array[SizeToInt(input_axis[j])] * new_position_size;
      }
      output[new_position] = input[position];
    }
    output[new_position] = input[position];
  }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return;
}
}  // namespace kernel
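The task body here runs the same flat-index arithmetic as the loop it replaces, restricted to a [start, end) sub-range; each input position maps to a distinct output element, so the chunks never write the same location. To make the decompose/recombine arithmetic concrete, here is a small standalone walk-through for a hypothetical 2x3 tensor transposed with axis order {1, 0}; the shape and values are made up for illustration, and MindSpore's SizeToInt and kMaxDim helpers are replaced with plain casts and fixed-size arrays.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical 2x3 input, transposed with axis order {1, 0} -> 3x2 output.
  std::vector<size_t> input_shape = {2, 3};
  std::vector<size_t> input_axis = {1, 0};
  std::vector<float> input = {0, 1, 2, 3, 4, 5};
  std::vector<float> output(input.size());
  const int shape_size = 2;
  const int size = static_cast<int>(input.size());

  // size_offset[i] is the number of elements covered by one step along axis i of the input.
  int size_offset[2];
  size_offset[0] = size / static_cast<int>(input_shape[0]);
  for (int i = 1; i < shape_size; ++i) {
    size_offset[i] = size_offset[i - 1] / static_cast<int>(input_shape[i]);
  }

  for (size_t position = 0; position < input.size(); ++position) {
    // Decompose the flat input index into per-axis coordinates.
    int pos_array[2];
    size_t temp_position = position;
    pos_array[0] = temp_position / size_offset[0];
    for (int i = 1; i < shape_size; ++i) {
      temp_position -= pos_array[i - 1] * size_offset[i - 1];
      pos_array[i] = temp_position / size_offset[i];
    }
    // Recombine the coordinates in the permuted axis order to get the output index.
    size_t new_position = pos_array[input_axis[shape_size - 1]];
    size_t new_position_size = 1;
    for (int j = shape_size - 2; j >= 0; j--) {
      new_position_size *= input_shape[input_axis[j + 1]];
      new_position += pos_array[input_axis[j]] * new_position_size;
    }
    output[new_position] = input[position];
  }

  // Prints 0 3 1 4 2 5, i.e. the 3x2 transpose in row-major order.
  for (float v : output) {
    printf("%g ", v);
  }
  printf("\n");
  return 0;
}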
