!9870 Modify the thread number used by some CPU kernels

From: @kisnwang
Reviewed-by: @zhoufeng54,@jjfeing,@chujinjin
Signed-off-by: @chujinjin
pull/9870/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit de142fef2b

@ -24,7 +24,11 @@
namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {
constexpr size_t kAdamDeltaInputSize = 9; constexpr size_t kAdamDeltaInputSize = 9;
#ifdef ENABLE_D
constexpr size_t kUsedThreadNum = 23; constexpr size_t kUsedThreadNum = 23;
#else
constexpr size_t kUsedThreadNum = 8;
#endif
namespace { namespace {
struct ComputeParam { struct ComputeParam {
float *delta_{nullptr}; float *delta_{nullptr};

@ -22,6 +22,11 @@
namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {
namespace { namespace {
#ifdef ENABLE_D
constexpr size_t kUsedThreadNum = 23;
#else
constexpr size_t kUsedThreadNum = 8;
#endif
template <typename T> template <typename T>
void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t indices_lens, void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t indices_lens,
size_t outer_dim_size, T offset, size_t first_dim_size) { size_t outer_dim_size, T offset, size_t first_dim_size) {
@ -92,10 +97,9 @@ void EmbeddingLookUpCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto indices_addr = reinterpret_cast<T *>(inputs[1]->addr); auto indices_addr = reinterpret_cast<T *>(inputs[1]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
const size_t kMaxThreadNum = 16;
size_t thread_num = indices_lens_ / 10000 + 1; size_t thread_num = indices_lens_ / 10000 + 1;
thread_num = thread_num > kMaxThreadNum ? kMaxThreadNum : thread_num; thread_num = thread_num > kUsedThreadNum ? kUsedThreadNum : thread_num;
std::thread threads[kMaxThreadNum]; std::thread threads[kUsedThreadNum];
size_t task_proc_lens = (indices_lens_ + thread_num - 1) / thread_num; size_t task_proc_lens = (indices_lens_ + thread_num - 1) / thread_num;
size_t i; size_t i;
size_t task_offset = 0; size_t task_offset = 0;

@ -22,6 +22,11 @@
namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {
namespace { namespace {
#ifdef ENABLE_D
constexpr size_t kUsedThreadNum = 23;
#else
constexpr size_t kUsedThreadNum = 8;
#endif
template <typename T> template <typename T>
void Compute(const ComputeParams<T> *params, const size_t start, const size_t end) { void Compute(const ComputeParams<T> *params, const size_t start, const size_t end) {
MS_EXCEPTION_IF_NULL(params); MS_EXCEPTION_IF_NULL(params);
@ -115,10 +120,9 @@ void ScatterNdUpdateCPUKernel::LaunchKernel(const std::vector<AddressPtr> &input
params.indices_unit_rank_ = indices_unit_rank_; params.indices_unit_rank_ = indices_unit_rank_;
params.out_strides_ = &out_strides_; params.out_strides_ = &out_strides_;
const size_t thread_num = 24;
std::vector<Task> tasks; std::vector<Task> tasks;
size_t start = 0; size_t start = 0;
size_t once_compute_size = (num_units_ + thread_num - 1) / thread_num; size_t once_compute_size = (num_units_ + kUsedThreadNum - 1) / kUsedThreadNum;
while (start < num_units_) { while (start < num_units_) {
size_t end = (start + once_compute_size) > num_units_ ? num_units_ : (start + once_compute_size); size_t end = (start + once_compute_size) > num_units_ ? num_units_ : (start + once_compute_size);
auto task = [&params, start, end]() -> int { auto task = [&params, start, end]() -> int {

@ -27,6 +27,11 @@
namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {
#ifdef ENABLE_D
constexpr size_t kUsedThreadNum = 23;
#else
constexpr size_t kUsedThreadNum = 8;
#endif
template <typename T> template <typename T>
struct SparseGradient { struct SparseGradient {
float *value_{nullptr}; float *value_{nullptr};
@ -95,7 +100,7 @@ class SparseOptimizerCPUKernel : public CPUKernel {
static void BucketReduceSparseGradient(const ReduceSparseGradientParam<T> &param) { static void BucketReduceSparseGradient(const ReduceSparseGradientParam<T> &param) {
MS_LOG(DEBUG) << "Start"; MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(param.input_grad_); MS_EXCEPTION_IF_NULL(param.input_grad_);
size_t thread_num = 23; size_t thread_num = kUsedThreadNum;
if (param.input_grad_->indices_size_ < thread_num) { if (param.input_grad_->indices_size_ < thread_num) {
thread_num = param.input_grad_->indices_size_; thread_num = param.input_grad_->indices_size_;
} }
@ -120,11 +125,10 @@ class SparseOptimizerCPUKernel : public CPUKernel {
template <typename T> template <typename T>
void MultiThreadCompute(const MultiThreadComputeFunc<T> &func, MultiThreadComputeParams<T> *params, void MultiThreadCompute(const MultiThreadComputeFunc<T> &func, MultiThreadComputeParams<T> *params,
size_t total_compute_size) const { size_t total_compute_size) const {
const size_t kThreadNum = 24;
std::vector<std::thread> threads; std::vector<std::thread> threads;
threads.reserve(kThreadNum); threads.reserve(kUsedThreadNum);
size_t start = 0; size_t start = 0;
size_t once_compute_size = (total_compute_size + kThreadNum - 1) / kThreadNum; size_t once_compute_size = (total_compute_size + kUsedThreadNum - 1) / kUsedThreadNum;
while (start < total_compute_size) { while (start < total_compute_size) {
size_t end = (start + once_compute_size) > total_compute_size ? total_compute_size : (start + once_compute_size); size_t end = (start + once_compute_size) > total_compute_size ? total_compute_size : (start + once_compute_size);
threads.emplace_back(std::thread(func, params, start, end)); threads.emplace_back(std::thread(func, params, start, end));

@ -20,7 +20,11 @@
namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {
const size_t kUseBucketUniqueSize = 100000; const size_t kUseBucketUniqueSize = 100000;
const size_t kUniqueThreadNum = 23; #ifdef ENABLE_D
constexpr size_t kUniqueThreadNum = 23;
#else
constexpr size_t kUniqueThreadNum = 8;
#endif
void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) { void UniqueCPUKernel::InitKernel(const CNodePtr &kernel_node) {
node_ = kernel_node; node_ = kernel_node;
CheckParam(kernel_node); CheckParam(kernel_node);

Loading…
Cancel
Save