diff --git a/mindspore/ccsrc/kernel/common_utils.cc b/mindspore/ccsrc/kernel/common_utils.cc
index 526aca9a31..3fe928a1af 100644
--- a/mindspore/ccsrc/kernel/common_utils.cc
+++ b/mindspore/ccsrc/kernel/common_utils.cc
@@ -579,8 +579,40 @@ void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
   }
 }
 
+void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
+                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
+                                        std::vector<size_t> *slice_positions) {
+  MS_LOG(DEBUG) << "Start";
+  size_t thread_num = 24;
+  if (slice_positions->size() < thread_num) {
+    thread_num = slice_positions->size();
+  }
+  size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
+  thread_num = (slice_positions->size() + stride - 1) / stride;
+  std::vector<std::thread> threads;
+  size_t max_length = sorted_indices->size() * outer_dim;
+  for (size_t i = 0; i < thread_num; ++i) {
+    size_t slice_start = i * stride;
+    size_t slice_end = 0;
+    if (i == thread_num - 1) {
+      slice_end = slice_positions->size();
+    } else {
+      slice_end = slice_start + stride;
+    }
+    WorkerParamsForReduceSparseGradient params{
+      slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
+      unique_grad};
+    threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
+  }
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads[i].join();
+  }
+  MS_LOG(DEBUG) << "End";
+}
+
 void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
-                          size_t outer_dim) {
+                          size_t outer_dim, bool use_multi_threads) {
+  MS_LOG(DEBUG) << "Start";
   MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
   MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
   MS_EXCEPTION_IF_NULL(unique_grad);
@@ -599,42 +631,35 @@ void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradie
     [](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
   int last_index = 0;
   std::vector<size_t> slice_positions;
+  slice_positions.reserve(sorted_indices.size());
   for (size_t i = 0; i < sorted_indices.size(); ++i) {
     if (i == 0 || last_index != sorted_indices[i].first) {
       slice_positions.emplace_back(i);
     }
     last_index = sorted_indices[i].first;
   }
-  size_t thread_num = 8;
-  if (slice_positions.size() < thread_num) {
-    thread_num = slice_positions.size();
-  }
-  size_t stride = (slice_positions.size() + thread_num - 1) / thread_num;
-  thread_num = (slice_positions.size() + stride - 1) / stride;
-  std::vector<std::thread> threads;
-  size_t max_length = sorted_indices.size() * outer_dim;
-  for (size_t i = 0; i < thread_num; ++i) {
-    size_t slice_start = i * stride;
-    size_t slice_end = 0;
-    if (i == thread_num - 1) {
-      slice_end = slice_positions.size();
-    } else {
-      slice_end = slice_start + stride;
-    }
-    WorkerParamsForReduceSparseGradient params{
-      slice_start, slice_end, max_length, outer_dim, &sorted_indices, &slice_positions, origin_sparse_grad.value_,
-      unique_grad};
-    threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
-  }
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads[i].join();
+  if (use_multi_threads) {
+    RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
+  } else {
+    size_t max_length = sorted_indices.size() * outer_dim;
+    WorkerParamsForReduceSparseGradient params{0,
+                                               slice_positions.size(),
+                                               max_length,
+                                               outer_dim,
+                                               &sorted_indices,
+                                               &slice_positions,
+                                               origin_sparse_grad.value_,
+                                               unique_grad};
+    WorkerForReduceSparseGradient(params);
   }
   unique_grad->indices_size_ = slice_positions.size();
+  MS_LOG(DEBUG) << "End";
 }
 
 void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
                                SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
                                size_t outer_dim) {
+  MS_LOG(DEBUG) << "Start";
   if (unique_slice_grads.empty()) {
     return;
   }
@@ -658,10 +683,12 @@ void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient
   }
   tmp_grad->indices_size_ = unique_indices_size;
   ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
+  MS_LOG(DEBUG) << "End";
 }
 
 void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
                                   SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
+  MS_LOG(DEBUG) << "Start";
   MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
   MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
   MS_EXCEPTION_IF_NULL(unique_grad);
@@ -693,12 +720,13 @@ void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, Spar
     unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
     unique_slice_grads[i]->indices_size_ = indices_size;
     threads.emplace_back(
-      std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim));
+      std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
   }
   for (size_t i = 0; i < thread_num; ++i) {
     threads[i].join();
   }
   ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
+  MS_LOG(DEBUG) << "End";
 }
 
 std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index) {
diff --git a/mindspore/ccsrc/kernel/common_utils.h b/mindspore/ccsrc/kernel/common_utils.h
index 13d36e2d53..3d8807c4ce 100644
--- a/mindspore/ccsrc/kernel/common_utils.h
+++ b/mindspore/ccsrc/kernel/common_utils.h
@@ -115,7 +115,7 @@ int Sign(float x);
 void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                               size_t outer_dim);
 void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
-                          size_t outer_dim);
+                          size_t outer_dim, bool use_multi_threads = true);
 std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
 std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
                                                                             const std::vector<AnfNodePtr> &input_list);
@@ -130,6 +130,9 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector
+void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
+                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
+                                        std::vector<size_t> *slice_positions);
 void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
                                SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
                                size_t outer_dim);
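
Note on the threading scheme above: RunMultiThreadReduceSparseGradient splits slice_positions (one entry per unique index) across at most 24 worker threads using ceiling division, and the last thread absorbs the remainder. The sketch below reproduces only that partitioning arithmetic as a standalone program; PartitionSlices, its empty-input guard, and the values in main are illustrative and are not part of the MindSpore sources.

// Standalone sketch of the slice-partitioning arithmetic used by the new
// RunMultiThreadReduceSparseGradient; PartitionSlices is an assumed helper name.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

std::vector<std::pair<size_t, size_t>> PartitionSlices(size_t slice_count, size_t max_threads = 24) {
  std::vector<std::pair<size_t, size_t>> ranges;
  if (slice_count == 0) {
    return ranges;  // guard added for the sketch; not present in the kernel code
  }
  size_t thread_num = std::min(max_threads, slice_count);
  // Ceiling division: each worker handles `stride` slices ...
  size_t stride = (slice_count + thread_num - 1) / thread_num;
  // ... and the worker count shrinks again if the stride over-covers the range.
  thread_num = (slice_count + stride - 1) / stride;
  for (size_t i = 0; i < thread_num; ++i) {
    size_t slice_start = i * stride;
    // The last worker takes whatever is left, which may be shorter than `stride`.
    size_t slice_end = (i + 1 == thread_num) ? slice_count : slice_start + stride;
    ranges.emplace_back(slice_start, slice_end);
  }
  return ranges;
}

int main() {
  // 100 unique indices split over at most 24 workers -> 20 ranges of 5 slices each.
  for (const auto &range : PartitionSlices(100)) {
    std::cout << "[" << range.first << ", " << range.second << ")\n";
  }
  return 0;
}

The single-threaded path added to ReduceSparseGradient (reached with use_multi_threads = false) covers the whole range [0, slice_positions.size()) with one WorkerForReduceSparseGradient call; TwoLevelReduceSparseGradient passes false for the per-slice reductions it already runs on its own threads, so the change avoids spawning threads from inside worker threads.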