@@ -626,6 +626,18 @@ void Reducer::MarkGroupReady(size_t group_index) {
     // group.dense_tensors ---> group.dense_contents_
     group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));

     // NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL only
     // supports the default stream for communication, so there are synchronization
     // problems here and a WaitComm is needed before the allreduce.
     // TODO(liuyuhui): If BKCL supports events, this should be changed to
     // non-blocking communication.
#ifdef PADDLE_WITH_XPU_BKCL
     if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
       parallel_ctx_->WaitComm(run_order);
     }
#endif

     // Start allreduce
     parallel_ctx_->AllReduceByStream(
         group.dense_contents_, &(group.dense_contents_), run_order, false);