[kunlun] Fix sync in multi Kunlun XPU dygraph training. (#30943)

liuyuhui committed 4 years ago · commit 87197f8c2e · parent 99bf6228b8

@@ -626,6 +626,18 @@ void Reducer::MarkGroupReady(size_t group_index) {
     // group.dense_tensors ---> group.dense_contents_
     group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
+    // NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL
+    // only supports the default stream for communication, so synchronization
+    // can go wrong here and a WaitComm is needed.
+    // TODO(liuyuhui): if BKCL gains event support, this should be reworked
+    // as non-blocking communication.
+#ifdef PADDLE_WITH_XPU_BKCL
+    if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
+      parallel_ctx_->WaitComm(run_order);
+    }
+#endif
     // Start allreduce
     parallel_ctx_->AllReduceByStream(
         group.dense_contents_, &(group.dense_contents_), run_order, false);
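
Why a blocking wait fixes the bug can be hard to see from the patch alone. Below is a self-contained toy model, not Paddle code: two host threads stand in for the two device streams, and the condition-variable wait plays the role of parallel_ctx_->WaitComm(run_order). All names in it (concat, allreduce, buffer, concat_done) are made up for illustration. Without the wait, the reducing "stream" could read a half-filled buffer; with it, the result is deterministic.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

int main() {
  std::vector<int> buffer(1 << 20, 0);
  std::mutex m;
  std::condition_variable cv;
  bool concat_done = false;

  // "Stream A": fills the flat buffer, as group.ConcatTensors does on the
  // communication stream in the real code.
  std::thread concat([&] {
    for (int& v : buffer) v = 1;
    { std::lock_guard<std::mutex> lk(m); concat_done = true; }
    cv.notify_one();
  });

  // "Stream B": consumes the buffer, as AllReduceByStream does. The blocking
  // wait below is the analogue of parallel_ctx_->WaitComm(run_order); remove
  // it and the sum becomes racy.
  std::thread allreduce([&] {
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [&] { return concat_done; });
    long long sum = std::accumulate(buffer.begin(), buffer.end(), 0LL);
    std::cout << "sum = " << sum << " (always 1<<20 thanks to the wait)\n";
  });

  concat.join();
  allreduce.join();
  return 0;
}

The TODO in the patch points at the cheaper alternative: with event support, the communication could stay non-blocking, with the collective enqueued behind an event recorded by the concat work instead of stalling until WaitComm returns.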

@@ -55,7 +55,7 @@ class TestParallelDygraphMnistXPU(TestDistBase):
         if fluid.core.is_compiled_with_xpu():
             self.check_with_place(
                 "parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
                 check_error_log=True,
                 log_name=flag_name)
@@ -94,7 +94,7 @@ class TestFleetDygraphMnistXPU(TestDistBase):
         if fluid.core.is_compiled_with_xpu():
             self.check_with_place(
                 "parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
                 check_error_log=True,
                 log_name=flag_name)
