@@ -626,6 +626,18 @@ void Reducer::MarkGroupReady(size_t group_index) {
     // group.dense_tensors ---> group.dense_contents_
     group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));

     // NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL only
     // supports the default stream for communication, so there are synchronization
     // problems here and a WaitComm is needed before the allreduce.
     // TODO(liuyuhui): If BKCL supports events, this should be changed to
     // non-blocking communication.
#ifdef PADDLE_WITH_XPU_BKCL
     if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
       parallel_ctx_->WaitComm(run_order);
     }
#endif

     // Start allreduce
     parallel_ctx_->AllReduceByStream(
         group.dense_contents_, &(group.dense_contents_), run_order, false);