[Kunlun] PR2: Support MultiDevicePass and BKCL in parallel executor (#29574)

4 years ago · 4427df37cf
parent 0b74428db8
commit 4427df37cf
59 changed files with 1479 additions and 290 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -29,7 +29,7 @@ include(generic)            # simplify cmake module
 find_package(CUDA QUIET)
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_TENSORRT    "Compile PaddlePaddle with NVIDIA TensorRT"     OFF)
-option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN"        OFF)
+option(WITH_XPU         "Compile PaddlePaddle with BAIDU KUNLUN XPU"    OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode"    OFF)
 if (WITH_GPU  AND WITH_XPU)
    message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
@ -166,6 +166,7 @@ option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)
+option(WITH_XPU_BKCL    "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL"   OFF)
 option(WITH_CRYPTO   "Compile PaddlePaddle with crypto support"         ON)
 option(WITH_ARM   "Compile PaddlePaddle with arm support"         OFF)
 option(WITH_SW   "Compile PaddlePaddle with sw support"         OFF)
@ -213,6 +214,13 @@ if (NOT WITH_GPU AND WITH_NCCL)
        "Disable NCCL when compiling without GPU" FORCE)
 endif()

+if (NOT WITH_XPU AND WITH_XPU_BKCL)
+    MESSAGE(WARNING
+        "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
+    set(WITH_XPU_BKCL OFF CACHE STRING
+        "Disable BKCL when compiling without XPU" FORCE)
+endif()
+
 if(WITH_NCCL)
     add_definitions("-DPADDLE_WITH_NCCL")
     include(nccl)
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@ -47,4 +47,18 @@ set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
 generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")

 TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+
+if (WITH_XPU_BKCL)
+  MESSAGE(STATUS "Compile with XPU BKCL!")
+  ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL)
+
+  SET(XPU_BKCL_LIB_NAME         "libbkcl.so")
+  SET(XPU_BKCL_LIB              "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}")
+  SET(XPU_BKCL_INC_DIR          "${THIRD_PARTY_PATH}/install/xpu/include")
+  INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR})
+  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
+else(WITH_XPU_BKCL)
+  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
+endif(WITH_XPU_BKCL)
+
 ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@ -43,6 +43,19 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                        "number of local scopes is %d.",
                        places_.size(), local_scopes_.size()));
 }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+// BKCL build: forwards the places and the BKCL communicator to
+// BKCLOpHandleBase and keeps the local scopes. Each place must be paired with
+// exactly one local scope.
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places,
+                                     const platform::BKCLCommunicator *ctxs)
+    : BKCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
+  const size_t num_places = places_.size();
+  const size_t num_scopes = local_scopes_.size();
+  PADDLE_ENFORCE_EQ(num_places, num_scopes,
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        num_places, num_scopes));
+}
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
@ -98,6 +111,9 @@ void AllReduceOpHandle::AllReduceImpl(
  places.reserve(num_places);
  int64_t numel = -1;
  bool is_gpu_place = false;
+#if defined(PADDLE_WITH_XPU_BKCL)
+  bool is_xpu_place = false;
+#endif
  auto dtype = static_cast<framework::proto::VarType::Type>(0);
  for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
    auto &local_scope = local_exec_scopes_[i];
@ -117,6 +133,9 @@ void AllReduceOpHandle::AllReduceImpl(
              in_var_handles[i]->name(), numel));
      dtype = lod_tensor.type();
      is_gpu_place = platform::is_gpu_place(lod_tensor.place());
+#if defined(PADDLE_WITH_XPU_BKCL)
+      is_xpu_place = platform::is_xpu_place(lod_tensor.place());
+#endif
    }
    PADDLE_ENFORCE_EQ(
        numel, static_cast<int64_t>(lod_tensor.numel()),
@ -128,6 +147,12 @@ void AllReduceOpHandle::AllReduceImpl(
        platform::errors::PreconditionNotMet(
            "The dtype of tensors of the same variable in different local "
            "scopes should be equal."));
+#if defined(PADDLE_WITH_XPU_BKCL)
+    PADDLE_ENFORCE_EQ(is_xpu_place, platform::is_xpu_place(lod_tensor.place()),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of tensors of the same variable "
+                          "in different local scopes should be equal."));
+#endif
    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
                      platform::errors::PreconditionNotMet(
                          "The place type of tensors of the same variable "
@ -179,6 +204,25 @@ void AllReduceOpHandle::AllReduceFunc(
 #else
    PADDLE_THROW(
        platform::errors::PreconditionNotMet("Not compiled with CUDA."));
+#endif
+  } else if (is_xpu_place(places[0])) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The bkcl context should not be NULL."));
+    BKCLDataType bkcl_dtype = platform::ToBKCLDataType(dtype);
+    std::vector<std::function<void()>> all_reduce_calls;
+    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+      auto &p = places[i];
+      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
+      all_reduce_calls.emplace_back([=] {
+        BKCLAllReduce(p, buffer, buffer, numel, bkcl_dtype, BKCL_ADD);
+      });
+    }
+    BKCLAllReduceFunc(all_reduce_calls);
+#else
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with BKCL."));
 #endif
  } else {  // Special handle CPU only Operator's gradient. Like CRF
    auto &trg = *local_exec_scopes_[0]
@ -205,6 +249,27 @@ void AllReduceOpHandle::AllReduceFunc(
  VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }

+#if defined(PADDLE_WITH_XPU_BKCL)
+// Executes the queued BKCL all-reduce calls inside RunAndRecordEvent. A
+// single call is issued directly; several calls are bracketed by
+// bkcl_group_start() / bkcl_group_end().
+void AllReduceOpHandle::BKCLAllReduceFunc(
+    const std::vector<std::function<void()>> &all_reduce_calls) {
+  this->RunAndRecordEvent([&] {
+    const bool single_call = (all_reduce_calls.size() == 1UL);
+    if (single_call) {
+      all_reduce_calls.front()();
+      return;
+    }
+
+    PADDLE_ENFORCE_EQ(
+        bkcl_group_start(), BKCL_SUCCESS,
+        platform::errors::PreconditionNotMet("bkcl_group_start failed"));
+    for (size_t idx = 0; idx < all_reduce_calls.size(); ++idx) {
+      all_reduce_calls[idx]();
+    }
+    PADDLE_ENFORCE_EQ(
+        bkcl_group_end(), BKCL_SUCCESS,
+        platform::errors::PreconditionNotMet("bkcl_group_end failed"));
+  });
+}
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 void AllReduceOpHandle::NCCLAllReduceFunc(
    const std::vector<std::function<void()>> &all_reduce_calls) {
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@ -34,6 +34,9 @@ class NCCLCommunicator;
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/framework/details/bkcl_op_handle.h"
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@ -46,6 +49,12 @@ class AllReduceOpHandle : public NCCLOpHandleBase {
  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLCommunicator *ctxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+class AllReduceOpHandle : public BKCLOpHandleBase {
+ public:
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::BKCLCommunicator *ctxs);
 #else
 class AllReduceOpHandle : public OpHandleBase {
 public:
@ -65,8 +74,8 @@ class AllReduceOpHandle : public OpHandleBase {

  std::vector<Scope *> local_scopes_;

-#ifndef PADDLE_WITH_NCCL
-  // NCCLOpHandleBase already have these attributes.
+#if !(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_XPU_BKCL))
+  // NCCLOpHandleBase and BKCLOpHandleBase already have these attributes.
   // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
 #endif
@ -78,6 +87,11 @@ class AllReduceOpHandle : public OpHandleBase {
  void SyncNCCLAllReduce();
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+  void BKCLAllReduceFunc(
+      const std::vector<std::function<void()>> &all_reduce_calls);
+#endif
+
  void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
                     const std::vector<VarHandle *> &out_var_handles);

--- a/paddle/fluid/framework/details/bkcl_op_handle.h
+++ b/paddle/fluid/framework/details/bkcl_op_handle.h
@ -0,0 +1,131 @@
+//   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "xpu/bkcl.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/bkcl_helper.h"
+
+DECLARE_bool(sync_bkcl_allreduce);
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Base class for op handles that issue BKCL collective calls on XPU places.
+// It stores the participating places and the BKCL communicator, and binds each
+// place to a device context obtained from that communicator.
+class BKCLOpHandleBase : public OpHandleBase {
+ public:
+  // Binds every place in `places` to the device context of the communicator's
+  // default flat context. When `bkcl_ctxs` is null, no device context is bound
+  // here.
+  BKCLOpHandleBase(ir::Node* node, const std::vector<platform::Place>& places,
+                   const platform::BKCLCommunicator* bkcl_ctxs)
+      : OpHandleBase(node), places_(places), bkcl_ctxs_(bkcl_ctxs) {
+    if (bkcl_ctxs == nullptr) {
+      return;
+    }
+    // init device context
+    auto default_bkcl_ctxs = bkcl_ctxs_->DefaultFlatCtx();
+    for (auto& p : places_) {
+      this->SetDeviceContext(p, default_bkcl_ctxs->DevCtx(p));
+    }
+  }
+
+  virtual ~BKCLOpHandleBase() {}
+
+  // Records `run_order` and `use_hierarchical_allreduce`, then rebinds every
+  // place to the device context of the flat context selected by `run_order`.
+  // Enforces run_order >= 0 and rejects hierarchical allreduce, which is not
+  // supported on XPU.
+  void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
+    PADDLE_ENFORCE_GE(
+        run_order, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order must be >= 0, but got %d.", run_order));
+    PADDLE_ENFORCE_NE(use_hierarchical_allreduce, true,
+                      platform::errors::Unimplemented(
+                          "xpu doesn't support hierarchical_allreduce"));
+
+    run_order_ = run_order;
+    use_hierarchical_allreduce_ = use_hierarchical_allreduce;
+
+    VLOG(10) << "SetRunEnv "
+             << " run_order:" << run_order
+             << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce;
+
+    if (bkcl_ctxs_ == nullptr) {
+      return;
+    }
+
+    // Hierarchical allreduce was rejected above, so only the flat context
+    // applies.
+    auto ctxs = bkcl_ctxs_->GetFlatCtx(run_order);
+    for (auto& p : places_) {
+      this->SetDeviceContext(p, ctxs->DevCtx(p));
+    }
+  }
+
+  // Issues bkcl_all_reduce on the communicator that the flat context selected
+  // by run_order_ holds for the XPU device of `place`. Enforces that
+  // run_order_ >= 0, that the BKCL communicator is set, and that the call
+  // returns BKCL_SUCCESS.
+  void FlatBKCLAllReduce(platform::Place place, const void* sendbuff,
+                         void* recvbuff, size_t count, BKCLDataType datatype,
+                         BKCLOp op) {
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
+    // The constructor accepts a null communicator, so guard the dereference.
+    PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The bkcl context should not be NULL."));
+    auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
+    int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+    auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
+    auto comm = bkcl_ctx.comm_;
+
+    VLOG(10) << "before all reduce buffer:" << sendbuff << ", numel:" << count
+             << ", dev_id:" << dev_id << ", dtype:" << datatype
+             << ", place:" << place;
+
+    PADDLE_ENFORCE_EQ(
+        bkcl_all_reduce(comm, sendbuff, recvbuff, count, datatype, op, nullptr),
+        BKCL_SUCCESS,
+        platform::errors::PreconditionNotMet("bkcl all reduce failed"));
+  }
+
+  // Entry point for all-reduce. Only the flat variant exists for XPU, so after
+  // validating the run environment this always delegates to
+  // FlatBKCLAllReduce.
+  void BKCLAllReduce(platform::Place place, const void* sendbuff,
+                     void* recvbuff, size_t count, BKCLDataType datatype,
+                     BKCLOp op) {
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
+    PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
+                      platform::errors::Unimplemented(
+                          "xpu doesn't support hierarchical all reduce"));
+    FlatBKCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
+  }
+
+ protected:
+  std::vector<platform::Place> places_;
+  const platform::BKCLCommunicator* bkcl_ctxs_{nullptr};
+  // When multiple trainers call collective functions, they must run them in
+  // the same order, or the program will hang. So we use allreduce_deps_pass to
+  // set this run_order_.
+  int run_order_{0};
+  // Use 2d allreduce or not.
+  bool use_hierarchical_allreduce_{false};
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@ -80,7 +80,7 @@ void BroadcastOpHandle::BroadcastOneVar(
            &VariableVisitor::GetMutableTensor(out_var));
      });
    }
-  } else {
+  } else if (platform::is_gpu_place(in_tensor.place())) {
 #if defined(PADDLE_WITH_NCCL)
    VarHandle *out_handle = nullptr;
    int root_id =
@ -141,6 +141,72 @@ void BroadcastOpHandle::BroadcastOneVar(
 #else
    PADDLE_THROW(
        platform::errors::PreconditionNotMet("Not compiled with NCLL."));
+#endif
+  } else {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    VarHandle *out_handle = nullptr;
+    int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device;
+    std::vector<std::function<void()>> broadcast_calls;
+
+    int type = platform::ToBKCLDataType(in_tensor.type());
+    size_t numel = static_cast<size_t>(in_tensor.numel());
+
+    for (auto out_var_handle : out_var_handles) {
+      Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
+                              ->FindVar(out_var_handle->name());
+
+      int dst_id =
+          BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device;
+
+      auto &bkcl_ctx = bkcl_ctxs_->at(dst_id);
+
+      void *send_recv_buffer = nullptr;
+      if (root_id == dst_id) {
+        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
+        out_handle = out_var_handle;
+      } else {
+        send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
+                               .Resize(in_tensor.dims())
+                               .mutable_data(out_var_handle->place());
+      }
+
+      broadcast_calls.emplace_back([send_recv_buffer, numel, type, root_id,
+                                    &bkcl_ctx] {
+        PADDLE_ENFORCE_EQ(
+            bkcl_broadcast(bkcl_ctx.comm(), send_recv_buffer, send_recv_buffer,
+                           numel, static_cast<BKCLDataType>(type), root_id,
+                           nullptr),
+            BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_broadcast failed"));
+      });
+    }
+
+    WaitInputVarGenerated();
+    this->RunAndRecordEvent([&] {
+      {
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_start(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_start failed"));
+        for (auto &call : broadcast_calls) {
+          call();
+        }
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_end(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_end failed"));
+      }
+
+      if (!out_handle->IsTheSameVar(in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle.scope_idx())
+                           ->FindVar(out_var_handles[0]->name());
+        paddle::framework::TensorCopy(
+            in_tensor, in_var_handle.place(),
+            *(dev_ctxes_.at(in_var_handle.place())),
+            &VariableVisitor::GetMutableTensor(out_var));
+      }
+    });
+#else
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with BKCL."));
 #endif
  }
 }
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@ -34,12 +34,19 @@ class Node;
 }  // namespace ir
 }  // namespace framework
 namespace platform {
+#if defined(PADDLE_WITH_NCCL)
 struct NCCLContextMap;
+#endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+struct BKCLContextMap;
+#endif
 }  // namespace platform
 }  // namespace paddle

 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@ -63,11 +70,26 @@ struct BroadcastOpHandle : public OpHandleBase {
      }
    }
  }
-#else
+#endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::BKCLContextMap *bkcl_ctxs)
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        bkcl_ctxs_(bkcl_ctxs) {
+    if (bkcl_ctxs_) {
+      for (auto &p_ctx : bkcl_ctxs_->contexts_) {
+        this->SetDeviceContext(platform::XPUPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
+      }
+    }
+  }
+#endif
  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places)
      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif

  std::string Name() const override;

@ -86,6 +108,8 @@ struct BroadcastOpHandle : public OpHandleBase {
  std::vector<platform::Place> places_;
 #if defined(PADDLE_WITH_NCCL)
  const platform::NCCLContextMap *nccl_ctxs_;
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  const platform::BKCLContextMap *bkcl_ctxs_;
 #endif

  void InitOutputValue(const VarHandle &in_var_handle,
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@ -18,10 +18,12 @@ namespace paddle {
 namespace framework {
 namespace details {

+using DeviceType = paddle::platform::DeviceType;
+
 TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
  TestBroadcastOpHandle test_op;
  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
  test_op.InitBroadcastOp(input_scope_idx);
  test_op.TestBroadcastLodTensor(input_scope_idx);
 }
@ -29,7 +31,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
 TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
  TestBroadcastOpHandle test_op;
  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
  test_op.InitBroadcastOp(input_scope_idx);
  test_op.TestBroadcastSelectedRows(input_scope_idx);
 }
@ -38,7 +40,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
 TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
  TestBroadcastOpHandle test_op;
  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
  test_op.InitBroadcastOp(input_scope_idx);
  test_op.TestBroadcastLodTensor(input_scope_idx);
 }
@ -46,12 +48,22 @@ TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
 TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
  TestBroadcastOpHandle test_op;
  size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
  test_op.InitBroadcastOp(input_scope_idx);
  test_op.TestBroadcastSelectedRows(input_scope_idx);
 }
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+// Broadcasts a LoDTensor from scope 0 with the contexts initialised on XPU.
+TEST(BroadcastTester, TestXPUBroadcastTestLodTensor) {
+  const size_t root_scope_idx = 0;
+  TestBroadcastOpHandle tester;
+  tester.InitCtxOnDevice(p::kXPU);
+  tester.InitBroadcastOp(root_scope_idx);
+  tester.TestBroadcastLodTensor(root_scope_idx);
+}
+#endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@ -33,7 +33,7 @@ struct VarHandle;
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 // test data amount
 const f::DDim kDims = {20, 20};
@ -47,11 +47,15 @@ struct TestBroadcastOpHandle {
  std::vector<VarHandleBase*> vars_;
  std::vector<std::unique_ptr<ir::Node>> nodes_;
  std::vector<p::Place> place_list_;
-  bool use_gpu_;
+  DeviceType use_device_;
 #if defined(PADDLE_WITH_NCCL)
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+  std::unique_ptr<platform::BKCLContextMap> bkcl_ctxs_;
+#endif
+
  void WaitAll() {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
@ -60,12 +64,36 @@ struct TestBroadcastOpHandle {
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
+#endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+    if (bkcl_ctxs_) {
+      bkcl_ctxs_->WaitAll();
+    }
 #endif
  }

-  void InitCtxOnGpu(bool use_gpu) {
-    use_gpu_ = use_gpu;
-    if (use_gpu_) {
+  void InitCtxOnDevice(DeviceType use_device) {
+    use_device_ = use_device;
+    if (use_device_ == p::kXPU) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+      int count = p::GetXPUDeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-xpu Broadcast, because the XPU "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::XPUPlace(i);
+        place_list_.push_back(p);
+        ctxs_.emplace_back(new p::XPUDeviceContext(p));
+      }
+      bkcl_ctxs_.reset(new platform::BKCLContextMap(place_list_));
+#else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
+    } else if (use_device_ == p::kCUDA) {
 #if defined(PADDLE_WITH_NCCL)
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
@ -91,6 +119,9 @@ struct TestBroadcastOpHandle {
        place_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
+#if defined(PADDLE_WITH_XPU_BKCL)
+      bkcl_ctxs_.reset(nullptr);
+#endif
 #if defined(PADDLE_WITH_NCCL)
      nccl_ctxs_.reset(nullptr);
 #endif
@ -111,22 +142,25 @@ struct TestBroadcastOpHandle {

    nodes_.emplace_back(
        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
-    if (use_gpu_) {
+    if (use_device_ == p::kCUDA) {
 #if defined(PADDLE_WITH_NCCL)
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                         place_list_, nccl_ctxs_.get());
 #else
      PADDLE_THROW(
-          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
+          platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
-    } else {
-#if defined(PADDLE_WITH_NCCL)
+    } else if (use_device_ == p::kXPU) {
+#if defined(PADDLE_WITH_XPU_BKCL)
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
-                                         place_list_, nccl_ctxs_.get());
+                                         place_list_, bkcl_ctxs_.get());
 #else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
+    } else {
      op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                         place_list_);
-#endif
    }

    op_handle_->SetLocalExecScopes(scope_map);
@ -149,7 +183,7 @@ struct TestBroadcastOpHandle {
    op_handle_->AddInput(dummy_var_handle);

    for (size_t j = 0; j < place_list_.size(); ++j) {
-      if (!use_gpu_) {
+      if (use_device_ != p::kCUDA) {
        op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
      }
      nodes_.emplace_back(
@ -275,7 +309,7 @@ struct TestBroadcastOpHandle {
    f::LoD lod{{0, 10, 20}};
    auto send_vector = InitLoDTensor("input", input_scope_idx, lod);

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
@ -290,7 +324,7 @@ struct TestBroadcastOpHandle {
    int height = static_cast<int>(kDims[0] * 2);
    auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@ -313,10 +313,13 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                const std::vector<Scope *> &local_scopes,
                                const size_t &nranks,
 #if defined(PADDLE_WITH_NCCL)
-                                const bool use_cuda,
+                                DeviceType use_device,
                                platform::NCCLCommunicator *nccl_ctxs) const {
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+                                DeviceType use_device,
+                                platform::BKCLCommunicator *bkcl_ctxs) const {
 #else
-                                const bool use_cuda) const {
+                                DeviceType use_device) const {
 #endif
  VLOG(1) << "apply all passes";
  // Create a default one if not finalized by user.
@ -336,9 +339,16 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
      pass->Set<size_t>(kNRanks, new size_t(nranks));

 #if defined(PADDLE_WITH_NCCL)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::NCCLCommunicator *nctx =
+          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+      // TODO: add more checks.
+      platform::BKCLCommunicator *bkcl_ctx =
+          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
+      pass->Erase(kBKCLCtxs);
+      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
 #endif
    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
      pass->Erase(kNRanks);
@ -349,12 +359,24 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
 #if defined(PADDLE_WITH_NCCL)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::NCCLCommunicator *nctx =
+          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+      platform::BKCLCommunicator *nctx =
+          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
+      pass->Erase(kBKCLCtxs);
+      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
+      pass->Erase(kUseHierarchicalAllReduce);
+      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
+                        platform::errors::Unimplemented(
+                            "xpu doesn't support hierarchical_allreduce"));
+      pass->Set<bool>(kUseHierarchicalAllReduce,
+                      new bool(use_hierarchical_allreduce_));
 #endif
    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
      pass->Erase(kNRanks);
@ -364,35 +386,47 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                << enable_sequential_execution_;
    } else if (pass->Type() == "all_reduce_deps_pass") {
 #if defined(PADDLE_WITH_NCCL)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::NCCLCommunicator *nctx =
+          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+      platform::BKCLCommunicator *nctx =
+          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
+      pass->Erase(kBKCLCtxs);
+      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
+      pass->Erase(kUseHierarchicalAllReduce);
+      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
+                        platform::errors::Unimplemented(
+                            "xpu doesn't support hierarchical_allreduce"));
+      pass->Set<bool>(kUseHierarchicalAllReduce,
+                      new bool(use_hierarchical_allreduce_));
 #endif
      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
              << ", num_trainers:" << num_trainers_;
    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
        LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
                        "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fusion_group_pass") {
-      pass->Set<bool>("use_gpu", new bool(use_cuda));
-      if (!use_cuda) {
+      pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
+      if (use_device != p::kCUDA) {
        LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_act_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
        LOG(WARNING) << "fuse_bn_act_pass is only supported on "
                        "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_add_act_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
        LOG(WARNING) << "fuse_bn_add_act_pass is only supported on "
                        "GPU, skipped.";
        continue;
@ -401,7 +435,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
    } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
        VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
                   "GPU, skipped.";
        continue;
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@ -41,11 +41,15 @@ class NCCLCommunicator;

 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
 namespace framework {
 namespace details {
+using DeviceType = paddle::platform::DeviceType;
+namespace p = paddle::platform;

 struct BuildStrategy {
  // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
@ -147,6 +151,7 @@ struct BuildStrategy {

  // NCCL config
  size_t nccl_comm_num_{1};
+  size_t bkcl_comm_num_{1};
  // The picture is here:
  // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
  bool use_hierarchical_allreduce_{false};
@ -181,10 +186,13 @@ struct BuildStrategy {
                   const std::vector<Scope *> &local_scopes,
                   const size_t &nranks,
 #if defined(PADDLE_WITH_NCCL)
-                   const bool use_cuda,
+                   DeviceType use_device,
                   platform::NCCLCommunicator *nccl_ctxs) const;
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+                   DeviceType use_device,
+                   platform::BKCLCommunicator *bkcl_ctxs) const;
 #else
-                   const bool use_cuda) const;
+                   DeviceType use_device) const;
 #endif

  // If set true, ParallelExecutor would build the main_program into multiple
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@ -14,22 +14,19 @@

 #pragma once
 #include <cstddef>  // for size_t
+#include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace framework {
 namespace details {
-
+using DeviceType = paddle::platform::DeviceType;
+namespace p = paddle::platform;
 struct ExecutionStrategy {
  enum ExecutorType { kDefault = 0, kExperimental = 1 };
-  enum UseDevice {
-    kCPU = 0,
-    kCUDA = 1,
-    kXPU = 2,
-  };

  // num_threads indicates the size of thread pool.
  size_t num_threads_{0};
-  UseDevice use_device_{kCUDA};
+  DeviceType use_device_ = p::kCUDA;
  // Note that allow_op_delay is invalid now.
  bool allow_op_delay_{false};
  // num_iteration_per_drop_scope indicates how many
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@ -37,6 +37,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
    const platform::NCCLCommunicator *ctxs)
    : AllReduceOpHandle(node, local_scopes, places, ctxs),
      num_of_all_reduce_(num_of_all_reduce) {}
+#elif defined(PADDLE_WITH_XPU_BKCL)
+FusedAllReduceOpHandle::FusedAllReduceOpHandle(  // PADDLE_WITH_XPU_BKCL variant
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
+    const platform::BKCLCommunicator *ctxs)
+    : AllReduceOpHandle(node, local_scopes, places, ctxs),  // base receives ctxs
+      num_of_all_reduce_(num_of_all_reduce) {}  // only state stored by this class
 #else
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
    ir::Node *node, const std::vector<Scope *> &local_scopes,
@ -73,9 +80,14 @@ void FusedAllReduceOpHandle::RunImpl() {
          "handles is %d, and the number of  output variable handles is %d.",
          in_var_handles.size(), out_var_handles.size()));

-  // Note: some gradient op doesn't have CUDAKernel, so the gradients of
-  // those op are in CPUPlace, in this case, the all reduce should not be fused.
+// Note: some gradient op doesn't have CUDAKernel, so the gradients of
+// those op are in CPUPlace, in this case, the all reduce should not be fused.
+#if defined(PADDLE_WITH_XPU_BKCL)
+  // TODO(liuyuhui): XPU doesn't support fused all reduce for now
+  if (InputIsInDifferentPlace(in_var_handles) || true) {
+#else
  if (InputIsInDifferentPlace(in_var_handles)) {
+#endif
    for (size_t j = 0; j < num_of_all_reduce_; ++j) {
      std::vector<VarHandle *> dev_inputs;
      std::vector<VarHandle *> dev_outputs;
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@ -36,6 +36,8 @@ class NCCLCommunicator;
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@ -49,6 +51,13 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle {
                         const std::vector<platform::Place> &places,
                         const size_t num_of_all_reduce,
                         const platform::NCCLCommunicator *ctxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce,
+                         const platform::BKCLCommunicator *ctxs);
 #else
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
  FusedAllReduceOpHandle(ir::Node *node,
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@ -52,11 +52,18 @@ struct FusedBroadcastOpHandle : public BroadcastOpHandle {
                         const std::vector<platform::Place> &places,
                         const platform::NCCLContextMap *nccl_ctx)
      : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
-#else
-  FusedBroadcastOpHandle(ir::Node* node, const std::vector<Scope*> local_scopes,
-                         const std::vector<platform::Place>& places)
-      : BroadcastOpHandle(node, local_scopes, places) {}
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+  // BKCL (Kunlun XPU) constructor: forwards the node, scopes, places and the
+  // BKCL context map to BroadcastOpHandle. local_scopes is taken by const
+  // reference so that building the handle does not copy the vector.
+  FusedBroadcastOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const platform::BKCLContextMap *bkcl_ctx)
+      : BroadcastOpHandle(node, local_scopes, places, bkcl_ctx) {}
+#endif
+  // Constructor without a communicator context map: forwards the node,
+  // scopes and places to BroadcastOpHandle. local_scopes is taken by const
+  // reference so that building the handle does not copy the vector.
+  FusedBroadcastOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places)
+      : BroadcastOpHandle(node, local_scopes, places) {}
  std::string Name() const override;

 protected:
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@ -32,7 +32,7 @@ namespace framework {
 namespace details {

 struct VarHandle;
-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
  std::vector<std::string> out_varnames_;
@ -56,7 +56,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
    // create op handle node
    nodes_.emplace_back(
        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
-    if (use_gpu_) {
+    if (use_device_ == p::kCUDA) {
 #if defined(PADDLE_WITH_NCCL)
      op_handle_ = new FusedBroadcastOpHandle(
          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
@ -64,14 +64,17 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
      PADDLE_THROW(
          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
-    } else {
-#if defined(PADDLE_WITH_NCCL)
+    } else if (use_device_ == p::kXPU) {
+#if defined(PADDLE_WITH_XPU_BKCL)
      op_handle_ = new FusedBroadcastOpHandle(
-          nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
+          nodes_.back().get(), local_scopes_, place_list_, bkcl_ctxs_.get());
 #else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with XPU."));
+#endif
+    } else {
      op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(),
                                              local_scopes_, place_list_);
-#endif
    }

    op_handle_->SetLocalExecScopes(scope_map);
@ -109,7 +112,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
          InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
    }

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
@ -133,7 +136,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
                                             rows, height, val_scalar));
    }

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
@ -150,7 +153,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 TEST(FusedBroadcastTester, CPULodTensor) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
 }
@ -158,7 +161,7 @@ TEST(FusedBroadcastTester, CPULodTensor) {
 TEST(FusedBroadcastTester, CPUSelectedRows) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
@ -167,7 +170,7 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
 TEST(FusedBroadcastTester, GPULodTensor) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
 }
@ -175,12 +178,22 @@ TEST(FusedBroadcastTester, GPULodTensor) {
 TEST(FusedBroadcastTester, GPUSelectedRows) {
  TestFusedBroadcastOpHandle test_op;
  std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
  test_op.InitFusedBroadcastOp(input_scope_idxes);
  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+TEST(FusedBroadcastTester, XPULodTensor) {  // built only with PADDLE_WITH_XPU_BKCL
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};  // scope indexes passed as inputs
+  test_op.InitCtxOnDevice(p::kXPU);  // run this case with the XPU device type
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);  // LoDTensor check
+}
+#endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@ -27,7 +27,7 @@ struct DummyVarHandle;
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 // test data amount
 const f::DDim kDims = {20, 20};
@ -173,7 +173,7 @@ struct TestGatherOpHandle {
    out_selected_rows->mutable_value()->ShareDataWith(
        in_selected_rows->value());

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@ -55,6 +55,7 @@ constexpr char kPlaces[] = "places";
 constexpr char kGlobalScope[] = "global_scope";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kNCCLCtxs[] = "nccl_ctxs";
+constexpr char kBKCLCtxs[] = "bkcl_ctxs";
 constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce";

 // aux variables to represent dependency. Useful to resolve data hazard.
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@ -82,21 +82,74 @@ void OpHandleBase::InitCUDA() {
      }
    }
  }
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "Paddle can't use CUDA device since it's not compiled with CUDA,"
+      "Please recompile or reinstall Paddle with GPU support."));
+#endif
+}
+
+// Prepares this op handle for running on XPU.
+//
+// When the op is a multi-device transfer and has at least one device context,
+// nothing is done yet (see the TODO below). Otherwise the op must hold exactly
+// one device context: xpu_set_device() is called with that context's device
+// id, and every output that is a VarHandle is required to be on the same
+// place as the op.
+//
+// Raises InvalidArgument if the number of device contexts is not one or an
+// output lives on a different place, PreconditionNotMet if xpu_set_device()
+// fails, and PermissionDenied when built without PADDLE_WITH_XPU.
+void OpHandleBase::InitXPU() {
+#ifdef PADDLE_WITH_XPU
+  if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
+    // TODO(liuyuhui): XPU doesn't support sync events yet. Once it does, set
+    // them up here for every output that is a VarHandle.
+    return;
+  }
+
+  PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
+                    platform::errors::InvalidArgument(
+                        "%s should have only one dev_ctx.", Name()));
+  auto &place = dev_ctxes_.begin()->first;
+  int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+  PADDLE_ENFORCE_EQ(
+      xpu_set_device(dev_id), XPU_SUCCESS,
+      platform::errors::PreconditionNotMet("xpu_set_device failed"));
+  for (auto &out_var : outputs_) {
+    // Outputs that are not VarHandle instances are skipped.
+    auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+    if (out_var_handle) {
+      PADDLE_ENFORCE_EQ(
+          platform::is_same_place(place, out_var_handle->place()), true,
+          platform::errors::InvalidArgument(
+              "The place of output(%s) is not consistent with the "
+              "place of current op(%s).",
+              out_var_handle->Name(), Name()));
+    }
+  }
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "Paddle can't use XPU device since it's not compiled with XPU, "
+      "Please recompile or reinstall Paddle with XPU support."));
+#endif
+}

-void OpHandleBase::Run(ExecutionStrategy::UseDevice use_device) {
+void OpHandleBase::Run(DeviceType use_device) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_device == ExecutionStrategy::UseDevice::kCUDA &&
-      dev_ctxes_.size() > 0) {
+  if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
    InitCUDA();
  }
 #else
-  PADDLE_ENFORCE_NE(use_device, ExecutionStrategy::UseDevice::kCUDA,
-                    platform::errors::InvalidArgument(
-                        "Argument use_cuda should be false when Paddle is not "
-                        "compiled with CUDA."));
+  PADDLE_ENFORCE_NE(
+      use_device, p::kCUDA,
+      platform::errors::InvalidArgument(
+          "Argument use_device should not be kCUDA when Paddle is not "
+          "compiled with CUDA."));
+#endif
+
+  if (use_device == p::kXPU && dev_ctxes_.size() > 0) {
+#ifdef PADDLE_WITH_XPU
+    InitXPU();
+#else
+    PADDLE_ENFORCE_NE(
+        use_device, p::kXPU,
+        platform::errors::InvalidArgument(
+            "Argument use_device should not be kXPU when Paddle is not "
+            "compiled with XPU."));
 #endif
+  }

  // skip running current op, used with inplace_addto_op_pass
  if (skip_running_) {
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@ -43,7 +43,8 @@ class Node;
 }  // namespace ir

 namespace details {
-
+using DeviceType = paddle::platform::DeviceType;
+namespace p = paddle::platform;
 // Wraps ir::Node and provide helper utilities.
 // It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
@ -72,7 +73,7 @@ class OpHandleBase {

  virtual std::string Name() const = 0;

-  void Run(ExecutionStrategy::UseDevice use_device);
+  void Run(DeviceType use_device);

  virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);

@ -145,6 +146,7 @@ class OpHandleBase {
  virtual void RunImpl() = 0;

  virtual void InitCUDA();
+  virtual void InitXPU();

  ir::Node *node_;
  std::vector<VarHandleBase *> inputs_;
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@ -212,10 +212,64 @@ void ReduceOpHandle::RunImpl() {
 #else
      PADDLE_THROW(
          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
+#endif
+    } else if (paddle::platform::is_xpu_place(lod_tensors[0]->place())) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
+      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
+      VariableVisitor::GetMutableTensor(out_var).mutable_data(
+          out_var_handle->place(), pre_in.type());
+
+      auto out_p = out_var_handle->place();
+      int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device;
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < var_scopes.size(); ++i) {
+        auto &p = in_places[i];
+        auto &lod_tensor = *lod_tensors[i];
+
+        int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
+        auto &bkcl_ctx = bkcl_ctxs_->at(dev_id);
+
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
+        void *recvbuffer = nullptr;
+        if (root_id == dev_id) {
+          recvbuffer =
+              out_var->GetMutable<framework::LoDTensor>()->mutable_data(
+                  out_var_handle->place());
+        }
+
+        int type = platform::ToBKCLDataType(lod_tensor.type());
+        size_t numel = static_cast<size_t>(lod_tensor.numel());
+        // Queue the reduce for this device; the queued calls are executed
+        // later between bkcl_group_start() and bkcl_group_end(). The error
+        // text names bkcl_reduce, which is the call actually being checked.
+        all_reduce_calls.emplace_back([buffer, recvbuffer, type, numel, root_id,
+                                       &bkcl_ctx] {
+          PADDLE_ENFORCE_EQ(bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer,
+                                        numel, static_cast<BKCLDataType>(type),
+                                        BKCL_ADD, root_id, nullptr),
+                            BKCL_SUCCESS, platform::errors::Unavailable(
+                                              "bkcl_reduce failed"));
+        });
+      }
+
+      WaitInputVarGenerated();
+      this->RunAndRecordEvent([&] {
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_start(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_start failed"));
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_end(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_end failed"));
+      });
+#else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with XPU."));
 #endif
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
-          "The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
+          "The place of tensor should be CPUPlace, CUDAPlace or XPUPlace, but "
+          "got %s.",
          lod_tensors[0]->place()));
    }
  }
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@ -41,6 +41,8 @@ struct NCCLContextMap;
 }  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@ -93,6 +95,22 @@ struct ReduceOpHandle : public OpHandleBase {
      }
    }
  }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  const platform::BKCLContextMap *bkcl_ctxs_;
+  // BKCL build: keeps the scopes, places and BKCL context map, then registers
+  // one device context per entry of the map when a map was supplied.
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places,
+                 const platform::BKCLContextMap *bkcl_ctxs)
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        bkcl_ctxs_(bkcl_ctxs) {
+    if (bkcl_ctxs_ == nullptr) {
+      return;  // no context map given, nothing to register
+    }
+    for (auto &dev_and_ctx : bkcl_ctxs_->contexts_) {
+      const auto &bkcl_entry = dev_and_ctx.second;
+      SetDeviceContext(platform::XPUPlace(dev_and_ctx.first),
+                       bkcl_entry.ctx_.get());
+    }
+  }
 #else
  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places)
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@ -25,7 +25,7 @@ namespace details {
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 // test data amount
 const f::DDim kDims = {20, 20};
@ -198,7 +198,7 @@ struct TestReduceOpHandle {
    out_selected_rows->mutable_value()->ShareDataWith(
        in_selected_rows->value());

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
@ -263,7 +263,7 @@ struct TestReduceOpHandle {

    out_lodtensor->ShareDataWith(in_lodtensor);

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
    op_handle_->Run(use_device);

    WaitAll();
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@ -30,6 +30,7 @@ DECLARE_double(eager_delete_tensor_gb);

 namespace paddle {
 namespace framework {
+namespace p = paddle::platform;

 static std::vector<platform::Place> CreatePlaces(size_t num, bool use_cuda) {
  std::vector<platform::Place> result;
@ -88,8 +89,7 @@ class ReferenceCountPassTestHelper {
    FLAGS_eager_delete_tensor_gb = -1;

    details::ExecutionStrategy exec_strategy;
-    exec_strategy.use_device_ =
-        use_cuda ? (ExecutionStrategy::kCUDA) : (ExecutionStrategy::kCPU);
+    exec_strategy.use_device_ = use_cuda ? p::kCUDA : p::kCPU;

    executor_.reset(new ParallelExecutor(CreatePlaces(1, use_cuda), {}, "",
                                         &scope_, {}, exec_strategy,
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@ -41,6 +41,9 @@ class FuseAllReduceOpPass : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
    auto *multi_nccl_ctxs =
        &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    auto *multi_bkcl_ctxs =
+        &Get<platform::BKCLCommunicator>(details::kBKCLCtxs);
 #endif

    ir::Graph &result = *graph;
@ -92,6 +95,9 @@ class FuseAllReduceOpPass : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
      InsertFusedAllReduce(places, local_scopes, group_size,
                           group_all_reduce_ops, multi_nccl_ctxs, &result);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, multi_bkcl_ctxs, &result);
 #else
      InsertFusedAllReduce(places, local_scopes, group_size,
                           group_all_reduce_ops, &result);
@ -154,6 +160,8 @@ class FuseAllReduceOpPass : public ir::Pass {
                            const std::vector<ir::Node *> &all_reduce_ops,
 #if defined(PADDLE_WITH_NCCL)
                            const platform::NCCLCommunicator *multi_nccl_ctxs,
+#elif defined(PADDLE_WITH_XPU_BKCL)
+                            const platform::BKCLCommunicator *multi_bkcl_ctxs,
 #endif
                            ir::Graph *result) const {
    std::vector<details::VarHandleBase *> inputs;
@ -182,6 +190,9 @@ class FuseAllReduceOpPass : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                           local_scopes, multi_nccl_ctxs, result);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, multi_bkcl_ctxs, result);
 #else
    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                           local_scopes, result);
@ -197,12 +208,18 @@ class FuseAllReduceOpPass : public ir::Pass {
      const std::vector<Scope *> &local_scopes,
 #if defined(PADDLE_WITH_NCCL)
      const platform::NCCLCommunicator *multi_nccl_ctxs,
+#elif defined(PADDLE_WITH_XPU_BKCL)
+      const platform::BKCLCommunicator *multi_bkcl_ctxs,
 #endif
      ir::Graph *result) const {
 #if defined(PADDLE_WITH_NCCL)
    auto *op_handle = new details::FusedAllReduceOpHandle(
        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
        local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    auto *op_handle = new details::FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce, multi_bkcl_ctxs);
 #else
    auto *op_handle = new details::FusedAllReduceOpHandle(
        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
@ -221,6 +238,10 @@ class FuseAllReduceOpPass : public ir::Pass {
    if (!multi_nccl_ctxs) {
      SetCommunicationContext(places, op_handle);
    }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    if (!multi_bkcl_ctxs) {
+      SetCommunicationContext(places, op_handle);
+    }
 #else
    SetCommunicationContext(places, op_handle);
 #endif
--- a/Show More
+++ b/Show More