support Baidu Kunlun AI Accelerator (#25959)

* support Baidu AI Accelerator
  * test=kunlun

* minor
  * test=kunlun

* support xpu op in separate file
  * test=kunlun

* update XPU error message and remove duplicated code
  * test=kunlun

* minor
  * test=kunlun

* minor
  * test=kunlun
Author: QingshuChen
parent 17bcaef411
commit 138ecf24aa

@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
# TODO(Shibo Tao): remove find_package(CUDA) completely.
find_package(CUDA QUIET)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "

@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
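The -DPADDLE_WITH_XPU definition added here is the switch the rest of this patch keys off: every device-specific addition below is fenced with a preprocessor guard so a CPU/GPU build never sees an XPU symbol. A minimal sketch of the pattern, assuming only the macro itself (the helper name is hypothetical):

    // Sketch of the guard pattern enabled by add_definitions(-DPADDLE_WITH_XPU).
    // CompiledWithXPU() is a hypothetical helper, not part of this patch.
    #include <cstdio>

    bool CompiledWithXPU() {
    #ifdef PADDLE_WITH_XPU
      return true;   // present only when cmake ran with -DWITH_XPU=ON
    #else
      return false;  // CPU/GPU builds take this branch
    #endif
    }

    int main() { std::printf("XPU build: %d\n", CompiledWithXPU()); }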

@@ -0,0 +1,54 @@
if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_DIR}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_DIR}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})

@@ -8,6 +8,7 @@ function(op_library TARGET)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(xpu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(cudnn_cu_srcs)
set(CUDNN_FILE)
@@ -60,6 +61,12 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
endif()
endif()
if(WITH_XPU)
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
@@ -76,6 +83,8 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op\\.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
@@ -109,7 +118,7 @@ function(op_library TARGET)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
@@ -150,10 +159,11 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@@ -179,6 +189,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
@@ -228,6 +241,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
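Taken together, these build rules mean an operator gains an XPU kernel simply by adding a file named xpu/<name>_xpu_op.cc next to the existing <name>_op.cc: op_library discovers it via the _op -> _xpu_op substitution, links it into the same target, and emits USE_OP_DEVICE_KERNEL(<name>, XPU) into the pybind file, while register_operators strips the _xpu suffix so the op is not registered twice. A hedged skeleton of such a file, using a hypothetical relu op with an empty kernel body:

    // Hypothetical xpu/relu_xpu_op.cc -- the layout op_library expects.
    #ifdef PADDLE_WITH_XPU
    #include "paddle/fluid/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    template <typename DeviceContext, typename T>
    class ReluXPUKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& ctx) const override {
        // call into the XPU runtime here
      }
    };

    }  // namespace operators
    }  // namespace paddle

    namespace ops = paddle::operators;
    REGISTER_OP_XPU_KERNEL(
        relu, ops::ReluXPUKernel<paddle::platform::XPUDeviceContext, float>);
    #endif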

@@ -250,6 +250,11 @@ if(WITH_GPU)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
endif(WITH_GPU)
if(WITH_XPU)
include(external/xpu) # download, build, install xpu
list(APPEND third_party_deps extern_xpu)
endif(WITH_XPU)
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)

@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
return ctx;
}
inline ::DLContext operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector> gc;
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
@@ -453,13 +453,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
gc.reset(new DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
}
} else if (platform::is_cpu_place(place_)) {
#else
PADDLE_THROW(
platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif
} else if (platform::is_cpu_place(place_)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
#ifdef PADDLE_WITH_CUDA
}
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
}
}
for (int64_t i = start_op_index; i < end_op_index; ++i) {
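With a garbage collector available for XPUPlace, an Executor constructed on an XPU device follows the same run path as CPU and GPU. A minimal usage sketch, assuming this version's Executor API and a program built elsewhere:

    // Hedged sketch: constructing an Executor on the first XPU device.
    #include "paddle/fluid/framework/executor.h"
    #include "paddle/fluid/framework/scope.h"
    #include "paddle/fluid/platform/place.h"

    int main() {
      paddle::platform::XPUPlace place(0);  // XPU device 0
      paddle::framework::Executor exec(place);
      paddle::framework::Scope scope;
      // A ProgramDesc `program` would be built or deserialized here, then:
      // exec.Run(program, &scope, /*block_id=*/0);
      return 0;
    }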

@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#ifdef PADDLE_WITH_XPU
XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
callback();
}
#endif
#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)

@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
void ClearCallback(const std::function<void()> &callback) override;
};
#ifdef PADDLE_WITH_XPU
class XPUGarbageCollector : public GarbageCollector {
public:
XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
#ifdef PADDLE_WITH_CUDA
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
public:

@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
// CPU, CUDA, PLAIN are same library type.
} else if (s == std::string("CPU")) {
return LibraryType::kPlain;
} else if (s == std::string("XPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {

@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to
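REGISTER_OP_XPU_KERNEL is a thin wrapper over the generic REGISTER_OP_KERNEL machinery, so registering an XPU kernel reads exactly like the CPU case; the mul registration later in this patch is the canonical use:

    // Expands to REGISTER_OP_KERNEL(mul, XPU, ::paddle::platform::XPUPlace, ...):
    REGISTER_OP_XPU_KERNEL(
        mul, ops::MulXPUKernel<paddle::platform::XPUDeviceContext, float>);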

@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented(
"Cannot run operator on place %s", place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif
}
@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_,

@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
platform::errors::Unavailable(
"XPU is not supported in ParallelExecutor"));
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
member_->use_cuda_ = exec_strategy.use_cuda_;

File diff suppressed because it is too large.

@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
void operator()(const platform::XPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =

@@ -100,6 +100,13 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(

@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
else ()
set(AllocatorFacadeDeps)
endif()

@@ -39,6 +39,9 @@
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
DEFINE_int64(
gpu_allocator_retry_time, 10000,
@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
switch (strategy) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kAutoGrowth: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kThreadLocal: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
private:
void InitSystemAllocators() {
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUDA
system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>();
@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_XPU
void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
}
places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::XPUPlace(dev_id));
}
#endif
for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);

@@ -29,6 +29,9 @@
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = nullptr;
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
VLOG(10) << " pointer=" << p;
return p;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
return nullptr;
#endif
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_free(p);
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
printf("Used func return 0 for XPUPlace\n");
return 0;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:
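The save/subtract-64/switch/restore dance above is repeated verbatim in every XPU entry point (Alloc, Free, and the three Copy specializations below). A hedged refactor sketch of the same idea as an RAII guard; the class is hypothetical and error checking is elided:

    // Hypothetical RAII guard capturing the pattern used by Alloc/Free/Copy.
    // Relies only on xpu_current_device()/xpu_set_device() from the XPU runtime.
    class XPUDeviceGuard {
     public:
      explicit XPUDeviceGuard(int target) {
        xpu_current_device(&prev_);
        if (prev_ >= 64) prev_ -= 64;  // simulator devices report id + 64
        xpu_set_device(target);
      }
      ~XPUDeviceGuard() { xpu_set_device(prev_); }

     private:
      int prev_ = -1;
    };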

@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle {
namespace memory {
@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
std::memcpy(dst, src, num);
}
#ifdef PADDLE_WITH_XPU
template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != dst_place.device) {
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != dst_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != src_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device || dev_id != dst_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
free(tmp);
} else {
int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
#endif
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
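Callers reach these specializations through the templated memory::Copy, which dispatches on the place types. A hedged round trip through XPU 0, taking device memory straight from xpu_malloc as Alloc<XPUPlace> does above; error checking elided:

    #include <vector>
    #include "paddle/fluid/memory/memcpy.h"
    #include "paddle/fluid/platform/place.h"
    #include "paddle/fluid/platform/xpu_header.h"

    int main() {
      paddle::platform::CPUPlace cpu;
      paddle::platform::XPUPlace xpu(0);

      std::vector<float> host(1024, 1.0f), back(1024, 0.0f);
      size_t bytes = host.size() * sizeof(float);

      void* dev = nullptr;
      xpu_malloc(&dev, bytes);                                  // as in Alloc<XPUPlace>
      paddle::memory::Copy(xpu, dev, cpu, host.data(), bytes);  // XPU_HOST_TO_DEVICE
      paddle::memory::Copy(cpu, back.data(), xpu, dev, bytes);  // XPU_DEVICE_TO_HOST
      xpu_free(dev);
      return 0;
    }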

@@ -83,7 +83,6 @@ Return an identity tensor whose shape is [num_rows, num_columns].
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using float16 = paddle::platform::float16;
REGISTER_OPERATOR(
eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference,
@@ -93,4 +92,4 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel<CPU, float>,
ops::EyeKernel<CPU, double>,
ops::EyeKernel<CPU, int64_t>, ops::EyeKernel<CPU, int>,
ops::EyeKernel<CPU, float16>);
ops::EyeKernel<CPU, paddle::platform::float16>);

@@ -73,6 +73,13 @@ struct TensorSetConstantCPU {
float value_;
};
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,

@@ -0,0 +1,183 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/mul_op.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
template <typename DeviceContext, typename T>
class MulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
const Tensor* y = context.Input<Tensor>("Y");
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
z->mutable_data<T>(context.GetPlace());
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
bool trans_a = false;
bool trans_b = false;
int m = x_matrix.dims()[0];
int k = x_matrix.dims()[1];
int k1 = y_matrix.dims()[0];
int n = y_matrix.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument(
            "Shape mismatch in mul_op: X's width must be equal to Y's height."));
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = x_matrix.data<T>();
const T* data_b = y_matrix.data<T>();
T* data_c = z->data<T>();
auto& dev_ctx = context.template device_context<DeviceContext>();
int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k,
alpha, data_a, data_b, beta, data_c);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
};
template <typename DeviceContext, typename T>
class MulGradXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
Tensor dout_mat;
dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
// blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
bool trans_a = false;
bool trans_b = true;
int m = dout_mat.dims()[0];
int k = dout_mat.dims()[1];
int n = y_matrix.dims()[0];
int k1 = y_matrix.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument(
              "Shape mismatch in mul_grad op when computing dX."));
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = dout->data<T>();
const T* data_b = y_matrix.data<T>();
T* data_c = dx_matrix.data<T>();
int ret =
xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
data_a, lda, data_b, ldb, beta, data_c, ldc);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
// blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
bool trans_a = true;
bool trans_b = false;
int k = x_matrix.dims()[0];
int m = x_matrix.dims()[1];
int k1 = dout_mat.dims()[0];
int n = dout_mat.dims()[1];
PADDLE_ENFORCE_EQ(
k, k1, platform::errors::InvalidArgument(
              "Shape mismatch in mul_grad op when computing dY."));
int lda = (!trans_a) ? k : m;
int ldb = (!trans_b) ? n : k;
int ldc = n;
T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0);
const T* data_a = x_matrix.data<T>();
const T* data_b = dout->data<T>();
T* data_c = dy_matrix.data<T>();
int ret =
xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha,
data_a, lda, data_b, ldb, beta, data_c, ldc);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
ret));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
mul, ops::MulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
mul_grad, ops::MulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
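The shape bookkeeping in MulGradXPUKernel follows the standard matmul gradients. With dout of shape M x N, x_matrix M x K and y_matrix K x N, the two gemm_int16 calls compute (in LaTeX):

    \nabla X = \nabla Z \, Y^{\top} \in \mathbb{R}^{M \times K},
    \qquad
    \nabla Y = X^{\top} \nabla Z \in \mathbb{R}^{K \times N}

which matches the in-code comments (dx = dout * y', dy = x' * dout) and explains why the (m, n, k) arguments are swapped between the two calls.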

@@ -4,6 +4,12 @@ if(WITH_GPU)
proto_library(cuda_error_proto SRCS cuda_error.proto)
endif(WITH_GPU)
if(WITH_XPU)
set(XPU_CTX_DEPS xpulib)
else()
set(XPU_CTX_DEPS)
endif(WITH_XPU)
if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -50,6 +56,10 @@ nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cu
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
if(WITH_XPU)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)
@@ -84,7 +94,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper)
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce)

@@ -61,7 +61,8 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU option or check that your train process hold the "
"with WITH_GPU or WITH_XPU option or check that your train process "
"hold the "
"correct gpu_id if you use Executor.",
place));
}
@@ -115,6 +116,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<XPUDeviceContext, XPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
}
}
@@ -134,6 +143,49 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
context_ = xpu::create_context();
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
void XPUDeviceContext::Wait() const {
int ret = xpu_set_device(place_.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_wait();
}
Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {
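Once the pool can emplace an XPUDeviceContext, kernels obtain it the same way as any other context. A hedged lookup sketch:

    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/fluid/platform/place.h"

    int main() {
      auto& pool = paddle::platform::DeviceContextPool::Instance();
      auto* ctx = static_cast<paddle::platform::XPUDeviceContext*>(
          pool.Get(paddle::platform::XPUPlace(0)));
      ctx->Wait();  // binds the device and calls xpu_wait(), as defined above
      return 0;
    }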

Some files were not shown because too many files have changed in this diff.
