[feature] support npu allocator (#30840)

Leo Chen committed 4 years ago via GitHub
parent ebef6601d5
commit 81138239db

@ -32,11 +32,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some ACL-related functionality on x86.
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.

@ -82,6 +82,10 @@ if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
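For orientation, the macro defined here is what the C++ hunks below test with #ifdef. A minimal standalone sketch (not Paddle code) of how such a build-time switch gates device-specific paths:

// Standalone illustration: PADDLE_WITH_ASCEND_CL is set by CMake above when
// WITH_ASCEND_CL=ON, and downstream code branches on it at compile time.
#include <cstdio>

void ReportBackend() {
#ifdef PADDLE_WITH_ASCEND_CL
  std::printf("built with Ascend CL (NPU) support\n");
#else
  std::printf("CPU-only / non-NPU build\n");
#endif
}

int main() { ReportBackend(); }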

@ -21,38 +21,58 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
elseif(WITH_ASCEND_CL)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl)
endif()

@ -274,10 +274,10 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
list(APPEND third_party_deps extern_ascend)
endif (WITH_ASCEND)
endif ()
if (WITH_PSCORE)
include(external/snappy)

@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@ -431,6 +431,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported on place (%s) ",
npu));
//return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
@ -633,6 +641,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
//TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();

@ -116,6 +116,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const platform::NPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
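TensorAddFunctor is applied over the Place variant by a static visitor, so the new NPUPlace overload is what gets hit when a gradient lives on an NPU. A standalone analogue of that dispatch (std::variant instead of boost::variant; the type names here are illustrative, not Paddle's):

// Shows how a per-place functor is dispatched; the NPU branch mirrors the
// diff in that it simply reports "not supported" for now.
#include <cstdio>
#include <stdexcept>
#include <variant>

struct CPUPlace {};
struct NPUPlace { int device = 0; };
using Place = std::variant<CPUPlace, NPUPlace>;

struct TensorAddVisitor {
  void operator()(const CPUPlace&) const { std::printf("add on CPU\n"); }
  void operator()(const NPUPlace&) const {
    throw std::runtime_error("gradient accumulation on NPU not supported yet");
  }
};

int main() {
  Place place = CPUPlace{};
  std::visit(TensorAddVisitor{}, place);  // prints "add on CPU"
}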
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(

@ -19,6 +19,10 @@ if (WITH_GPU)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()
if (WITH_ASCEND_CL)
cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)

@ -42,6 +42,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"
DEFINE_int64(
gpu_allocator_retry_time, 10000,
@ -76,6 +77,11 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
#endif
break;
}
@ -195,6 +201,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
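With the NPU allocator registered in the facade, allocation requests on an NPUPlace are routed to the NaiveBestFitAllocator created above. A hedged usage sketch (requires a build with WITH_ASCEND_CL=ON and an NPU device; the header path and the AllocShared helper name are assumptions based on the facade's public entry points of this era, not shown in this diff):

#include "paddle/fluid/memory/malloc.h"      // assumed location of AllocShared
#include "paddle/fluid/platform/place.h"

void AllocateOnNpu() {
  paddle::platform::NPUPlace npu(0);  // device 0
  // Routed through AllocatorFacade to the allocator registered for NPUPlace.
  auto holder = paddle::memory::AllocShared(npu, 1 << 20);  // 1 MB
  // The holder returns the memory to the allocator when it goes out of scope.
}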
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}

@ -23,6 +23,7 @@
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
@ -112,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
@ -216,6 +218,136 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
#endif
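NPUBuddyAllocatorList follows the same pattern as its GPU counterpart: a process-wide singleton that lazily builds one BuddyAllocator per selected device, with a std::once_flag per slot so each device is initialized exactly once even under concurrent first use. A compact standalone sketch of that pattern (generic illustration, not Paddle code; "Allocator" is a stand-in type):

#include <cstdio>
#include <memory>
#include <mutex>
#include <vector>

struct Allocator {
  explicit Allocator(int dev) { std::printf("init device %d\n", dev); }
};

class PerDeviceList {
 public:
  static PerDeviceList* Instance() {
    static auto* instance = new PerDeviceList({0, 1});  // selected device ids
    return instance;
  }
  Allocator* Get(size_t pos) {
    // First caller for this slot constructs the allocator; later callers reuse it.
    std::call_once(*flags_[pos], [this, pos] {
      allocators_[pos].reset(new Allocator(devices_[pos]));
    });
    return allocators_[pos].get();
  }

 private:
  explicit PerDeviceList(std::vector<int> devices) : devices_(std::move(devices)) {
    allocators_.resize(devices_.size());
    for (size_t i = 0; i < devices_.size(); ++i) flags_.emplace_back(new std::once_flag());
  }
  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> flags_;
  std::vector<std::unique_ptr<Allocator>> allocators_;
};

int main() {
  PerDeviceList::Instance()->Get(0);  // constructs the allocator for device 0
  PerDeviceList::Instance()->Get(0);  // reuses it
}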
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
aclrtMemset(ptr, size, 0xEF, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
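Taken together, these specializations give NPUPlace the same legacy call sequence the GPU path already has. A hedged sketch of that sequence, written as if from code in the same namespace as the definitions above (the exact exporting header is not shown in this diff, so include/namespace details are assumptions):

// Assumes WITH_ASCEND_CL=ON and NPU device 0 is visible; placed alongside the
// specializations above so Alloc/Used/Free/Release resolve to the NPU versions.
void NpuLegacyRoundTrip() {
  paddle::platform::NPUPlace npu(0);
  void* p = Alloc<paddle::platform::NPUPlace>(npu, 1 << 20);  // 1 MB from the NPU buddy allocator
  size_t in_use = Used<paddle::platform::NPUPlace>(npu);      // bytes currently tracked
  Free<paddle::platform::NPUPlace>(npu, p, 1 << 20);
  Release<paddle::platform::NPUPlace>(npu);                   // hand idle chunks back to the device
  (void)in_use;
}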
// For CUDA
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:

@ -14,6 +14,8 @@
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include <unistd.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
@ -69,6 +71,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle

@ -0,0 +1,73 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
place_.device);
delete allocation;
}
Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(avail), place_.device, err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
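The new NPUAllocator plugs into the common Allocator interface, so it is driven the same way the CUDA allocator is. A hedged usage sketch (requires WITH_ASCEND_CL=ON and an NPU device; mirrors how the test below exercises NaiveBestFitAllocator):

#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include "paddle/fluid/platform/place.h"

void RawNpuAllocation() {
  paddle::memory::allocation::NPUAllocator allocator(paddle::platform::NPUPlace(0));
  // Allocate() returns an owning handle backed by RecordedNPUMalloc; the memory
  // is returned via FreeImpl/RecordedNPUFree when the handle is released.
  auto allocation = allocator.Allocate(1 << 20);  // 1 MB
}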

@ -0,0 +1,41 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle

@ -2,11 +2,15 @@ include(ExternalProject)
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
set(system_allocator_DEPS gflags cpu_info place)
if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else(${WITH_GPU})
elseif(${WITH_ASCEND_CL})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif(${WITH_GPU})
endif()
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)

@ -21,6 +21,10 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
@ -235,6 +239,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for the NPU for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size; we store the re-allocation size when
// the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
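So the first NPU allocation grabs NPUInitAllocSize() in one shot (derived from the gpu_memory GFlags reused for NPU), and later refills grow the pool by NPUReallocSize(). A standalone sketch of that sizing decision (illustrative constants, not Paddle code):

#include <algorithm>
#include <cstddef>
#include <cstdio>

size_t NextChunkBytes(size_t total_in_pool, size_t request_bytes,
                      size_t init_alloc_bytes, size_t realloc_bytes) {
  if (total_in_pool == 0) {
    // First allocation: grab the configured initial pool in one shot.
    return std::max(init_alloc_bytes, request_bytes);
  }
  // Subsequent refills: grow by the configured re-allocation size.
  return std::max(realloc_bytes, request_bytes);
}

int main() {
  std::printf("%zu\n", NextChunkBytes(0, 1 << 20, 100 << 20, 50 << 20));          // first refill: ~100 MB
  std::printf("%zu\n", NextChunkBytes(100 << 20, 1 << 20, 100 << 20, 50 << 20));  // later refill: ~50 MB
}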
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);

@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {

@ -19,13 +19,15 @@ limitations under the License. */
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <fstream>
#include <string>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include <fstream>
#include <string>
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
@ -324,6 +326,33 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator, 300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle

@ -35,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
@ -239,6 +240,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
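A hedged sketch of the SystemAllocator contract the new detail::NPUAllocator fulfils: Alloc reports an index (always 0 here, since there is no fallback pool) and Free must be given the original size so the npu_alloc_size_ bookkeeping stays consistent (assumes WITH_ASCEND_CL=ON and an NPU device):

#include "paddle/fluid/memory/detail/system_allocator.h"

void RawChunk() {
  paddle::memory::detail::NPUAllocator allocator(/*npu_id=*/0);
  size_t index = 0;
  void* p = allocator.Alloc(&index, 1 << 20);  // RecordedNPUMalloc under the hood
  allocator.Free(p, 1 << 20, index);           // size must match the original request
}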

@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle

@ -81,3 +81,11 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1<<20);
TestAllocator(&a, 1);
}
#endif

@ -148,6 +148,13 @@ void set_constant_with_place<platform::XPUPlace>(
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,

@ -71,6 +71,10 @@ if(WITH_ASCEND)
cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)

@ -228,6 +228,33 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually there is no need to create a context explicitly;
// ACL creates a default context, which contains 1 default stream
// and 1 sync stream, after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext* NPUDeviceContext::context() const {
return const_cast<aclrtContext*>(&context_);
}
#endif
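A hedged usage sketch of the new context: fetch the NPUDeviceContext from the global pool and synchronize the device (pool API as used elsewhere in fluid; assumes an NPU build and that NPUPlace(0) has been registered with the pool):

#include "paddle/fluid/platform/device_context.h"

void SyncNpu() {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* ctx = static_cast<paddle::platform::NPUDeviceContext*>(
      pool.Get(paddle::platform::NPUPlace(0)));
  ctx->Wait();  // aclrtSynchronizeDevice under the hood
}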
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {

@ -59,6 +59,11 @@ struct GpuDevice;
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#include "paddle/fluid/platform/npu_info.h"
#endif
namespace paddle {
namespace platform {
@ -77,11 +82,13 @@ enum DeviceType {
CPU = 0,
CUDA = 1,
XPU = 2,
NPU = 3,
};
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
class DeviceContext {
public:
@ -153,6 +160,46 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext : public DeviceContext {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
aclrtContext* context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
#ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return hccl context. */
HCCLContext_t hccl_context() const { return hccl_context_; }
/*! \brief Set hccl context. */
void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_;
#endif
// Needs to be the same as other DeviceContexts,
// even though eigen_device_ is not used on NPU.
// NOTE(zhiqiu): why is this needed?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
#endif
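The DefaultDeviceContextType trait lets templated code map a Place type to its context type at compile time. A hedged sketch of that pattern (assumes a Paddle build; the GetByPlace helper below is written out for illustration rather than taken from this diff):

#include "paddle/fluid/platform/device_context.h"

template <typename PlaceT>
typename paddle::platform::DefaultDeviceContextType<PlaceT>::TYPE* GetByPlace(
    const PlaceT& place) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  // NPUPlace resolves to NPUDeviceContext via the specialization above.
  return static_cast<
      typename paddle::platform::DefaultDeviceContextType<PlaceT>::TYPE*>(
      pool.Get(place));
}

// Usage: auto* npu_ctx = GetByPlace(paddle::platform::NPUPlace(0));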
#ifdef PADDLE_WITH_CUDA
class CudnnWorkspaceHandle;

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
@ -214,3 +215,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif

Some files were not shown because too many files have changed in this diff.
