[feature] support npu allocator (#30840)

Leo Chen committed 4 years ago via GitHub
parent ebef6601d5
commit 81138239db

@ -32,11 +32,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some ACL-related functionality on x86.
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.

@ -82,6 +82,10 @@ if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
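For orientation, the macro defined here is what the C++ hunks below test with #ifdef. A minimal standalone sketch (not Paddle code) of how such a build-time switch gates device-specific paths:

// Standalone illustration: PADDLE_WITH_ASCEND_CL is set by CMake above when
// WITH_ASCEND_CL=ON, and downstream code branches on it at compile time.
#include <cstdio>

void ReportBackend() {
#ifdef PADDLE_WITH_ASCEND_CL
  std::printf("built with Ascend CL (NPU) support\n");
#else
  std::printf("CPU-only / non-NPU build\n");
#endif
}

int main() { ReportBackend(); }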

@ -21,38 +21,58 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
elseif(WITH_ASCEND_CL)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl)
endif()

@ -274,10 +274,10 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
list(APPEND third_party_deps extern_ascend)
endif (WITH_ASCEND)
endif ()
if (WITH_PSCORE)
include(external/snappy)

@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;

@ -431,6 +431,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported on place (%s) ",
npu));
//return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
@ -633,6 +641,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
//TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();

@ -116,6 +116,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const platform::NPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
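TensorAddFunctor is applied over the Place variant by a static visitor, so the new NPUPlace overload is what gets hit when a gradient lives on an NPU. A standalone analogue of that dispatch (std::variant instead of boost::variant; the type names here are illustrative, not Paddle's):

// Shows how a per-place functor is dispatched; the NPU branch mirrors the
// diff in that it simply reports "not supported" for now.
#include <cstdio>
#include <stdexcept>
#include <variant>

struct CPUPlace {};
struct NPUPlace { int device = 0; };
using Place = std::variant<CPUPlace, NPUPlace>;

struct TensorAddVisitor {
  void operator()(const CPUPlace&) const { std::printf("add on CPU\n"); }
  void operator()(const NPUPlace&) const {
    throw std::runtime_error("gradient accumulation on NPU not supported yet");
  }
};

int main() {
  Place place = CPUPlace{};
  std::visit(TensorAddVisitor{}, place);  // prints "add on CPU"
}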
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(

@ -19,6 +19,10 @@ if (WITH_GPU)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()
if (WITH_ASCEND_CL)
cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)

@ -42,6 +42,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"
DEFINE_int64(
gpu_allocator_retry_time, 10000,
@ -76,6 +77,11 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
#endif
break;
}
@ -195,6 +201,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
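With the NPU allocator registered in the facade, allocation requests on an NPUPlace are routed to the NaiveBestFitAllocator created above. A hedged usage sketch (requires a build with WITH_ASCEND_CL=ON and an NPU device; the header path and the AllocShared helper name are assumptions based on the facade's public entry points of this era, not shown in this diff):

#include "paddle/fluid/memory/malloc.h"      // assumed location of AllocShared
#include "paddle/fluid/platform/place.h"

void AllocateOnNpu() {
  paddle::platform::NPUPlace npu(0);  // device 0
  // Routed through AllocatorFacade to the allocator registered for NPUPlace.
  auto holder = paddle::memory::AllocShared(npu, 1 << 20);  // 1 MB
  // The holder returns the memory to the allocator when it goes out of scope.
}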
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}

@ -23,6 +23,7 @@
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
@ -112,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
@ -216,6 +218,136 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
#endif
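NPUBuddyAllocatorList follows the same pattern as its GPU counterpart: a process-wide singleton that lazily builds one BuddyAllocator per selected device, with a std::once_flag per slot so each device is initialized exactly once even under concurrent first use. A compact standalone sketch of that pattern (generic illustration, not Paddle code; "Allocator" is a stand-in type):

#include <cstdio>
#include <memory>
#include <mutex>
#include <vector>

struct Allocator {
  explicit Allocator(int dev) { std::printf("init device %d\n", dev); }
};

class PerDeviceList {
 public:
  static PerDeviceList* Instance() {
    static auto* instance = new PerDeviceList({0, 1});  // selected device ids
    return instance;
  }
  Allocator* Get(size_t pos) {
    // First caller for this slot constructs the allocator; later callers reuse it.
    std::call_once(*flags_[pos], [this, pos] {
      allocators_[pos].reset(new Allocator(devices_[pos]));
    });
    return allocators_[pos].get();
  }

 private:
  explicit PerDeviceList(std::vector<int> devices) : devices_(std::move(devices)) {
    allocators_.resize(devices_.size());
    for (size_t i = 0; i < devices_.size(); ++i) flags_.emplace_back(new std::once_flag());
  }
  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> flags_;
  std::vector<std::unique_ptr<Allocator>> allocators_;
};

int main() {
  PerDeviceList::Instance()->Get(0);  // constructs the allocator for device 0
  PerDeviceList::Instance()->Get(0);  // reuses it
}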
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
aclrtMemset(ptr, size, 0xEF, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
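Taken together, these specializations give NPUPlace the same legacy call sequence the GPU path already has. A hedged sketch of that sequence, written as if from code in the same namespace as the definitions above (the exact exporting header is not shown in this diff, so include/namespace details are assumptions):

// Assumes WITH_ASCEND_CL=ON and NPU device 0 is visible; placed alongside the
// specializations above so Alloc/Used/Free/Release resolve to the NPU versions.
void NpuLegacyRoundTrip() {
  paddle::platform::NPUPlace npu(0);
  void* p = Alloc<paddle::platform::NPUPlace>(npu, 1 << 20);  // 1 MB from the NPU buddy allocator
  size_t in_use = Used<paddle::platform::NPUPlace>(npu);      // bytes currently tracked
  Free<paddle::platform::NPUPlace>(npu, p, 1 << 20);
  Release<paddle::platform::NPUPlace>(npu);                   // hand idle chunks back to the device
  (void)in_use;
}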
// For CUDA
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:

@ -14,6 +14,8 @@
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include <unistd.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
@ -69,6 +71,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle

@ -0,0 +1,73 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
place_.device);
delete allocation;
}
Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(avail), place_.device, err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
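The new NPUAllocator plugs into the common Allocator interface, so it is driven the same way the CUDA allocator is. A hedged usage sketch (requires WITH_ASCEND_CL=ON and an NPU device; mirrors how the test below exercises NaiveBestFitAllocator):

#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include "paddle/fluid/platform/place.h"

void RawNpuAllocation() {
  paddle::memory::allocation::NPUAllocator allocator(paddle::platform::NPUPlace(0));
  // Allocate() returns an owning handle backed by RecordedNPUMalloc; the memory
  // is returned via FreeImpl/RecordedNPUFree when the handle is released.
  auto allocation = allocator.Allocate(1 << 20);  // 1 MB
}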

@ -0,0 +1,41 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle

@ -2,11 +2,15 @@ include(ExternalProject)
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
set(system_allocator_DEPS gflags cpu_info place)
if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else(${WITH_GPU})
elseif(${WITH_ASCEND_CL})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif(${WITH_GPU})
endif()
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)

@ -21,6 +21,10 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
@ -235,6 +239,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for the NPU for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size; we store the re-allocation size when
// the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
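So the first NPU allocation grabs NPUInitAllocSize() in one shot (derived from the gpu_memory GFlags reused for NPU), and later refills grow the pool by NPUReallocSize(). A standalone sketch of that sizing decision (illustrative constants, not Paddle code):

#include <algorithm>
#include <cstddef>
#include <cstdio>

size_t NextChunkBytes(size_t total_in_pool, size_t request_bytes,
                      size_t init_alloc_bytes, size_t realloc_bytes) {
  if (total_in_pool == 0) {
    // First allocation: grab the configured initial pool in one shot.
    return std::max(init_alloc_bytes, request_bytes);
  }
  // Subsequent refills: grow by the configured re-allocation size.
  return std::max(realloc_bytes, request_bytes);
}

int main() {
  std::printf("%zu\n", NextChunkBytes(0, 1 << 20, 100 << 20, 50 << 20));          // first refill: ~100 MB
  std::printf("%zu\n", NextChunkBytes(100 << 20, 1 << 20, 100 << 20, 50 << 20));  // later refill: ~50 MB
}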
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);

@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {

@ -19,13 +19,15 @@ limitations under the License. */
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <fstream>
#include <string>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include <fstream>
#include <string>
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
@ -324,6 +326,33 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator, 300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle

@ -35,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
@ -239,6 +240,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
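A hedged sketch of the SystemAllocator contract the new detail::NPUAllocator fulfils: Alloc reports an index (always 0 here, since there is no fallback pool) and Free must be given the original size so the npu_alloc_size_ bookkeeping stays consistent (assumes WITH_ASCEND_CL=ON and an NPU device):

#include "paddle/fluid/memory/detail/system_allocator.h"

void RawChunk() {
  paddle::memory::detail::NPUAllocator allocator(/*npu_id=*/0);
  size_t index = 0;
  void* p = allocator.Alloc(&index, 1 << 20);  // RecordedNPUMalloc under the hood
  allocator.Free(p, 1 << 20, index);           // size must match the original request
}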

@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle

@ -81,3 +81,11 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1<<20);
TestAllocator(&a, 1);
}
#endif

@ -148,6 +148,13 @@ void set_constant_with_place<platform::XPUPlace>(
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,

@ -71,6 +71,10 @@ if(WITH_ASCEND)
cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)

@ -228,6 +228,33 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually there is no need to create a context explicitly;
// ACL creates a default context, which contains 1 default stream
// and 1 sync stream, after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext* NPUDeviceContext::context() const {
return const_cast<aclrtContext*>(&context_);
}
#endif
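A hedged usage sketch of the new context: fetch the NPUDeviceContext from the global pool and synchronize the device (pool API as used elsewhere in fluid; assumes an NPU build and that NPUPlace(0) has been registered with the pool):

#include "paddle/fluid/platform/device_context.h"

void SyncNpu() {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* ctx = static_cast<paddle::platform::NPUDeviceContext*>(
      pool.Get(paddle::platform::NPUPlace(0)));
  ctx->Wait();  // aclrtSynchronizeDevice under the hood
}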
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {

@ -59,6 +59,11 @@ struct GpuDevice;
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#include "paddle/fluid/platform/npu_info.h"
#endif
namespace paddle {
namespace platform {
@ -77,11 +82,13 @@ enum DeviceType {
CPU = 0,
CUDA = 1,
XPU = 2,
NPU = 3,
};
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
class DeviceContext {
public:
@ -153,6 +160,46 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext : public DeviceContext {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
aclrtContext* context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
#ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return hccl context. */
HCCLContext_t hccl_context() const { return hccl_context_; }
/*! \brief Set hccl context. */
void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_;
#endif
// Needs to be the same as other DeviceContexts,
// even though eigen_device_ is not used on NPU.
// NOTE(zhiqiu): why is this needed?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
#endif
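The DefaultDeviceContextType trait lets templated code map a Place type to its context type at compile time. A hedged sketch of that pattern (assumes a Paddle build; the GetByPlace helper below is written out for illustration rather than taken from this diff):

#include "paddle/fluid/platform/device_context.h"

template <typename PlaceT>
typename paddle::platform::DefaultDeviceContextType<PlaceT>::TYPE* GetByPlace(
    const PlaceT& place) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  // NPUPlace resolves to NPUDeviceContext via the specialization above.
  return static_cast<
      typename paddle::platform::DefaultDeviceContextType<PlaceT>::TYPE*>(
      pool.Get(place));
}

// Usage: auto* npu_ctx = GetByPlace(paddle::platform::NPUPlace(0));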
#ifdef PADDLE_WITH_CUDA
class CudnnWorkspaceHandle;

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
@ -214,3 +215,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif

Some files were not shown because too many files have changed in this diff.
