!8291 [MD][GPU] minddata use pin memory can improve efficiency

From: @xiefangqi
Reviewed-by: @pandoublefeng,@jonyguo
Signed-off-by: @jonyguo
pull/8221/head
mindspore-ci-bot 4 years ago committed by Gitee
commit c88ed51c99

@ -49,7 +49,7 @@ class DatasetInitKernel : public GpuKernel {
std::vector<size_t> workspace_size_list_; std::vector<size_t> workspace_size_list_;
// The capacity of buffer Q. // The capacity of buffer Q.
size_t buffer_q_capacity_{1}; size_t buffer_q_capacity_{2};
}; };
MS_REG_GPU_KERNEL(InitDataSetQueue, DatasetInitKernel) MS_REG_GPU_KERNEL(InitDataSetQueue, DatasetInitKernel)

@ -42,16 +42,30 @@ if (ENABLE_CACHE)
storage_container.cc) storage_container.cc)
add_executable(cache_server cache_main.cc) add_executable(cache_server cache_main.cc)
target_link_libraries(cache_server if (ENABLE_GPU)
engine-cache-server target_link_libraries(cache_server
_c_dataengine engine-cache-server
_c_mindrecord _c_dataengine
mindspore::protobuf _c_mindrecord
mindspore::grpc++ mindspore::protobuf
mindspore_gvar mindspore::grpc++
${PYTHON_LIBRARIES} mindspore_gvar
${SECUREC_LIBRARY} ${CUDNN_LIBRARY_PATH}
pthread) ${PYTHON_LIBRARIES}
${SECUREC_LIBRARY}
pthread)
else()
target_link_libraries(cache_server
engine-cache-server
_c_dataengine
_c_mindrecord
mindspore::protobuf
mindspore::grpc++
mindspore_gvar
${PYTHON_LIBRARIES}
${SECUREC_LIBRARY}
pthread)
endif()
if (USE_GLOG) if (USE_GLOG)
target_link_libraries(cache_server mindspore::glog) target_link_libraries(cache_server mindspore::glog)

@ -91,7 +91,7 @@ Status DeviceQueueOp::operator()() {
#endif #endif
} else if (device_type_ == DeviceType::GPU) { } else if (device_type_ == DeviceType::GPU) {
#ifdef ENABLE_GPUQUE #ifdef ENABLE_GPUQUE
RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(&pool_)); RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(&pool_, -1, 1024, false, true));
RETURN_IF_NOT_OK(SendDataToGPU()); RETURN_IF_NOT_OK(SendDataToGPU());
#endif #endif
} else if (device_type_ == DeviceType::CPU) { } else if (device_type_ == DeviceType::CPU) {

@ -235,14 +235,43 @@ std::ostream &operator<<(std::ostream &os, const ArenaImpl &s) {
Status Arena::Init() { Status Arena::Init() {
try { try {
int64_t sz = size_in_MB_ * 1048576L; int64_t sz = size_in_MB_ * 1048576L;
#ifdef ENABLE_GPUQUE
if (is_cuda_malloc_) {
auto ret = cudaHostAlloc(&ptr_, sz, cudaHostAllocDefault);
if (ret != cudaSuccess) {
MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
return Status(StatusCode::kOutOfMemory);
}
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
} else {
RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false));
impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
}
#else
RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false)); RETURN_IF_NOT_OK(DeMalloc(sz, &ptr_, false));
impl_ = std::make_unique<ArenaImpl>(ptr_, sz); impl_ = std::make_unique<ArenaImpl>(ptr_, sz);
#endif
} catch (std::bad_alloc &e) { } catch (std::bad_alloc &e) {
return Status(StatusCode::kOutOfMemory); return Status(StatusCode::kOutOfMemory);
} }
return Status::OK(); return Status::OK();
} }
#ifdef ENABLE_GPUQUE
Arena::Arena(size_t val_in_MB, bool is_cuda_malloc)
: ptr_(nullptr), size_in_MB_(val_in_MB), is_cuda_malloc_(is_cuda_malloc) {}
Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB, bool is_cuda_malloc) {
RETURN_UNEXPECTED_IF_NULL(p_ba);
auto ba = new (std::nothrow) Arena(val_in_MB, is_cuda_malloc);
if (ba == nullptr) {
return Status(StatusCode::kOutOfMemory);
}
(*p_ba).reset(ba);
RETURN_IF_NOT_OK(ba->Init());
return Status::OK();
}
#else
Arena::Arena(size_t val_in_MB) : ptr_(nullptr), size_in_MB_(val_in_MB) {} Arena::Arena(size_t val_in_MB) : ptr_(nullptr), size_in_MB_(val_in_MB) {}
Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB) { Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB) {
@ -255,5 +284,6 @@ Status Arena::CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB) {
RETURN_IF_NOT_OK(ba->Init()); RETURN_IF_NOT_OK(ba->Init());
return Status::OK(); return Status::OK();
} }
#endif
} // namespace dataset } // namespace dataset
} // namespace mindspore } // namespace mindspore

@ -22,6 +22,9 @@
#include "minddata/dataset/util/allocator.h" #include "minddata/dataset/util/allocator.h"
#include "minddata/dataset/util/memory_pool.h" #include "minddata/dataset/util/memory_pool.h"
#include "minddata/dataset/util/treap.h" #include "minddata/dataset/util/treap.h"
#ifdef ENABLE_GPUQUE
#include <cuda_runtime_api.h>
#endif
#define ARENA_LOG_BLK_SZ (6u) #define ARENA_LOG_BLK_SZ (6u)
#define ARENA_BLK_SZ (static_cast<uint16_t>(1u << ARENA_LOG_BLK_SZ)) #define ARENA_BLK_SZ (static_cast<uint16_t>(1u << ARENA_LOG_BLK_SZ))
@ -105,10 +108,18 @@ class Arena : public MemoryPool {
Arena(const Arena &) = delete; Arena(const Arena &) = delete;
Arena &operator=(const Arena &) = delete; Arena &operator=(const Arena &) = delete;
~Arena() override { ~Arena() override {
#ifdef ENABLE_GPUQUE
if (is_cuda_malloc_) {
if (ptr_) {
(void)cudaFreeHost(ptr_);
}
}
#else
if (ptr_ != nullptr) { if (ptr_ != nullptr) {
free(ptr_); free(ptr_);
} }
ptr_ = nullptr; ptr_ = nullptr;
#endif
} }
/// As a derived class of MemoryPool, we have to implement the following. /// As a derived class of MemoryPool, we have to implement the following.
@ -140,16 +151,27 @@ class Arena : public MemoryPool {
return os; return os;
} }
#ifdef ENABLE_GPUQUE
/// The only method to create an arena.
static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096, bool is_cuda_malloc = false);
#else
/// The only method to create an arena. /// The only method to create an arena.
static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096); static Status CreateArena(std::shared_ptr<Arena> *p_ba, size_t val_in_MB = 4096);
#endif
protected: protected:
mutable std::mutex mux_; mutable std::mutex mux_;
std::unique_ptr<ArenaImpl> impl_; std::unique_ptr<ArenaImpl> impl_;
void *ptr_; void *ptr_;
size_t size_in_MB_; size_t size_in_MB_;
#ifdef ENABLE_GPUQUE
bool is_cuda_malloc_;
explicit Arena(size_t val_in_MB = 4096, bool is_cuda_malloc = false);
#else
explicit Arena(size_t val_in_MB = 4096); explicit Arena(size_t val_in_MB = 4096);
#endif
Status Init(); Status Init();
}; };

@ -27,7 +27,11 @@ namespace dataset {
Status CircularPool::AddOneArena() { Status CircularPool::AddOneArena() {
Status rc; Status rc;
std::shared_ptr<Arena> b; std::shared_ptr<Arena> b;
#ifdef ENABLE_GPUQUE
RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_, is_cuda_malloc_));
#else
RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_)); RETURN_IF_NOT_OK(Arena::CreateArena(&b, arena_size_));
#endif
tail_ = b.get(); tail_ = b.get();
cur_size_in_mb_ += arena_size_; cur_size_in_mb_ += arena_size_;
mem_segments_.push_back(std::move(b)); mem_segments_.push_back(std::move(b));
@ -194,12 +198,43 @@ int CircularPool::PercentFree() const {
} }
} }
#ifdef ENABLE_GPUQUE
CircularPool::CircularPool(int max_size_in_gb, int arena_size, bool is_cuda_malloc)
: unlimited_(max_size_in_gb <= 0),
max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024),
arena_size_(arena_size),
is_cuda_malloc_(is_cuda_malloc),
cur_size_in_mb_(0) {}
#else
CircularPool::CircularPool(int max_size_in_gb, int arena_size) CircularPool::CircularPool(int max_size_in_gb, int arena_size)
: unlimited_(max_size_in_gb <= 0), : unlimited_(max_size_in_gb <= 0),
max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024), max_size_in_mb_(unlimited_ ? std::numeric_limits<int32_t>::max() : max_size_in_gb * 1024),
arena_size_(arena_size), arena_size_(arena_size),
cur_size_in_mb_(0) {} cur_size_in_mb_(0) {}
#endif
#ifdef ENABLE_GPUQUE
Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size,
bool createOneArena, bool is_cuda_malloc) {
Status rc;
if (out_pool == nullptr) {
RETURN_STATUS_UNEXPECTED("pPool is null");
}
auto pool = new (std::nothrow) CircularPool(max_size_in_gb, arena_size, is_cuda_malloc);
if (pool == nullptr) {
return Status(StatusCode::kOutOfMemory);
}
if (createOneArena) {
rc = pool->AddOneArena();
}
if (rc.IsOk()) {
(*out_pool).reset(pool);
} else {
delete pool;
}
return rc;
}
#else
Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size, Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb, int arena_size,
bool createOneArena) { bool createOneArena) {
Status rc; Status rc;
@ -220,6 +255,7 @@ Status CircularPool::CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, i
} }
return rc; return rc;
} }
#endif
CircularPool::~CircularPool() = default; CircularPool::~CircularPool() = default;
} // namespace dataset } // namespace dataset

@ -85,8 +85,13 @@ class CircularPool : public MemoryPool {
return os; return os;
} }
#ifdef ENABLE_GPUQUE
static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1,
int arena_size = 4096, bool create_one_arena = false, bool is_cuda_malloc = false);
#else
static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1, static Status CreateCircularPool(std::shared_ptr<MemoryPool> *out_pool, int max_size_in_gb = -1,
int arena_size = 4096, bool create_one_arena = false); int arena_size = 4096, bool create_one_arena = false);
#endif
private: private:
ListOfArenas mem_segments_; ListOfArenas mem_segments_;
@ -96,9 +101,16 @@ class CircularPool : public MemoryPool {
int arena_size_; int arena_size_;
int cur_size_in_mb_; int cur_size_in_mb_;
RWLock rw_lock_; RWLock rw_lock_;
#ifdef ENABLE_GPU
bool is_cuda_malloc_;
// We can take negative or 0 as input which means unlimited.
CircularPool(int max_size_in_gb, int arena_size, bool is_cuda_malloc);
#else
// We can take negative or 0 as input which means unlimited. // We can take negative or 0 as input which means unlimited.
CircularPool(int max_size_in_gb, int arena_size); CircularPool(int max_size_in_gb, int arena_size);
#endif
Status AddOneArena(); Status AddOneArena();
}; };

Loading…
Cancel
Save