From b365c3c3778840190916c306b135cc7920b1d06f Mon Sep 17 00:00:00 2001 From: wandongdong Date: Tue, 24 Nov 2020 04:51:20 -0800 Subject: [PATCH] implement opencl cache bin --- mindspore/lite/schema/gpu_cache.fbs | 38 ++++++++ mindspore/lite/src/CMakeLists.txt | 1 + .../src/runtime/opencl/opencl_allocator.cc | 7 +- .../lite/src/runtime/opencl/opencl_runtime.cc | 91 ++++++++++++++++++- .../lite/src/runtime/opencl/opencl_runtime.h | 12 ++- 5 files changed, 143 insertions(+), 6 deletions(-) create mode 100644 mindspore/lite/schema/gpu_cache.fbs diff --git a/mindspore/lite/schema/gpu_cache.fbs b/mindspore/lite/schema/gpu_cache.fbs new file mode 100644 index 0000000000..ad9c1c613a --- /dev/null +++ b/mindspore/lite/schema/gpu_cache.fbs @@ -0,0 +1,38 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +namespace mindspore.schema; + +table TuneParam { + local: [int]; + block: [int]; + shape: [int]; + opPara: [int]; +} + +table KernelBin { + name: string; + tune: TuneParam; + data: [ubyte]; +} + +table GpuCache { + name: string; + version: string; + allBins: [KernelBin]; +} + +root_type GpuCache; diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index 36fd19391b..b5c1d5a65a 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -17,6 +17,7 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64) endif () set(LITE_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/common/file_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index ce98df79e1..8beaa6054a 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -67,7 +67,7 @@ void *OpenCLAllocator::MinimumFit(size_t size, const std::vector &img_si void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) { cl_int ret = CL_SUCCESS; MS_ASSERT(buffer); - *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), flags, size, data, &ret); + *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast(flags), size, data, &ret); if (*buffer == nullptr) { MS_LOG(ERROR) << "Create OpenCL buffer failed! 
(ERROR CODE: " << ret << ")"; return nullptr; @@ -90,6 +90,9 @@ void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::B void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector &img_size, void *data, size_t flags, bool is_map, cl::Buffer **buffer, cl::Image2D **image) { cl_int ret = CL_SUCCESS; + MS_ASSERT(buffer); + MS_ASSERT(image); + MS_ASSERT(img_size.size() == 3); cl::ImageFormat image_format(CL_RGBA, img_size[2]); if (data == nullptr) { *image = new (std::nothrow) @@ -332,7 +335,7 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, } MemBuf *mem_buf = it->second; MS_ASSERT(mem_buf); - void *new_host_ptr; + void *new_host_ptr{nullptr}; if (mem_buf->img_size.empty()) { cl::Buffer *buffer = static_cast(mem_buf->device_ptr_); MS_ASSERT(buffer); diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 881005522c..409ec1968c 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -17,12 +17,14 @@ #include "src/runtime/opencl/opencl_runtime.h" #include #include +#include #ifdef SHARING_MEM_WITH_OPENGL #include #endif #include "include/errorcode.h" #include "src/runtime/kernel/opencl/utils.h" #include "src/runtime/opencl/opencl_allocator.h" +#include "src/common/file_utils.h" #ifdef PROGRAM_WITH_IL #include "src/backend/opencl/cl/program.inc" #endif @@ -254,6 +256,9 @@ int OpenCLRuntime::Init() { std::string flag = ""; binary_program_ = CreateProgramFromIL(g_program_binary, flag); #endif + if (enable_cache_) { + InitGpuCache(); + } init_done_ = true; MS_LOG(INFO) << "OpenCLRuntime init done!"; @@ -261,6 +266,10 @@ int OpenCLRuntime::Init() { } int OpenCLRuntime::Uninit() { + if (enable_cache_) { + StoreCache(); + } + binary_map_.clear(); program_map_.clear(); delete allocator_; delete default_command_queue_; @@ -374,6 +383,12 @@ int 
OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
       MS_LOG(ERROR) << program_name << " build failed!";
       return RET_ERROR;
     }
+    if (enable_cache_) {
+      // Guard bin[0] at runtime: an empty result from GetProgramBinaries must never be indexed.
+      auto bin = GetProgramBinaries(program);
+      need_write_ = need_write_ || !bin.empty();
+      if (!bin.empty()) binary_map_.emplace(build_program_key, bin[0]);
+    }
     program_map_.emplace(build_program_key, program);
   }
 
@@ -673,9 +688,8 @@ cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector &binary,
 }
 
 // build program with binary
-cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary,
-                                                   const std::string &flag) {
-  cl::Program program = cl::Program(*context_, {*device_}, binary);
+cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag) {
+  cl::Program program = cl::Program(*context_, {*device_}, {binary});
   bool status = BuildProgram(default_build_opts_, program);
   if (!status) {
     MS_LOG(ERROR) << "Build program with binary failed!";
@@ -691,4 +705,75 @@ std::vector> OpenCLRuntime::GetProgramBinaries(const
   }
   return binary;
 }
+// Reads the cache file at cache_path_ and hands its contents to LoadCache.
+// A failed load is logged and otherwise ignored; this function reports nothing to its caller.
+void OpenCLRuntime::InitGpuCache() {
+  size_t len = 0;
+  char *buf = lite::ReadFile(cache_path_.c_str(), &len);
+  if (LoadCache(buf) == RET_OK) {
+    MS_LOG(INFO) << "Init opencl cache success";
+  } else {
+    MS_LOG(ERROR) << "Load opencl cache fail";
+  }
+  delete[] buf;  // NOTE(review): assumes ReadFile returns a new[] buffer - confirm
+}
+// Rebuilds one cl::Program per KernelBin in the GpuCache flatbuffer at buf and registers it,
+// together with its raw binary, under the stored name. Entries without a name or data are skipped.
+// Returns RET_OK on success, RET_ERROR when buf, the root table, allBins or an entry is null.
+int OpenCLRuntime::LoadCache(const void *buf) {
+  if (buf == nullptr) {
+    return RET_ERROR;
+  }
+  auto gpu_cache = schema::GetGpuCache(buf);
+  auto *bins = (gpu_cache == nullptr) ? nullptr : gpu_cache->allBins();
+  if (bins == nullptr) {
+    return RET_ERROR;
+  }
+  for (flatbuffers::uoffset_t i = 0; i < bins->size(); ++i) {
+    auto *kernel_bin = bins->template GetAs<schema::KernelBin>(i);
+    if (kernel_bin == nullptr) {
+      MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
+      return RET_ERROR;
+    }
+    auto *pname = kernel_bin->name();
+    auto *pdata = kernel_bin->data();
+    if (pname == nullptr || pdata == nullptr || pdata->size() == 0) {
+      continue;
+    }
+    const std::string key = pname->str();
+    std::vector<unsigned char> bin(pdata->begin(), pdata->end());
+    auto program = CreateProgramFromBinary(bin, key);
+    program_map_.emplace(key, program);
+    binary_map_.emplace(key, bin);
+    MS_LOG(INFO) << "LoadCache " << key << " success, size=" << pdata->size();
+  }
+  return RET_OK;
+}
+// Serializes every entry of binary_map_ into a GpuCache flatbuffer and writes it to cache_path_.
+// Does nothing unless need_write_ is set; all four TuneParam vectors are written empty.
+void OpenCLRuntime::StoreCache() {
+  if (!need_write_) {
+    return;
+  }
+  flatbuffers::FlatBufferBuilder fbb;
+  std::vector<flatbuffers::Offset<schema::KernelBin>> vec_kernel_bin;
+  for (const auto &iv : binary_map_) {
+    auto name = fbb.CreateString(iv.first);
+    auto data = fbb.CreateVector(iv.second);
+    std::vector<int32_t> shape;
+    auto tune = schema::CreateTuneParam(fbb, fbb.CreateVector(shape), fbb.CreateVector(shape),
+                                        fbb.CreateVector(shape), fbb.CreateVector(shape));
+    auto kbin = schema::CreateKernelBin(fbb, name, tune, data);
+    vec_kernel_bin.emplace_back(kbin);
+    MS_LOG(INFO) << "StoreCache " << iv.first << " success, size=" << iv.second.size();
+  }
+  auto data = fbb.CreateVector<flatbuffers::Offset<schema::KernelBin>>(vec_kernel_bin);
+  auto name = fbb.CreateString("OpenCLCache");
+  auto version = fbb.CreateString(version_);
+  auto gpu_cache = schema::CreateGpuCache(fbb, name, version, data);
+  fbb.Finish(gpu_cache);
+  uint8_t *buf = fbb.GetBufferPointer();
+  lite::WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb.GetSize());  // cast target assumed - confirm
+  MS_LOG(INFO) << "store opencl cache ok, size=" << fbb.GetSize();
+}
 }  // namespace mindspore::lite::opencl
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
index e82f09072e..ffa70254ac 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -27,6 +27,7 @@ j* you may not use this file except in compliance with the License.
#include "src/common/log_adapter.h" #include "src/runtime/opencl/opencl_wrapper.h" #include "src/runtime/opencl/opencl_allocator.h" +#include "schema/gpu_cache_generated.h" namespace mindspore::lite::opencl { @@ -107,7 +108,7 @@ class OpenCLRuntime { } cl::Program CreateProgramFromIL(const std::vector &binary, const std::string &flag); - cl::Program CreateProgramFromBinary(const std::vector> &binary, const std::string &flag); + cl::Program CreateProgramFromBinary(const std::vector &binary, const std::string &flag); cl::Kernel GetKernelFromBinary(const std::string &kernel_name); std::vector> GetProgramBinaries(const cl::Program &program); bool LoadSource(const std::string &program_name, const std::string &source); @@ -139,6 +140,10 @@ class OpenCLRuntime { */ int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); + void InitGpuCache(); + int LoadCache(const void *buf); + void StoreCache(); + private: static OpenCLRuntime *GetInstance(); static void DeleteInstance(); @@ -171,6 +176,11 @@ class OpenCLRuntime { cl_uint image_pitch_align_{0}; std::vector max_work_item_sizes_; void *handle_{nullptr}; + std::map> binary_map_; + std::string cache_path_{"/data/local/tmp/opencl_cache"}; + const std::string version_{"V0.1"}; + bool need_write_{false}; + bool enable_cache_{false}; }; class OpenCLRuntimeWrapper {