From e153665a48442be9f006bbcbf3ef8d96152bf956 Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Tue, 12 Jan 2021 19:56:59 +0800 Subject: [PATCH] optimize opencl load/store program cache --- mindspore/lite/CMakeLists.txt | 6 +- mindspore/lite/schema/gpu_cache.fbs | 7 +- mindspore/lite/src/lite_session.cc | 10 +- mindspore/lite/src/lite_session.h | 2 +- .../runtime/kernel/opencl/kernel/argminmax.cc | 3 +- .../runtime/kernel/opencl/kernel/batchnorm.cc | 3 +- .../src/runtime/kernel/opencl/kernel/cast.cc | 3 +- .../kernel/opencl/kernel/fusion_eltwise.cc | 2 - .../kernel/opencl/kernel/layer_norm.cc | 5 +- .../runtime/kernel/opencl/kernel/one_hot.cc | 3 +- .../lite/src/runtime/opencl/opencl_runtime.cc | 244 +++++++++--------- .../lite/src/runtime/opencl/opencl_runtime.h | 33 +-- .../lite/src/runtime/opencl/opencl_wrapper.cc | 27 +- .../lite/src/runtime/opencl/opencl_wrapper.h | 14 +- 14 files changed, 179 insertions(+), 183 deletions(-) diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt index 3e5bae9214..f5b9774f35 100644 --- a/mindspore/lite/CMakeLists.txt +++ b/mindspore/lite/CMakeLists.txt @@ -156,13 +156,15 @@ if (SUPPORT_GPU) gene_opencl(${CMAKE_CURRENT_SOURCE_DIR}) add_definitions(-DUSE_OPENCL_WRAPPER) add_definitions(-DMS_OPENCL_PROFILE=false) + add_definitions(-DCL_TARGET_OPENCL_VERSION=200) add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=200) + add_definitions(-DCL_HPP_MINIMUM_OPENCL_VERSION=110) add_compile_definitions(SUPPORT_GPU) if (OFFLINE_COMPILE) add_compile_definitions(PROGRAM_WITH_IL) endif () - include_directories(${CMAKE_CURRENT_SOURCE_DIR}/build/_deps/opencl-headers-src/) - include_directories(${CMAKE_CURRENT_SOURCE_DIR}/build/_deps/opencl-clhpp-src/include) + include_directories(${CMAKE_BINARY_DIR}/_deps/opencl-headers-src/) + include_directories(${CMAKE_BINARY_DIR}/_deps/opencl-clhpp-src/include) endif () if (WIN32) diff --git a/mindspore/lite/schema/gpu_cache.fbs b/mindspore/lite/schema/gpu_cache.fbs index ad9c1c613a..990eb085b2 100644 --- a/mindspore/lite/schema/gpu_cache.fbs +++ b/mindspore/lite/schema/gpu_cache.fbs @@ -23,8 +23,9 @@ table TuneParam { opPara: [int]; } -table KernelBin { - name: string; +table ProgramBinary { + program_name: string; + build_option: string; tune: TuneParam; data: [ubyte]; } @@ -32,7 +33,7 @@ table KernelBin { table GpuCache { name: string; version: string; - allBins: [KernelBin]; + allBins: [ProgramBinary]; } root_type GpuCache; diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc index 1504b14939..2ef9e880d5 100644 --- a/mindspore/lite/src/lite_session.cc +++ b/mindspore/lite/src/lite_session.cc @@ -546,6 +546,9 @@ LiteSession::~LiteSession() { #if SUPPORT_NPU mindspore::lite::NPUPassManager::GetInstance()->Clear(); mindspore::lite::NPUManager::GetInstance()->Reset(); +#endif +#if SUPPORT_GPU && !SUPPORT_TRAIN + delete opencl_runtime_wrapper_; #endif delete (model_); is_running_.store(false); @@ -676,8 +679,13 @@ int LiteSession::Resize(const std::vector &inputs int LiteSession::InitGPURuntime() { #if SUPPORT_GPU && !SUPPORT_TRAIN if (this->context_->IsGpuEnabled()) { + opencl_runtime_wrapper_ = new (std::nothrow) opencl::OpenCLRuntimeWrapper(); + if (opencl_runtime_wrapper_ == nullptr) { + MS_LOG(ERROR) << "create OpenCLRuntimeWrapper failed"; + return RET_ERROR; + } auto gpu_device_info = this->context_->GetGpuInfo(); - auto opencl_runtime = ocl_runtime_wrap_.GetInstance(); + auto opencl_runtime = opencl_runtime_wrapper_->GetInstance(); 
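Note: the lite_session changes above and below replace the by-value OpenCLRuntimeWrapper member with a heap-allocated object that InitGPURuntime() creates and ~LiteSession() releases. A condensed sketch of that ownership, assembled from the hunks in this patch (not additional code):

    // lite_session.h
    opencl::OpenCLRuntimeWrapper *opencl_runtime_wrapper_{nullptr};

    // lite_session.cc, InitGPURuntime()
    opencl_runtime_wrapper_ = new (std::nothrow) opencl::OpenCLRuntimeWrapper();
    if (opencl_runtime_wrapper_ == nullptr) {
      MS_LOG(ERROR) << "create OpenCLRuntimeWrapper failed";
      return RET_ERROR;
    }

    // lite_session.cc, ~LiteSession()
    delete opencl_runtime_wrapper_;  // still nullptr if the GPU path was never initialized, so always safe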
opencl_runtime->SetFp16Enable(gpu_device_info.enable_float16_); if (opencl_runtime->Init() != RET_OK) { this->context_->device_list_ = {{DT_CPU, {gpu_device_info.enable_float16_, MID_CPU}}}; diff --git a/mindspore/lite/src/lite_session.h b/mindspore/lite/src/lite_session.h index 7cbbfadfc6..d1963d019f 100644 --- a/mindspore/lite/src/lite_session.h +++ b/mindspore/lite/src/lite_session.h @@ -128,7 +128,7 @@ class LiteSession : public session::LiteSession { Model *model_ = nullptr; std::atomic is_running_ = false; #if SUPPORT_GPU && !SUPPORT_TRAIN - opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; + opencl::OpenCLRuntimeWrapper *opencl_runtime_wrapper_{nullptr}; #endif }; } // namespace lite diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc index e5ff86048a..a6867fe12b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc @@ -139,11 +139,10 @@ int ArgMinMaxOpenCLKernel::Prepare() { kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else - std::set build_options; std::string source = argminmax_source; std::string program_name = "argminmax"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); #endif InitWeights(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index 6da224c228..268e1c9829 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -165,11 +165,10 @@ int BatchNormOpenCLKernel::Initweight() { int BatchNormOpenCLKernel::Prepare() { use_fp16_enable_ = ocl_runtime_->GetFp16Enable(); std::string kernel_name = "Batch_normalization_NHWC4"; - std::set build_options; std::string source = batchnorm_source; std::string program_name = "Batch_normalization"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); MS_LOG(DEBUG) << kernel_name << " Init Done!"; int ret = Initweight(); if (ret) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc index cac95b2a61..669c9f0c1c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc @@ -94,11 +94,10 @@ int CastOpenCLKernel::Prepare() { std::string kernel_name = "Cast"; GetKernelName(&kernel_name, param); kernel_name += "_NHWC4"; - std::set build_options; std::string source = cast_source; std::string program_name = "cast"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc index e3ec7c5f50..6c7c713516 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc @@ -145,7 +145,6 @@ int FusionEltwiseOpenCLKernel::Prepare() { 
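The kernel changes above all drop the empty std::set of build options; BuildKernel now takes an optional std::vector<std::string> of extra options. A minimal sketch of both call forms (the extra define shown is hypothetical, not taken from this patch):

    // common case: no extra options, as in the kernels above
    ocl_runtime_->LoadSource(program_name, source);
    ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);

    // a kernel that still needs additional compile definitions passes them explicitly
    std::vector<std::string> build_options_ext = {"-DCHECK_IDX"};  // hypothetical option
    ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext);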
static std::set code_map; std::string source = Codegen(); code_map.insert(source); - // std::cout << name() << "\n" << source; std::string program_name = "FusionEltwise" + std::to_string(code_map.size()); std::string kernel_name = "FusionEltwise"; @@ -182,7 +181,6 @@ int FusionEltwiseOpenCLKernel::InitWeights() { if (IsScalar(tensor->shape())) { float value = (tensor->data_type() == kNumberTypeFloat16) ? *(reinterpret_cast(tensor->data_c())) : *(reinterpret_cast(tensor->data_c())); - // std::cout << "value=" << value << std::endl; scalar_weights_.push_back(value); } else { auto tensor_info = GpuTensorInfo(tensor); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc index e6fc9e9b5a..3a3f5faa1e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc @@ -212,13 +212,12 @@ int LayerNormOpenCLKernel::Prepare() { } std::string kernel_name = "LayerNormalization_NHWC4"; std::string kernel_name_mean_var = "ComputeMeanVar"; - std::set build_options; std::string source = layer_norm_source; std::string program_name = "LayerNormalization"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); kernel_name_mean_var += "Dim" + std::to_string(normalized_dims_) + "NHWC4"; - ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var, build_options); + ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc index 4b1d79cd39..7a7cac3e47 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc @@ -51,11 +51,10 @@ int OneHotOpenCLKernel::Prepare() { #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else - std::set build_options; std::string source = one_hot_source; std::string program_name = "OneHot"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); #endif InitWeights(); SetConstArgs(); diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index cd77c43b31..dce0dca113 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -37,11 +37,11 @@ using mindspore::kernel::CLErrorCode; namespace mindspore::lite::opencl { -static std::map g_opencl_program_map; +static std::map g_source_map; static std::mutex g_mtx; static std::mutex g_init_mtx; -bool OpenCLRuntime::init_done_ = false; +InitState OpenCLRuntime::init_state_ = UnInit; OpenCLRuntime *OpenCLRuntime::ocl_runtime_instance_ = nullptr; size_t OpenCLRuntime::instance_count_ = 0; @@ -60,6 +60,7 @@ void OpenCLRuntime::DeleteInstance() { std::unique_lock lck(g_mtx); if (instance_count_ == 0) { MS_LOG(ERROR) << "No OpenCLRuntime instance could delete!"; + return; } instance_count_--; if (instance_count_ == 0) { @@ -67,8 +68,6 @@ void OpenCLRuntime::DeleteInstance() { } } -OpenCLRuntime::OpenCLRuntime() { default_build_opts_ 
= " -cl-mad-enable -cl-fast-relaxed-math -Werror"; } - void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) { fwrite(buffer, 1, length, stdout); } @@ -76,16 +75,19 @@ void printf_callback(const char *buffer, size_t length, size_t final, void *user // Init will get platforms info, get devices info, create opencl context. int OpenCLRuntime::Init() { std::unique_lock lck(g_init_mtx); - - if (init_done_) { + if (init_state_ == InitSuccess) { return RET_OK; + } else if (init_state_ == InitFailed) { + return RET_ERROR; } + init_state_ = InitFailed; + MS_LOG(INFO) << "OpenCL version: CL_TARGET_OPENCL_VERSION " << CL_TARGET_OPENCL_VERSION; MS_LOG(INFO) << "CL_HPP_TARGET_OPENCL_VERSION " << CL_HPP_TARGET_OPENCL_VERSION; MS_LOG(INFO) << "CL_HPP_MINIMUM_OPENCL_VERSION " << CL_HPP_MINIMUM_OPENCL_VERSION; #ifdef USE_OPENCL_WRAPPER - if (lite::opencl::LoadOpenCLLibrary(handle_) == false) { + if (!lite::opencl::LoadOpenCLLibrary(&handle_)) { MS_LOG(ERROR) << "Load OpenCL symbols failed!"; return RET_ERROR; } @@ -93,35 +95,35 @@ int OpenCLRuntime::Init() { std::vector platforms; cl_int ret = cl::Platform::get(&platforms); - if (platforms.size() == 0) { + if (platforms.empty()) { MS_LOG(ERROR) << "OpenCL Platform not found!" << CLErrorCode(ret); return RET_ERROR; } // search GPU std::vector devices; - for (auto it = platforms.begin(); it != platforms.end(); ++it) { + for (auto &platform : platforms) { std::string platform_name; - ret = it->getInfo(CL_PLATFORM_NAME, &platform_name); + ret = platform.getInfo(CL_PLATFORM_NAME, &platform_name); if (ret != CL_SUCCESS) { MS_LOG(WARNING) << CLErrorCode(ret); } - ret = it->getDevices(CL_DEVICE_TYPE_GPU, &devices); + ret = platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); if (ret != CL_SUCCESS) { MS_LOG(WARNING) << CLErrorCode(ret); } MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs"; - if (devices.size() > 0) { + if (!devices.empty()) { std::string device_name = devices[0].getInfo(); MS_LOG(INFO) << "Find GPU: " << device_name.c_str(); - cl::Platform::setDefault(*it); + cl::Platform::setDefault(platform); break; } } // not found, return error code. 
- if (devices.size() == 0) { + if (devices.empty()) { MS_LOG(ERROR) << "OpenCL Device not found!"; return RET_ERROR; } @@ -264,23 +266,18 @@ int OpenCLRuntime::Init() { std::string flag = ""; binary_program_ = CreateProgramFromIL(g_program_binary, flag); #endif - if (enable_cache_) { - InitGpuCache(); - } - init_done_ = true; + LoadCache(); + init_state_ = InitSuccess; MS_LOG(INFO) << "OpenCLRuntime init done!"; - return RET_OK; } int OpenCLRuntime::Uninit() { - if (!init_done_) { + std::unique_lock lck(g_init_mtx); + if (init_state_ != InitSuccess) { return RET_OK; } - if (enable_cache_ && !binary_map_.empty()) { - StoreCache(); - } - binary_map_.clear(); + StoreCache(); program_map_.clear(); delete allocator_; delete default_command_queue_; @@ -296,7 +293,7 @@ int OpenCLRuntime::Uninit() { lite::opencl::UnLoadOpenCLLibrary(handle_); handle_ = nullptr; #endif - init_done_ = false; + init_state_ = UnInit; return RET_OK; } @@ -355,54 +352,39 @@ bool OpenCLRuntime::SetFp16Enable(bool enable) { } int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name, - const std::set &build_options) { - std::string build_options_str; - // set default macro + const std::vector &build_options_ext) { + std::string build_option = default_build_option_; if (fp16_enable_) { - // fp16 enable, kernel will use half and read_imageh and write_imageh. - build_options_str = - "-DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 -DUINT4=ushort4 " - "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4 "; + build_option += + " -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 -DUINT4=ushort4 " + "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4"; } else { - // fp16 not enable, kernel will use float and read_imagef and write_imagef. - build_options_str = - "-DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 -DUINT4=uint4 " - "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4 "; - } - - auto build_options_ext = std::accumulate(build_options.begin(), build_options.end(), std::string(""), - [](const std::string &options, const std::string &option) -> std::string { - auto res = options + " " + option; - return res; - }); - build_options_str += default_build_opts_; - // program identifier = program_name + build_options - std::string build_program_key = program_name + build_options_str + build_options_ext; - - auto build_program_it = program_map_.find(build_program_key); + build_option += + " -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 -DUINT4=uint4 " + "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4"; + } + build_option = + std::accumulate(build_options_ext.begin(), build_options_ext.end(), build_option, + [](const std::string &options, const std::string &option) { return options + " " + option; }); + cl::Program program; - // if search program identifier exist, then use it. 
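For context on the -DFLT/-DREAD_IMAGE/-DWRITE_IMAGE defines assembled in BuildKernel above: every kernel is written once against these macros and is compiled as half or float depending on SetFp16Enable(). An illustrative kernel fragment (not from this patch), embedded the way the runtime consumes sources, i.e. as a C++ string:

    // Illustrative only: shows how the FLT4 / READ_IMAGE / WRITE_IMAGE macros are meant to be used.
    const std::string example_source = R"CL(
    __kernel void Copy(__read_only image2d_t src, __write_only image2d_t dst, sampler_t smp) {
      int2 pos = (int2)(get_global_id(0), get_global_id(1));
      FLT4 v = READ_IMAGE(src, smp, pos);  // read_imageh when fp16 is enabled, read_imagef otherwise
      WRITE_IMAGE(dst, pos, v);            // write_imageh / write_imagef accordingly
    }
    )CL";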
-  if (build_program_it != program_map_.end()) {
-    program = build_program_it->second;
+  auto program_key = std::make_pair(program_name, build_option);
+  auto iter = program_map_.find(program_key);
+  if (iter != program_map_.end()) {
+    program = iter->second;
   } else {
-    // load program and build program
+    flush_cache_ = true;
     auto status = this->LoadProgram(program_name, &program);
     if (!status) {
       MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
       return RET_ERROR;
     }
-    status = this->BuildProgram(build_options_str, program);
+    status = this->BuildProgram(build_option, program);
     if (!status) {
       MS_LOG(ERROR) << program_name << " build failed!";
       return RET_ERROR;
     }
-    if (enable_cache_) {
-      need_write_ = true;
-      auto bin = GetProgramBinaries(program);
-      MS_ASSERT(bin.size() >= 1);
-      binary_map_.emplace(build_program_key, bin[0]);
-    }
-    program_map_.emplace(build_program_key, program);
+    program_map_.emplace(program_key, program);
   }
 
   cl_int ret;
@@ -446,6 +428,7 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global
   }
   return RET_OK;
 }
+
 // get gpu divce type
 GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) {
   GpuInfo info;
@@ -472,17 +455,17 @@ GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_
 }
 
 bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) {
-  auto it_source = g_opencl_program_map.find(program_name);
-  if (it_source == g_opencl_program_map.end()) {
-    g_opencl_program_map.emplace(program_name, source);
+  auto it_source = g_source_map.find(program_name);
+  if (it_source == g_source_map.end()) {
+    g_source_map.emplace(program_name, source);
   }
   return true;
 }
 
 // load program with program name.
 bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *program) {
-  auto it_source = g_opencl_program_map.find(program_name);
-  if (it_source != g_opencl_program_map.end()) {
+  auto it_source = g_source_map.find(program_name);
+  if (it_source != g_source_map.end()) {
     cl::Program::Sources sources;
     sources.push_back(it_source->second);
     *program = cl::Program(*context_, sources);
@@ -494,8 +477,8 @@ bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *pr
 }
 
 // build program with build options
-bool OpenCLRuntime::BuildProgram(const std::string &build_options, const cl::Program &program) {
-  cl_int ret = program.build({*device_}, build_options.c_str());
+bool OpenCLRuntime::BuildProgram(const std::string &build_option, const cl::Program &program) {
+  cl_int ret = program.build({*device_}, build_option.c_str());
   if (ret != CL_SUCCESS) {
     if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
       std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
@@ -658,92 +641,103 @@ cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector &binary,
 }
 
 // build program with binary
-cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag) {
+cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary,
+                                                   const std::string &build_option) {
   cl::Program program = cl::Program(*context_, {*device_}, {binary});
-  bool status = BuildProgram(default_build_opts_, program);
+  bool status = BuildProgram(build_option, program);
   if (!status) {
     MS_LOG(ERROR) << "Build program with binary failed!";
   }
   return program;
 }
 
-std::vector<std::vector<unsigned char>> OpenCLRuntime::GetProgramBinaries(const cl::Program &program) {
+std::vector<unsigned char> OpenCLRuntime::GetProgramBinary(const cl::Program &program) {
   cl_int ret = CL_SUCCESS;
-  auto binary = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
+  auto binarys = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
   if (ret != CL_SUCCESS) {
     MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret);
   }
-  return binary;
-}
-void OpenCLRuntime::InitGpuCache() {
-  size_t len;
-  char *buf = lite::ReadFile(cache_path_.c_str(), &len);
-  if (LoadCache(buf) != RET_OK) {
-    MS_LOG(ERROR) << "Load opencl cache fail";
+  if (binarys.empty()) {
+    MS_LOG(ERROR) << "binarys is empty";
+    return {};
   }
-  delete buf;
-  MS_LOG(INFO) << "Init opencl cache success";
+  return binarys.front();
 }
-int OpenCLRuntime::LoadCache(const void *buf) {
+
+void OpenCLRuntime::LoadCache() {
+  if (!enable_cache_) {
+    return;
+  }
+  size_t len;
+  std::unique_ptr<char[]> buf(lite::ReadFile(cache_path_.c_str(), &len));
   if (buf == nullptr) {
-    return RET_ERROR;
+    MS_LOG(ERROR) << "Load opencl cache fail: buf == nullptr";
+    return;
   }
-  auto gpu_cache = schema::GetGpuCache(buf);
+  auto gpu_cache = schema::GetGpuCache(buf.get());
   if (gpu_cache == nullptr) {
-    return RET_ERROR;
+    MS_LOG(ERROR) << "Load opencl cache fail: gpu_cache == nullptr";
+    return;
   }
   auto *bins = gpu_cache->allBins();
   if (bins == nullptr) {
-    return RET_ERROR;
+    MS_LOG(ERROR) << "Load opencl cache fail: bins == nullptr";
+    return;
   }
-  auto n = bins->size();
-  for (auto i = 0; i < n; ++i) {
-    auto *kernel_bin = bins->template GetAs<schema::KernelBin>(i);
-    if (kernel_bin == nullptr) {
+  for (auto i = 0; i < bins->size(); ++i) {
+    auto *bin = bins->template GetAs<schema::ProgramBinary>(i);
+    if (bin == nullptr) {
       MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
-      return RET_ERROR;
+      return;
     }
-    auto *pdata = kernel_bin->data();
+    auto *pdata = bin->data();
     MS_ASSERT(pdata);
     if (pdata->size() == 0) {
      continue;
    }
-    std::vector<unsigned char> bin(pdata->begin(), pdata->end());
-    auto program = CreateProgramFromBinary(bin, kernel_bin->name()->str());
-    program_map_.emplace(kernel_bin->name()->str(), program);
-    binary_map_.emplace(kernel_bin->name()->str(), bin);
-    MS_LOG(INFO) << "LoadCache " << kernel_bin->name()->str() << " success, size=" << pdata->size();
+    std::vector<unsigned char> binary(pdata->begin(), pdata->end());
+    auto program = CreateProgramFromBinary(binary, bin->build_option()->str());
+    program_map_.emplace(std::make_pair(bin->program_name()->str(), bin->build_option()->str()), program);
+    MS_LOG(INFO) << "LoadCache " << bin->program_name() << " success, size=" << binary.size();
   }
-  return RET_OK;
+  MS_LOG(INFO) << "Init opencl cache success";
 }
-void OpenCLRuntime::StoreCache() {
-  if (need_write_) {
-    auto fbb_ = new (std::nothrow) flatbuffers::FlatBufferBuilder;
-    if (fbb_ == nullptr) {
-      MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
-      return;
-    }
-    std::vector<flatbuffers::Offset<schema::KernelBin>> vec_kernel_bin;
-    for (auto iv : binary_map_) {
-      auto name = fbb_->CreateString(iv.first);
-      auto data = fbb_->CreateVector(iv.second);
-      std::vector<int> shape;
-      auto tune = schema::CreateTuneParam(*fbb_, fbb_->CreateVector(shape), fbb_->CreateVector(shape),
-                                          fbb_->CreateVector(shape), fbb_->CreateVector(shape));
-      auto kbin = schema::CreateKernelBin(*fbb_, name, tune, data);
-      vec_kernel_bin.emplace_back(kbin);
-      MS_LOG(INFO) << "StoreCache " << iv.first << " success, size=" << iv.second.size();
-    }
-    auto data = fbb_->CreateVector<flatbuffers::Offset<schema::KernelBin>>(vec_kernel_bin);
-    auto name = fbb_->CreateString("OpenCLCache");
-    auto version = fbb_->CreateString(version_);
-    auto gpu_cache = schema::CreateGpuCache(*fbb_, name, version, data);
-    fbb_->Finish(gpu_cache);
-    uint8_t *buf = fbb_->GetBufferPointer();
-    lite::WriteToBin(cache_path_, reinterpret_cast(buf), fbb_->GetSize());
-    MS_LOG(INFO) << "store opencl cache ok, size=" << fbb_->GetSize();
"store opencl cache ok, size=" << fbb_->GetSize(); - delete fbb_; - } +void OpenCLRuntime::StoreCache() { + if (!enable_cache_) { + return; + } + if (!flush_cache_) { + return; + } + auto fbb = std::make_unique(); + if (fbb == nullptr) { + MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail"; + return; + } + std::vector> program_binarys; + for (const auto &kv : program_map_) { + auto program_name = kv.first.first; + auto build_option = kv.first.second; + cl::Program program = kv.second; + auto binary = this->GetProgramBinary(program); + std::vector shape; + auto tune = schema::CreateTuneParam(*fbb, fbb->CreateVector(shape), fbb->CreateVector(shape), + fbb->CreateVector(shape), fbb->CreateVector(shape)); + auto program_binary = schema::CreateProgramBinary( + *fbb, fbb->CreateString(program_name), fbb->CreateString(build_option), tune, fbb->CreateVector(binary)); + program_binarys.emplace_back(program_binary); + MS_LOG(INFO) << "StoreCache " << program_name << " success, size=" << binary.size(); + } + + auto data = fbb->CreateVector>(program_binarys); + auto name = fbb->CreateString("OpenCLCache"); + auto version = fbb->CreateString(cache_version_); + auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data); + fbb->Finish(gpu_cache); + uint8_t *buf = fbb->GetBufferPointer(); + lite::WriteToBin(cache_path_, reinterpret_cast(buf), fbb->GetSize()); + MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize(); } + } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index 3af20f5c69..8a94912d1e 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -23,6 +23,7 @@ j* you may not use this file except in compliance with the License. 
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
index 3af20f5c69..8a94912d1e 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -23,6 +23,7 @@ j* you may not use this file except in compliance with the License.
 #include
 #include
 #include
+#include
 #include
 #include "src/common/log_adapter.h"
 #include "src/runtime/opencl/opencl_wrapper.h"
@@ -33,6 +34,7 @@ namespace mindspore::lite::opencl {
 
 enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 };
 enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 };
+enum InitState { UnInit = 0, InitSuccess = 1, InitFailed = 2 };
 
 struct GpuInfo {
   GpuType type = OTHER;
@@ -113,10 +115,10 @@ class OpenCLRuntime {
   cl::Program CreateProgramFromIL(const std::vector &binary, const std::string &flag);
   cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag);
   cl::Kernel GetKernelFromBinary(const std::string &kernel_name);
-  std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program);
+  std::vector<unsigned char> GetProgramBinary(const cl::Program &program);
   bool LoadSource(const std::string &program_name, const std::string &source);
   int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
-                  const std::set<std::string> &build_options = {});
+                  const std::vector<std::string> &build_options_ext = {});
   int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
                 cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
   int ReadOrWriteImage(void *buffer, void *data, bool is_read);
@@ -146,23 +148,20 @@ class OpenCLRuntime {
 
   void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; }
   TuningMode GetTuningMode() const { return tuning_mode_; }
-  void InitGpuCache();
-  int LoadCache(const void *buf);
-  void StoreCache();
 
   bool isProfiling() const { return profiling_; }
   void SetProfiling(bool profiling) { profiling_ = profiling; }
 
 private:
   static OpenCLRuntime *GetInstance();
   static void DeleteInstance();
-  OpenCLRuntime();
+  OpenCLRuntime() = default;
   GpuInfo ParseGpuInfo(std::string device_name, std::string device_version);
   bool LoadProgram(const std::string &program_name, cl::Program *program);
   bool BuildProgram(const std::string &build_options, const cl::Program &program);
 
 private:
-  static bool init_done_;
+  static InitState init_state_;
   static size_t instance_count_;
   static OpenCLRuntime *ocl_runtime_instance_;
   cl::CommandQueue *default_command_queue_{nullptr};
@@ -170,15 +169,15 @@ class OpenCLRuntime {
   cl::Context *context_{nullptr};
   cl::Device *device_{nullptr};
   OpenCLAllocator *allocator_{nullptr};
-  std::map<std::string, cl::Program> program_map_;
-  cl::Program binary_program_{0};
+  std::map<std::pair<std::string, std::string>, cl::Program> program_map_;
+  cl::Program binary_program_;
   uint64_t global_memery_cachesize_{0};
   uint64_t global_memery_size_{0};
   uint64_t max_alloc_size_{0};
   int max_work_group_size_{1};
   uint32_t compute_units_{0};
   uint32_t max_freq_{0};
-  std::string default_build_opts_{""};
+  std::string default_build_option_{"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
   GpuInfo gpu_info_;
   bool support_fp16_{false};
   bool fp16_enable_{false};
@@ -187,13 +186,17 @@ class OpenCLRuntime {
   cl_uint image_pitch_align_{0};
   std::vector<size_t> max_work_item_sizes_;
   void *handle_{nullptr};
-  std::map<std::string, std::vector<unsigned char>> binary_map_;
-  std::string cache_path_{"/data/local/tmp/opencl_cache"};
-  const std::string version_{"V0.1"};
-  bool need_write_{false};
-  bool enable_cache_{false};
   TuningMode tuning_mode_{TuningMode::DEFAULT};
   bool profiling_{false};
+
+  // for cache
+ private:
+  void LoadCache();
+  void StoreCache();
+  bool enable_cache_{false};
+  bool flush_cache_{false};
+  std::string cache_path_{"/data/local/tmp/.opencl_cache"};
+  const std::string cache_version_{"V0.1"};
 };
 
 class OpenCLRuntimeWrapper {
diff --git a/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc b/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc
index 7cbac3205b..eb039872d0 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_wrapper.cc
@@ -74,18 +74,22 @@ bool UnLoadOpenCLLibrary(void *handle) {
   return true;
 }
 
-bool LoadLibraryFromPath(const std::string &library_path, void *handle) {
-  handle = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL);
-  if (handle == nullptr) {
+bool LoadLibraryFromPath(const std::string &library_path, void **handle_ptr) {
+  if (handle_ptr == nullptr) {
+    return false;
+  }
+
+  *handle_ptr = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+  if (*handle_ptr == nullptr) {
     return false;
   }
 
 // load function ptr use dlopen and dlsym.
 #define LOAD_OPENCL_FUNCTION_PTR(func_name)                                                       \
-  func_name = reinterpret_cast(dlsym(handle, #func_name));                                        \
+  func_name = reinterpret_cast(dlsym(*handle_ptr, #func_name));                                   \
   if (func_name == nullptr) {                                                                     \
     MS_LOG(ERROR) << "load func (" << #func_name << ") from (" << library_path << ") failed!";    \
-    UnLoadOpenCLLibrary(handle);                                                                  \
+    UnLoadOpenCLLibrary(*handle_ptr);                                                             \
     return false;                                                                                 \
   }
 
@@ -160,13 +164,16 @@ bool LoadLibraryFromPath(const std::string &library_path, void *handle) {
   return true;
 }
 // load default library path
-bool LoadOpenCLLibrary(void *handle) {
-  if (handle != nullptr) {
+bool LoadOpenCLLibrary(void **handle_ptr) {
+  if (handle_ptr == nullptr) {
+    return false;
+  }
+  if (*handle_ptr != nullptr) {
     return true;
   }
-  auto it = std::find_if(
-    g_opencl_library_paths.begin(), g_opencl_library_paths.end(),
-    [&handle](const std::string &lib_path) { return lite::opencl::LoadLibraryFromPath(lib_path, handle); });
+  auto it =
+    std::find_if(g_opencl_library_paths.begin(), g_opencl_library_paths.end(),
+                 [&](const std::string &lib_path) { return lite::opencl::LoadLibraryFromPath(lib_path, handle_ptr); });
   if (it != g_opencl_library_paths.end()) {
     MS_LOG(DEBUG) << "Find a OpenCL dynamic library : " << *it;
     return true;
diff --git a/mindspore/lite/src/runtime/opencl/opencl_wrapper.h b/mindspore/lite/src/runtime/opencl/opencl_wrapper.h
index 2f9a3cb5a7..17ab07f769 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_wrapper.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_wrapper.h
@@ -20,25 +20,13 @@
 #include
 #include
 #include
-
-// support opencl min version is 1.1
-#ifndef CL_TARGET_OPENCL_VERSION
-#define CL_TARGET_OPENCL_VERSION 210
-#endif
-#ifndef CL_HPP_TARGET_OPENCL_VERSION
-#define CL_HPP_TARGET_OPENCL_VERSION 210
-#endif
-#ifndef CL_HPP_MINIMUM_OPENCL_VERSION
-#define CL_HPP_MINIMUM_OPENCL_VERSION 110
-#endif
-
 #include "CL/cl2.hpp"
 
 #ifdef USE_OPENCL_WRAPPER
 
 namespace mindspore::lite::opencl {
 
 // This is a opencl function wrapper.
-bool LoadOpenCLLibrary(void *handle);
+bool LoadOpenCLLibrary(void **handle_ptr);
 bool UnLoadOpenCLLibrary(void *handle);
 // get platfrom id
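The wrapper change above also fixes handle propagation: the old LoadLibraryFromPath wrote the dlopen() result into a by-value parameter, so the caller's handle stayed null; the new functions take void ** and report the handle back. A minimal caller sketch against the new signatures (mirrors how opencl_runtime.cc uses them):

    void *handle = nullptr;
    if (!mindspore::lite::opencl::LoadOpenCLLibrary(&handle)) {
      MS_LOG(ERROR) << "Load OpenCL symbols failed!";  // no usable OpenCL runtime on this device
    }
    // ... use the wrapped cl* entry points ...
    mindspore::lite::opencl::UnLoadOpenCLLibrary(handle);
    handle = nullptr;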