!11204 【MS】【LITE】【GPU】 optimize opencl load/store program cache

From: @wangdongxu6
Reviewed-by: @ddwsky, @zhanghaibo5
Signed-off-by: @ddwsky
pull/11204/MERGE
mindspore-ci-bot committed via Gitee, 4 years ago
commit 2924552783

@ -156,13 +156,15 @@ if (SUPPORT_GPU)
gene_opencl(${CMAKE_CURRENT_SOURCE_DIR})
add_definitions(-DUSE_OPENCL_WRAPPER)
add_definitions(-DMS_OPENCL_PROFILE=false)
add_definitions(-DCL_TARGET_OPENCL_VERSION=200)
add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=200)
add_definitions(-DCL_HPP_MINIMUM_OPENCL_VERSION=110)
add_compile_definitions(SUPPORT_GPU)
if (OFFLINE_COMPILE)
add_compile_definitions(PROGRAM_WITH_IL)
endif ()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/build/_deps/opencl-headers-src/)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/build/_deps/opencl-clhpp-src/include)
include_directories(${CMAKE_BINARY_DIR}/_deps/opencl-headers-src/)
include_directories(${CMAKE_BINARY_DIR}/_deps/opencl-clhpp-src/include)
endif ()
if (WIN32)

@ -23,8 +23,9 @@ table TuneParam {
opPara: [int];
}
table KernelBin {
name: string;
table ProgramBinary {
program_name: string;
build_option: string;
tune: TuneParam;
data: [ubyte];
}
@ -32,7 +33,7 @@ table KernelBin {
table GpuCache {
name: string;
version: string;
allBins: [KernelBin];
allBins: [ProgramBinary];
}
root_type GpuCache;
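
For reference, a minimal sketch of writing one entry under the renamed schema with the FlatBuffers C++ API. The generated header name, the gpu_cache namespace, and the WriteCacheFile helper are hypothetical; they only illustrate how a ProgramBinary record could end up inside a GpuCache buffer.

// Hypothetical sketch: assumes the schema above was compiled by flatc into
// "gpu_cache_generated.h" and that the generated code lives in namespace gpu_cache.
#include <cstdio>
#include <string>
#include <vector>
#include "flatbuffers/flatbuffers.h"
#include "gpu_cache_generated.h"  // hypothetical generated header

void WriteCacheFile(const std::string &path, const std::string &model_name,
                    const std::string &build_option,
                    const std::vector<unsigned char> &program_binary) {
  flatbuffers::FlatBufferBuilder builder;
  // One ProgramBinary entry: program name plus build option identify the binary.
  auto entry = gpu_cache::CreateProgramBinary(
      builder, builder.CreateString("argminmax"), builder.CreateString(build_option),
      /*tune=*/0, builder.CreateVector(program_binary));
  std::vector<flatbuffers::Offset<gpu_cache::ProgramBinary>> bins{entry};
  auto cache = gpu_cache::CreateGpuCache(builder, builder.CreateString(model_name),
                                         builder.CreateString("V0.1"),
                                         builder.CreateVector(bins));
  builder.Finish(cache);
  // Dump the finished buffer to the cache file on disk.
  if (FILE *fp = std::fopen(path.c_str(), "wb")) {
    std::fwrite(builder.GetBufferPointer(), 1, builder.GetSize(), fp);
    std::fclose(fp);
  }
}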

@ -546,6 +546,9 @@ LiteSession::~LiteSession() {
#if SUPPORT_NPU
mindspore::lite::NPUPassManager::GetInstance()->Clear();
mindspore::lite::NPUManager::GetInstance()->Reset();
#endif
#if SUPPORT_GPU && !SUPPORT_TRAIN
delete opencl_runtime_wrapper_;
#endif
delete (model_);
is_running_.store(false);
@ -676,8 +679,13 @@ int LiteSession::Resize(const std::vector<mindspore::tensor::MSTensor *> &inputs
int LiteSession::InitGPURuntime() {
#if SUPPORT_GPU && !SUPPORT_TRAIN
if (this->context_->IsGpuEnabled()) {
opencl_runtime_wrapper_ = new (std::nothrow) opencl::OpenCLRuntimeWrapper();
if (opencl_runtime_wrapper_ == nullptr) {
MS_LOG(ERROR) << "create OpenCLRuntimeWrapper failed";
return RET_ERROR;
}
auto gpu_device_info = this->context_->GetGpuInfo();
auto opencl_runtime = ocl_runtime_wrap_.GetInstance();
auto opencl_runtime = opencl_runtime_wrapper_->GetInstance();
opencl_runtime->SetFp16Enable(gpu_device_info.enable_float16_);
if (opencl_runtime->Init() != RET_OK) {
this->context_->device_list_ = {{DT_CPU, {gpu_device_info.enable_float16_, MID_CPU}}};

@ -128,7 +128,7 @@ class LiteSession : public session::LiteSession {
Model *model_ = nullptr;
std::atomic<bool> is_running_ = false;
#if SUPPORT_GPU && !SUPPORT_TRAIN
opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
opencl::OpenCLRuntimeWrapper *opencl_runtime_wrapper_{nullptr};
#endif
};
} // namespace lite
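
The wrapper member changes from a by-value object to a heap pointer so the session controls exactly when the GPU runtime is created and released. A reduced sketch of the same allocation pattern, with a placeholder Wrapper type standing in for opencl::OpenCLRuntimeWrapper:

#include <new>  // std::nothrow

// Placeholder type for illustration; stands in for opencl::OpenCLRuntimeWrapper.
struct Wrapper {
  int Init() { return 0; }
};

class Session {
 public:
  int InitGpu() {
    // new (std::nothrow) returns nullptr on failure instead of throwing,
    // matching the error-code style used by LiteSession::InitGPURuntime.
    wrapper_ = new (std::nothrow) Wrapper();
    if (wrapper_ == nullptr) {
      return -1;  // analogous to RET_ERROR
    }
    return wrapper_->Init();
  }
  ~Session() { delete wrapper_; }  // released with the session, as in ~LiteSession

 private:
  Wrapper *wrapper_ = nullptr;
};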

@ -139,11 +139,10 @@ int ArgMinMaxOpenCLKernel::Prepare() {
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string source = argminmax_source;
std::string program_name = "argminmax";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
InitWeights();
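
The kernels no longer assemble an empty std::set of build options; BuildKernel now takes an optional vector of extra options and falls back to the runtime defaults otherwise. A rough sketch of the new call pattern, assuming the OpenCLRuntime header from this PR is on the include path; the helper function is illustrative, not part of the PR:

#include <string>
#include <vector>
#include "src/runtime/opencl/opencl_runtime.h"  // path as used in this PR's tree

int BuildWithDefaults(mindspore::lite::opencl::OpenCLRuntime *rt, cl::Kernel *kernel,
                      const std::string &program_name, const std::string &kernel_name,
                      const std::string &source) {
  // Register the embedded .cl source under its program name.
  rt->LoadSource(program_name, source);
  // Extra build options are optional; pass a vector only when the kernel needs
  // flags beyond the runtime defaults, e.g. {"-DSOME_DEFINE"}.
  return rt->BuildKernel(*kernel, program_name, kernel_name);
}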

@ -165,11 +165,10 @@ int BatchNormOpenCLKernel::Initweight() {
int BatchNormOpenCLKernel::Prepare() {
use_fp16_enable_ = ocl_runtime_->GetFp16Enable();
std::string kernel_name = "Batch_normalization_NHWC4";
std::set<std::string> build_options;
std::string source = batchnorm_source;
std::string program_name = "Batch_normalization";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
int ret = Initweight();
if (ret) {

@ -94,11 +94,10 @@ int CastOpenCLKernel::Prepare() {
std::string kernel_name = "Cast";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
std::set<std::string> build_options;
std::string source = cast_source;
std::string program_name = "cast";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetConstArgs();
SetGlobalLocal();

@ -145,7 +145,6 @@ int FusionEltwiseOpenCLKernel::Prepare() {
static std::set<std::string> code_map;
std::string source = Codegen();
code_map.insert(source);
// std::cout << name() << "\n" << source;
std::string program_name = "FusionEltwise" + std::to_string(code_map.size());
std::string kernel_name = "FusionEltwise";
@ -182,7 +181,6 @@ int FusionEltwiseOpenCLKernel::InitWeights() {
if (IsScalar(tensor->shape())) {
float value = (tensor->data_type() == kNumberTypeFloat16) ? *(reinterpret_cast<float16_t *>(tensor->data_c()))
: *(reinterpret_cast<float32_t *>(tensor->data_c()));
// std::cout << "value=" << value << std::endl;
scalar_weights_.push_back(value);
} else {
auto tensor_info = GpuTensorInfo(tensor);

@ -212,13 +212,12 @@ int LayerNormOpenCLKernel::Prepare() {
}
std::string kernel_name = "LayerNormalization_NHWC4";
std::string kernel_name_mean_var = "ComputeMeanVar";
std::set<std::string> build_options;
std::string source = layer_norm_source;
std::string program_name = "LayerNormalization";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
kernel_name_mean_var += "Dim" + std::to_string(normalized_dims_) + "NHWC4";
ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var, build_options);
ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetConstArgs();
SetGlobalLocal();

@ -51,11 +51,10 @@ int OneHotOpenCLKernel::Prepare() {
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string source = one_hot_source;
std::string program_name = "OneHot";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif
InitWeights();
SetConstArgs();

File diff suppressed because it is too large.

@ -23,6 +23,7 @@ * you may not use this file except in compliance with the License.
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <type_traits>
#include "src/common/log_adapter.h"
#include "src/runtime/opencl/opencl_wrapper.h"
@ -33,6 +34,7 @@ namespace mindspore::lite::opencl {
enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 };
enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 };
enum InitState { UnInit = 0, InitSuccess = 1, InitFailed = 2 };
struct GpuInfo {
GpuType type = OTHER;
@ -113,10 +115,10 @@ class OpenCLRuntime {
cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag);
cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag);
cl::Kernel GetKernelFromBinary(const std::string &kernel_name);
std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program);
std::vector<unsigned char> GetProgramBinary(const cl::Program &program);
bool LoadSource(const std::string &program_name, const std::string &source);
int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
const std::set<std::string> &build_options = {});
const std::vector<std::string> &build_options_ext = {});
int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
int ReadOrWriteImage(void *buffer, void *data, bool is_read);
@ -146,23 +148,20 @@ class OpenCLRuntime {
void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; }
TuningMode GetTuningMode() const { return tuning_mode_; }
void InitGpuCache();
int LoadCache(const void *buf);
void StoreCache();
bool isProfiling() const { return profiling_; }
void SetProfiling(bool profiling) { profiling_ = profiling; }
private:
static OpenCLRuntime *GetInstance();
static void DeleteInstance();
OpenCLRuntime();
OpenCLRuntime() = default;
GpuInfo ParseGpuInfo(std::string device_name, std::string device_version);
bool LoadProgram(const std::string &program_name, cl::Program *program);
bool BuildProgram(const std::string &build_options, const cl::Program &program);
private:
static bool init_done_;
static InitState init_state_;
static size_t instance_count_;
static OpenCLRuntime *ocl_runtime_instance_;
cl::CommandQueue *default_command_queue_{nullptr};
@ -170,15 +169,15 @@ class OpenCLRuntime {
cl::Context *context_{nullptr};
cl::Device *device_{nullptr};
OpenCLAllocator *allocator_{nullptr};
std::map<std::string, cl::Program> program_map_;
cl::Program binary_program_{0};
std::map<std::pair<std::string, std::string>, cl::Program> program_map_;
cl::Program binary_program_;
uint64_t global_memery_cachesize_{0};
uint64_t global_memery_size_{0};
uint64_t max_alloc_size_{0};
int max_work_group_size_{1};
uint32_t compute_units_{0};
uint32_t max_freq_{0};
std::string default_build_opts_{""};
std::string default_build_option_{"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
GpuInfo gpu_info_;
bool support_fp16_{false};
bool fp16_enable_{false};
@ -187,13 +186,17 @@ class OpenCLRuntime {
cl_uint image_pitch_align_{0};
std::vector<size_t> max_work_item_sizes_;
void *handle_{nullptr};
std::map<std::string, std::vector<unsigned char>> binary_map_;
std::string cache_path_{"/data/local/tmp/opencl_cache"};
const std::string version_{"V0.1"};
bool need_write_{false};
bool enable_cache_{false};
TuningMode tuning_mode_{TuningMode::DEFAULT};
bool profiling_{false};
// for cache
private:
void LoadCache();
void StoreCache();
bool enable_cache_{false};
bool flush_cache_{false};
std::string cache_path_{"/data/local/tmp/.opencl_cache"};
const std::string cache_version_{"V0.1"};
};
class OpenCLRuntimeWrapper {
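
program_map_ is now keyed on the (program_name, build_option) pair, so the same source compiled with different options gets separate cache entries. A self-contained sketch of that lookup pattern; a string stands in for cl::Program so the example compiles without OpenCL headers:

#include <iostream>
#include <map>
#include <string>
#include <utility>

using Program = std::string;  // stand-in for cl::Program, illustration only

int main() {
  // Key = (program_name, build_option); the same source built with different
  // options must not collide in the cache.
  std::map<std::pair<std::string, std::string>, Program> program_map;

  const std::string default_opts = "-cl-mad-enable -cl-fast-relaxed-math -Werror";
  program_map[{"argminmax", default_opts}] = "binary-A";
  program_map[{"argminmax", default_opts + " -DFP16_ENABLE"}] = "binary-B";

  auto it = program_map.find({"argminmax", default_opts});
  if (it != program_map.end()) {
    std::cout << "cache hit: " << it->second << "\n";  // reuse the built program
  } else {
    std::cout << "cache miss: build the program and insert it\n";
  }
  return 0;
}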

@ -74,18 +74,22 @@ bool UnLoadOpenCLLibrary(void *handle) {
return true;
}
bool LoadLibraryFromPath(const std::string &library_path, void *handle) {
handle = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL);
if (handle == nullptr) {
bool LoadLibraryFromPath(const std::string &library_path, void **handle_ptr) {
if (handle_ptr == nullptr) {
return false;
}
*handle_ptr = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL);
if (*handle_ptr == nullptr) {
return false;
}
// load function ptr use dlopen and dlsym.
#define LOAD_OPENCL_FUNCTION_PTR(func_name) \
func_name = reinterpret_cast<func_name##Func>(dlsym(handle, #func_name)); \
func_name = reinterpret_cast<func_name##Func>(dlsym(*handle_ptr, #func_name)); \
if (func_name == nullptr) { \
MS_LOG(ERROR) << "load func (" << #func_name << ") from (" << library_path << ") failed!"; \
UnLoadOpenCLLibrary(handle); \
UnLoadOpenCLLibrary(*handle_ptr); \
return false; \
}
@ -160,13 +164,16 @@ bool LoadLibraryFromPath(const std::string &library_path, void *handle) {
return true;
}
// load default library path
bool LoadOpenCLLibrary(void *handle) {
if (handle != nullptr) {
bool LoadOpenCLLibrary(void **handle_ptr) {
if (handle_ptr == nullptr) {
return false;
}
if (*handle_ptr != nullptr) {
return true;
}
auto it = std::find_if(
g_opencl_library_paths.begin(), g_opencl_library_paths.end(),
[&handle](const std::string &lib_path) { return lite::opencl::LoadLibraryFromPath(lib_path, handle); });
auto it =
std::find_if(g_opencl_library_paths.begin(), g_opencl_library_paths.end(),
[&](const std::string &lib_path) { return lite::opencl::LoadLibraryFromPath(lib_path, handle_ptr); });
if (it != g_opencl_library_paths.end()) {
MS_LOG(DEBUG) << "Find a OpenCL dynamic library : " << *it;
return true;
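
The wrapper now receives the handle through a void** so the result of dlopen is visible to the caller; the earlier version assigned into a by-value pointer parameter and the handle was lost. A condensed, self-contained sketch of the same pattern, loading a single symbol instead of the full OpenCL function table; the library path and function-pointer type are simplified for illustration:

#include <dlfcn.h>
#include <cstdio>
#include <string>

// Simplified function-pointer type for one symbol; the real wrapper resolves
// the whole OpenCL API this way via the LOAD_OPENCL_FUNCTION_PTR macro.
using clGetPlatformIDsFunc = int (*)(unsigned, void *, unsigned *);
static clGetPlatformIDsFunc my_clGetPlatformIDs = nullptr;

bool LoadFromPath(const std::string &library_path, void **handle_ptr) {
  if (handle_ptr == nullptr) {
    return false;  // caller must supply somewhere to store the handle
  }
  *handle_ptr = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL);
  if (*handle_ptr == nullptr) {
    return false;
  }
  // Resolve symbols through the handle owned by the caller.
  my_clGetPlatformIDs =
      reinterpret_cast<clGetPlatformIDsFunc>(dlsym(*handle_ptr, "clGetPlatformIDs"));
  if (my_clGetPlatformIDs == nullptr) {
    dlclose(*handle_ptr);
    *handle_ptr = nullptr;
    return false;
  }
  return true;
}

int main() {
  void *handle = nullptr;
  // Illustrative path; the runtime tries a list of well-known locations.
  if (!LoadFromPath("/vendor/lib64/libOpenCL.so", &handle)) {
    std::printf("OpenCL library not found\n");
  }
  return handle ? 0 : 1;
}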

@ -20,25 +20,13 @@
#include <memory>
#include <string>
#include <algorithm>
// support opencl min version is 1.1
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 210
#endif
#ifndef CL_HPP_TARGET_OPENCL_VERSION
#define CL_HPP_TARGET_OPENCL_VERSION 210
#endif
#ifndef CL_HPP_MINIMUM_OPENCL_VERSION
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#endif
#include "CL/cl2.hpp"
#ifdef USE_OPENCL_WRAPPER
namespace mindspore::lite::opencl {
// This is a opencl function wrapper.
bool LoadOpenCLLibrary(void *handle);
bool LoadOpenCLLibrary(void **handle_ptr);
bool UnLoadOpenCLLibrary(void *handle);
// get platfrom id
