From b365c3c3778840190916c306b135cc7920b1d06f Mon Sep 17 00:00:00 2001
From: wandongdong <wandongdong1@huawei.com>
Date: Tue, 24 Nov 2020 04:51:20 -0800
Subject: [PATCH] implement opencl cache bin

---
 mindspore/lite/schema/gpu_cache.fbs           | 38 ++++++++
 mindspore/lite/src/CMakeLists.txt             |  1 +
 .../src/runtime/opencl/opencl_allocator.cc    |  7 +-
 .../lite/src/runtime/opencl/opencl_runtime.cc | 91 ++++++++++++++++++-
 .../lite/src/runtime/opencl/opencl_runtime.h  | 12 ++-
 5 files changed, 143 insertions(+), 6 deletions(-)
 create mode 100644 mindspore/lite/schema/gpu_cache.fbs

diff --git a/mindspore/lite/schema/gpu_cache.fbs b/mindspore/lite/schema/gpu_cache.fbs
new file mode 100644
index 0000000000..ad9c1c613a
--- /dev/null
+++ b/mindspore/lite/schema/gpu_cache.fbs
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace mindspore.schema;
+
+table TuneParam {  // stored on each KernelBin as `tune`; field meanings are not defined anywhere in this patch
+    local: [int];  // OpenCLRuntime::StoreCache currently writes this vector empty
+    block: [int];  // OpenCLRuntime::StoreCache currently writes this vector empty
+    shape: [int];  // OpenCLRuntime::StoreCache currently writes this vector empty
+    opPara: [int];  // OpenCLRuntime::StoreCache currently writes this vector empty
+}
+
+table KernelBin {  // one cached program binary
+    name: string;  // key under which OpenCLRuntime::LoadCache registers the entry in program_map_/binary_map_
+    tune: TuneParam;  // written by StoreCache with empty vectors; LoadCache does not read it
+    data: [ubyte];  // program binary bytes; LoadCache skips an entry whose data has size 0
+}
+
+table GpuCache {  // root table of the cache file
+    name: string;  // OpenCLRuntime::StoreCache writes the literal "OpenCLCache"
+    version: string;  // StoreCache writes OpenCLRuntime::version_ ("V0.1"); LoadCache does not compare it
+    allBins: [KernelBin];  // StoreCache emits one KernelBin per entry of OpenCLRuntime::binary_map_
+}
+
+root_type GpuCache;
diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
index 36fd19391b..b5c1d5a65a 100644
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -17,6 +17,7 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64)
 endif ()
 
 set(LITE_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/common/file_utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc
         ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc
diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
index ce98df79e1..8beaa6054a 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc
@@ -67,7 +67,7 @@ void *OpenCLAllocator::MinimumFit(size_t size, const std::vector<size_t> &img_si
 void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
   cl_int ret = CL_SUCCESS;
   MS_ASSERT(buffer);
-  *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), flags, size, data, &ret);
+  *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast<cl_mem_flags>(flags), size, data, &ret);
   if (*buffer == nullptr) {
     MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
     return nullptr;
@@ -90,6 +90,9 @@ void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::B
 void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector<size_t> &img_size, void *data, size_t flags,
                                      bool is_map, cl::Buffer **buffer, cl::Image2D **image) {
   cl_int ret = CL_SUCCESS;
+  MS_ASSERT(buffer);
+  MS_ASSERT(image);
+  MS_ASSERT(img_size.size() == 3);
   cl::ImageFormat image_format(CL_RGBA, img_size[2]);
   if (data == nullptr) {
     *image = new (std::nothrow)
@@ -332,7 +335,7 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
   }
   MemBuf *mem_buf = it->second;
   MS_ASSERT(mem_buf);
-  void *new_host_ptr;
+  void *new_host_ptr{nullptr};
   if (mem_buf->img_size.empty()) {
     cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_);
     MS_ASSERT(buffer);
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
index 881005522c..409ec1968c 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@@ -17,12 +17,14 @@
 #include "src/runtime/opencl/opencl_runtime.h"
 #include <vector>
 #include <numeric>
+#include <utility>
 #ifdef SHARING_MEM_WITH_OPENGL
 #include <EGL/egl.h>
 #endif
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/opencl/opencl_allocator.h"
+#include "src/common/file_utils.h"
 #ifdef PROGRAM_WITH_IL
 #include "src/backend/opencl/cl/program.inc"
 #endif
@@ -254,6 +256,9 @@ int OpenCLRuntime::Init() {
   std::string flag = "";
   binary_program_ = CreateProgramFromIL(g_program_binary, flag);
 #endif
+  if (enable_cache_) {
+    InitGpuCache();
+  }
   init_done_ = true;
   MS_LOG(INFO) << "OpenCLRuntime init done!";
 
@@ -261,6 +266,10 @@ int OpenCLRuntime::Init() {
 }
 
 int OpenCLRuntime::Uninit() {
+  if (enable_cache_) {
+    StoreCache();
+  }
+  binary_map_.clear();
   program_map_.clear();
   delete allocator_;
   delete default_command_queue_;
@@ -374,6 +383,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
       MS_LOG(ERROR) << program_name << " build failed!";
       return RET_ERROR;
     }
+    if (enable_cache_) {
+      need_write_ = true;
+      auto bin = GetProgramBinaries(program);
+      MS_ASSERT(bin.size() >= 1);
+      binary_map_.emplace(build_program_key, bin[0]);
+    }
     program_map_.emplace(build_program_key, program);
   }
 
@@ -673,9 +688,8 @@ cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary,
 }
 
 // build program with binary
-cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary,
-                                                   const std::string &flag) {
-  cl::Program program = cl::Program(*context_, {*device_}, binary);
+cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag) {
+  cl::Program program = cl::Program(*context_, {*device_}, {binary});
   bool status = BuildProgram(default_build_opts_, program);
   if (!status) {
     MS_LOG(ERROR) << "Build program with binary failed!";
@@ -691,4 +705,75 @@ std::vector<std::vector<unsigned char>> OpenCLRuntime::GetProgramBinaries(const
   }
   return binary;
 }
+// Best-effort preload of cached program binaries from cache_path_; a failure is logged, never propagated.
+void OpenCLRuntime::InitGpuCache() {
+  size_t len = 0;
+  char *buf = lite::ReadFile(cache_path_.c_str(), &len);
+  const int ret = LoadCache(buf);  // LoadCache rejects a null buf, so no separate check is needed here
+  delete[] buf;  // array form; assumes ReadFile allocates its buffer with new[] -- TODO confirm
+  if (ret != RET_OK) { MS_LOG(ERROR) << "Load opencl cache fail"; return; }
+  MS_LOG(INFO) << "Init opencl cache success";
+}
+// Rebuilds every program stored in the GpuCache flatbuffer at buf; returns RET_ERROR if buf or its bin list is null.
+int OpenCLRuntime::LoadCache(const void *buf) {
+  if (buf == nullptr) {
+    return RET_ERROR;
+  }
+  auto gpu_cache = schema::GetGpuCache(buf);  // no length is passed in, so the buffer cannot be verified here
+  if (gpu_cache == nullptr) {
+    return RET_ERROR;
+  }
+  auto *bins = gpu_cache->allBins();
+  if (bins == nullptr) {
+    return RET_ERROR;
+  }
+  for (flatbuffers::uoffset_t i = 0; i < bins->size(); ++i) {
+    auto *kernel_bin = bins->Get(i);
+    if (kernel_bin == nullptr) {
+      MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
+      return RET_ERROR;
+    }
+    auto *pname = kernel_bin->name();
+    auto *pdata = kernel_bin->data();
+    if (pname == nullptr || pdata == nullptr || pdata->size() == 0) {
+      continue;  // absent flatbuffer fields read back as null: skip the entry instead of dereferencing it
+    }
+    const std::string name = pname->str();
+    std::vector<unsigned char> bin(pdata->begin(), pdata->end());
+    program_map_.emplace(name, CreateProgramFromBinary(bin, name));
+    binary_map_.emplace(name, std::move(bin));  // bin is not used afterwards, so hand the bytes over
+    MS_LOG(INFO) << "LoadCache " << name << " success, size=" << pdata->size();
+  }
+  return RET_OK;
+}
+// Serialises every entry of binary_map_ into a GpuCache flatbuffer and writes it to cache_path_.
+void OpenCLRuntime::StoreCache() {
+  if (!need_write_) {
+    return;  // need_write_ is only raised when BuildKernel built and cached a new program
+  }
+  flatbuffers::FlatBufferBuilder fbb;  // automatic storage: no nothrow-new/delete pair to keep balanced
+  std::vector<flatbuffers::Offset<schema::KernelBin>> vec_kernel_bin;
+  vec_kernel_bin.reserve(binary_map_.size());
+  const std::vector<int32_t> empty;  // all four TuneParam vectors are written empty
+  for (const auto &iv : binary_map_) {  // by reference: a by-value loop variable would copy every binary
+    auto name = fbb.CreateString(iv.first);
+    auto data = fbb.CreateVector<uint8_t>(iv.second);
+    auto tune = schema::CreateTuneParam(fbb, fbb.CreateVector<int32_t>(empty), fbb.CreateVector<int32_t>(empty),
+                                        fbb.CreateVector<int32_t>(empty), fbb.CreateVector<int32_t>(empty));
+    vec_kernel_bin.emplace_back(schema::CreateKernelBin(fbb, name, tune, data));
+    MS_LOG(INFO) << "StoreCache " << iv.first << " success, size=" << iv.second.size();
+  }
+
+  auto data = fbb.CreateVector<flatbuffers::Offset<schema::KernelBin>>(vec_kernel_bin);
+  auto name = fbb.CreateString("OpenCLCache");
+  auto version = fbb.CreateString(version_);
+  auto gpu_cache = schema::CreateGpuCache(fbb, name, version, data);
+  fbb.Finish(gpu_cache);
+  if (lite::WriteToBin(cache_path_, fbb.GetBufferPointer(), fbb.GetSize()) != RET_OK) {
+    MS_LOG(ERROR) << "store opencl cache fail, path=" << cache_path_;
+    return;  // keep need_write_ set so a later StoreCache can retry
+  }
+  need_write_ = false;
+  MS_LOG(INFO) << "store opencl cache ok, size=" << fbb.GetSize();
+}
 }  // namespace mindspore::lite::opencl
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
index e82f09072e..ffa70254ac 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -27,6 +27,7 @@ j* you may not use this file except in compliance with the License.
 #include "src/common/log_adapter.h"
 #include "src/runtime/opencl/opencl_wrapper.h"
 #include "src/runtime/opencl/opencl_allocator.h"
+#include "schema/gpu_cache_generated.h"
 
 namespace mindspore::lite::opencl {
 
@@ -107,7 +108,7 @@ class OpenCLRuntime {
   }
 
   cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag);
-  cl::Program CreateProgramFromBinary(const std::vector<std::vector<unsigned char>> &binary, const std::string &flag);
+  cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag);
   cl::Kernel GetKernelFromBinary(const std::string &kernel_name);
   std::vector<std::vector<unsigned char>> GetProgramBinaries(const cl::Program &program);
   bool LoadSource(const std::string &program_name, const std::string &source);
@@ -139,6 +140,10 @@ class OpenCLRuntime {
    */
   int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id);
 
+  void InitGpuCache();
+  int LoadCache(const void *buf);
+  void StoreCache();
+
  private:
   static OpenCLRuntime *GetInstance();
   static void DeleteInstance();
@@ -171,6 +176,11 @@ class OpenCLRuntime {
   cl_uint image_pitch_align_{0};
   std::vector<size_t> max_work_item_sizes_;
   void *handle_{nullptr};
+  std::map<std::string, std::vector<unsigned char>> binary_map_;
+  std::string cache_path_{"/data/local/tmp/opencl_cache"};
+  const std::string version_{"V0.1"};
+  bool need_write_{false};
+  bool enable_cache_{false};
 };
 
 class OpenCLRuntimeWrapper {