rewrite scheduler & fix subGraph segmentation bug

pull/6795/head
hangq 4 years ago
parent 5fe2164a70
commit a3c6ad09a0

@@ -81,7 +81,6 @@ include_directories(${CCSRC_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/kernel/arm)
include_directories(${TOP_DIR}/third_party)
include_directories(${TOP_DIR}/third_party/flatbuffers/include)
include_directories(${CMAKE_BINARY_DIR})
include(${TOP_DIR}/cmake/utils.cmake)

@@ -27,16 +27,6 @@
namespace mindspore {
namespace session {
/// \brief CallBackParam defined input arguments for callBack function.
struct CallBackParam {
std::string node_name; /**< node name argument */
std::string node_type; /**< node type argument */
};
/// \brief KernelCallBack defined the function pointer for callBack.
using KernelCallBack = std::function<bool(std::vector<tensor::MSTensor *> inputs,
std::vector<tensor::MSTensor *> outputs, const CallBackParam &opInfo)>;
/// \brief LiteSession defined session in MindSpore Lite for compiling Model and forwarding model.
class MS_API LiteSession {
public:

@@ -17,9 +17,11 @@
#ifndef MINDSPORE_LITE_INCLUDE_MS_TENSOR_H_
#define MINDSPORE_LITE_INCLUDE_MS_TENSOR_H_
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <memory>
#include "ir/dtype/type_id.h"
namespace mindspore {
@@ -74,5 +76,14 @@ class MS_API MSTensor {
virtual void *MutableData() = 0;
};
} // namespace tensor
/// \brief CallBackParam defined input arguments for callBack function.
struct CallBackParam {
std::string node_name; /**< node name argument */
std::string node_type; /**< node type argument */
};
/// \brief KernelCallBack defined the function pointer for callBack.
using KernelCallBack = std::function<bool(std::vector<tensor::MSTensor *> inputs,
std::vector<tensor::MSTensor *> outputs, const CallBackParam &opInfo)>;
} // namespace mindspore
#endif // MINDSPORE_LITE_INCLUDE_MS_TENSOR_H_
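A minimal sketch of how the relocated KernelCallBack can be wired into LiteSession::RunGraph after this change (illustrative only; the RunWithTrace wrapper, the include paths and the session setup are assumptions, not part of this commit):

#include <iostream>
#include "include/lite_session.h"
#include "include/ms_tensor.h"

// Illustrative only: trace each node as it executes. CallBackParam and
// KernelCallBack now live directly in the mindspore namespace.
int RunWithTrace(mindspore::session::LiteSession *session) {
  mindspore::KernelCallBack before = [](std::vector<mindspore::tensor::MSTensor *> inputs,
                                        std::vector<mindspore::tensor::MSTensor *> outputs,
                                        const mindspore::CallBackParam &op_info) {
    std::cout << "about to run " << op_info.node_name << " (" << op_info.node_type << "), "
              << inputs.size() << " inputs, " << outputs.size() << " outputs" << std::endl;
    return true;  // a false return is only logged by the executor; it does not abort the run
  };
  return session->RunGraph(before, nullptr);  // after-callback omitted; both default to nullptr
}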

@@ -29,6 +29,7 @@ set(LITE_SRC
${CMAKE_CURRENT_SOURCE_DIR}/inner_context.cc
${CMAKE_CURRENT_SOURCE_DIR}/kernel_registry.cc
${CMAKE_CURRENT_SOURCE_DIR}/lite_kernel.cc
${CMAKE_CURRENT_SOURCE_DIR}/sub_graph_kernel.cc
${CMAKE_CURRENT_SOURCE_DIR}/populate_parameter.cc
${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cc
${CMAKE_CURRENT_SOURCE_DIR}/lite_session.cc

@@ -14,11 +14,11 @@
* limitations under the License.
*/
#include <stdlib.h>
#include "src/common/file_utils.h"
#include <fcntl.h>
#include <stdlib.h>
#include <climits>
#include <cmath>
#include "src/common/file_utils.h"
#include "securec/include/securec.h"
namespace mindspore {
@@ -78,7 +78,7 @@ std::string RealPath(const char *path) {
char *real_path = realpath(path, resolvedPath.get());
#endif
if (real_path == nullptr || strlen(real_path) == 0) {
MS_LOG(ERROR) << "Proto file path is not valid";
MS_LOG(ERROR) << "file path is not valid : " << path;
return "";
}
std::string res = resolvedPath.get();

@@ -19,10 +19,7 @@
#include "include/errorcode.h"
namespace mindspore::lite {
int Executor::Run(std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_tensors,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
const session::KernelCallBack &before, const session::KernelCallBack &after) {
MS_ASSERT(nullptr != allocator);
int Executor::CheckInputs(std::vector<Tensor *> &in_tensors) {
for (auto &inTensor : in_tensors) {
if (inTensor == nullptr) {
MS_LOG(ERROR) << "Graph input tensor is nullptr";
@@ -32,10 +29,18 @@ int Executor::Run(std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_
MS_LOG(ERROR) << "Graph input tensor data is nullptr";
return RET_ERROR;
}
if (inTensor->GetFormat() != schema::Format::Format_NHWC) {
MS_LOG(ERROR) << "Model input tensor should be NHWC";
return RET_ERROR;
}
}
return RET_OK;
}
int Executor::Run(std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_tensors,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before,
const KernelCallBack &after) {
MS_ASSERT(nullptr != allocator);
auto ret = this->CheckInputs(in_tensors);
if (RET_OK != ret) {
MS_LOG(ERROR) << "CheckInputs failed";
return ret;
}
kernel::LiteKernelUtil::InitTensorRefCount(kernels);
for (auto out_tensor : out_tensors) { // increase RefCount of output tensors, such that Run will not free them
@@ -44,34 +49,20 @@ int Executor::Run(std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_
for (auto *kernel : kernels) {
MS_ASSERT(nullptr != kernel);
if (before != nullptr) {
if (!before(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()),
{kernel->name(), kernel->type_str()})) {
MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name();
}
ret = kernel->PreProcess();
if (RET_OK != ret) {
MS_LOG(ERROR) << "PreProcess kernel failed, name: " << kernel->name();
return ret;
}
auto ret = kernel->Run();
if (0 != ret) {
ret = kernel->Run(before, after);
if (RET_OK != ret) {
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
return ret;
}
if (after != nullptr) {
if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()),
{kernel->name(), kernel->type_str()})) {
MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name();
}
}
for (auto input_kernel : kernel->in_kernels()) {
MS_ASSERT(input_kernel != nullptr);
if (input_kernel->is_model_output()) {
continue;
}
ret = input_kernel->DecOutTensorRefCount();
if (0 != ret) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
ret = kernel->PostProcess();
if (RET_OK != ret) {
MS_LOG(ERROR) << "PostProcess kernel failed, name: " << kernel->name();
return ret;
}
}
return RET_OK;
@@ -99,9 +90,9 @@ int Executor::TransformTensorLayoutFp32(Tensor *tensor, schema::Format dst_forma
MS_ASSERT(4 == tensor->shape().size());
auto src_format = tensor->GetFormat();
if (src_format == schema::Format::Format_NC4HW4 && dst_format == schema::Format::Format_NHWC) {
auto *src_data = tensor->MutableData();
auto *src_data = tensor->data_c();
if (src_data == nullptr) {
MS_LOG(ERROR) << "MutableData return nullptr";
MS_LOG(ERROR) << "data of tensor is nullptr";
return RET_ERROR;
}
auto *dst_data = allocator->Malloc(tensor->Size());

@@ -28,13 +28,15 @@ class Executor {
Executor() = default;
virtual ~Executor() = default;
virtual int Prepare(std::vector<kernel::LiteKernel *> &kernels) { return 0; }
virtual int Prepare(const std::vector<kernel::LiteKernel *> &kernels) { return 0; }
virtual int Run(std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_tensors,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr,
const session::KernelCallBack &before = nullptr, const session::KernelCallBack &after = nullptr);
const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr);
protected:
int CheckInputs(std::vector<Tensor *> &in_tensors);
int TransformTensorLayoutFp32(Tensor *tensor, schema::Format dst_format, Allocator *allocator = nullptr);
int TransformTensorLayoutUint8(Tensor *tensor, schema::Format dst_format, Allocator *allocator = nullptr);

@@ -19,12 +19,21 @@
#include "src/common/log_adapter.h"
namespace mindspore::lite {
InnerContext::InnerContext(const Context *context) {
this->allocator = context->allocator;
this->thread_num_ = context->thread_num_;
this->device_list_.clear();
for (auto &device_ctx : context->device_list_) {
this->device_list_.push_back(device_ctx);
}
}
int InnerContext::Init() {
if (this->device_list_.empty()) {
MS_LOG(ERROR) << "Device list is empty.";
if (RET_OK != this->IsValid()) {
MS_LOG(ERROR) << "Context is not valid";
return RET_NOT_SUPPORT;
}
if (this->thread_pool_ == nullptr && this->device_list_[0].device_type_ == DT_CPU) {
if (this->thread_pool_ == nullptr && this->IsCpuEnabled()) {
this->thread_pool_ =
CreateLiteThreadPool(this->thread_num_, this->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_);
if (this->thread_pool_ == nullptr) {
@@ -49,4 +58,74 @@ InnerContext::~InnerContext() {
this->thread_pool_ = NULL;
}
}
int InnerContext::IsValid() {
if (this->device_list_.empty()) {
MS_LOG(ERROR) << "Device list is empty.";
return RET_NOT_SUPPORT;
}
#ifndef SUPPORT_GPU
if (IsGpuEnabled()) {
MS_LOG(ERROR) << "GPU is not supported.";
return RET_NOT_SUPPORT;
}
#endif
if (IsNpuEnabled()) {
MS_LOG(ERROR) << "NPU is not supported.";
return RET_NOT_SUPPORT;
}
return RET_OK;
}
bool InnerContext::IsCpuFloat16Enabled() {
if (!IsCpuEnabled()) {
return false;
}
return GetCpuInfo().enable_float16_;
}
bool InnerContext::IsGpuFloat16Enabled() {
if (!IsGpuEnabled()) {
return false;
}
return GetGpuInfo().enable_float16_;
}
bool InnerContext::IsCpuEnabled() {
return this->device_list_.end() !=
std::find_if(this->device_list_.begin(), this->device_list_.end(),
[](const DeviceContext &device) { return device.device_type_ == DT_CPU; });
}
bool InnerContext::IsGpuEnabled() {
return this->device_list_.end() !=
std::find_if(this->device_list_.begin(), this->device_list_.end(),
[](const DeviceContext &device) { return device.device_type_ == DT_GPU; });
}
bool InnerContext::IsNpuEnabled() {
return this->device_list_.end() !=
std::find_if(this->device_list_.begin(), this->device_list_.end(),
[](const DeviceContext &device) { return device.device_type_ == DT_NPU; });
}
CpuDeviceInfo InnerContext::GetCpuInfo() {
auto iter = std::find_if(this->device_list_.begin(), this->device_list_.end(),
[](const DeviceContext &device) { return device.device_type_ == DT_CPU; });
if (iter == this->device_list_.end()) {
return {};
} else {
return iter->device_info_.cpu_device_info_;
}
}
GpuDeviceInfo InnerContext::GetGpuInfo() {
auto iter = std::find_if(this->device_list_.begin(), this->device_list_.end(),
[](const DeviceContext &device) { return device.device_type_ == DT_GPU; });
if (iter == this->device_list_.end()) {
return {};
} else {
return iter->device_info_.gpu_device_info_;
}
}
} // namespace mindspore::lite
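A rough usage sketch of the new InnerContext helpers (internal API; the PrepareInnerContext wrapper and its error handling are assumptions for illustration, only the InnerContext members shown above are taken from the commit):

#include "include/errorcode.h"
#include "src/inner_context.h"

// Sketch only: copy a user-facing Context into an InnerContext, validate it,
// then query the capability helpers added above.
int PrepareInnerContext(const mindspore::lite::Context *user_ctx) {
  auto *inner = new (std::nothrow) mindspore::lite::InnerContext(user_ctx);
  if (inner == nullptr) {
    return mindspore::lite::RET_MEMORY_FAILED;
  }
  if (inner->Init() != mindspore::lite::RET_OK) {
    // Init() now fails early via IsValid(): empty device list, NPU,
    // or GPU when SUPPORT_GPU is not compiled in.
    delete inner;
    return mindspore::lite::RET_NOT_SUPPORT;
  }
  bool fp16 = inner->IsCpuFloat16Enabled() || inner->IsGpuFloat16Enabled();
  (void)fp16;  // e.g. could steer the scheduler toward fp16 kernels
  delete inner;
  return mindspore::lite::RET_OK;
}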

@@ -27,8 +27,28 @@ struct InnerContext : public Context {
struct ThreadPool *thread_pool_ = nullptr;
public:
InnerContext() = default;
explicit InnerContext(const Context *context);
int Init();
bool IsCpuFloat16Enabled();
bool IsGpuFloat16Enabled();
bool IsCpuEnabled();
bool IsGpuEnabled();
bool IsNpuEnabled();
CpuDeviceInfo GetCpuInfo();
GpuDeviceInfo GetGpuInfo();
int IsValid();
virtual ~InnerContext();
};
} // namespace mindspore::lite

@@ -117,8 +117,9 @@ kernel::LiteKernel *KernelRegistry::GetKernel(const std::vector<Tensor *> &in_te
if (creator != nullptr) {
auto kernel = creator(in_tensors, out_tensors, parameter, ctx, key, primitive);
if (kernel != nullptr) {
return kernel;
kernel->set_desc(key);
}
return kernel;
}
return nullptr;
}

@@ -16,8 +16,11 @@
#include "src/lite_kernel.h"
#include <algorithm>
#include "src/tensor.h"
namespace mindspore::kernel {
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
void *LiteKernel::workspace_ = nullptr;
@@ -54,7 +57,21 @@ int LiteKernel::DecOutTensorRefCount() {
return 0;
}
int LiteKernel::Prepare() {
int LiteKernel::FreeWorkTensor() const {
for (auto input_kernel : this->in_kernels()) {
MS_ASSERT(input_kernel != nullptr);
if (input_kernel->is_model_output()) {
continue;
}
auto ret = input_kernel->DecOutTensorRefCount();
if (0 != ret) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << this->name() << " failed";
}
}
return RET_OK;
}
int LiteKernel::PreProcess() {
if (!InferShapeDone()) {
(const_cast<mindspore::lite::PrimitiveC *>(primitive_))->SetInferFlag(true);
auto ret = (const_cast<mindspore::lite::PrimitiveC *>(primitive_))->InferShape(in_tensors_, out_tensors_);
@@ -70,7 +87,7 @@ int LiteKernel::Prepare() {
}
}
auto &outputs = this->out_tensors();
auto outputs = this->out_tensors();
for (auto *output : outputs) {
MS_ASSERT(output != nullptr);
output->MallocData();
@@ -78,6 +95,50 @@ int LiteKernel::Prepare() {
return RET_OK;
}
int LiteKernel::Run(const KernelCallBack &before, const KernelCallBack &after) {
if (before != nullptr) {
if (!before(TensorVectorCast(this->in_tensors_), TensorVectorCast(this->out_tensors_),
{this->name_, this->type_str()})) {
MS_LOG(WARNING) << "run kernel before_callback failed, name: " << this->name_;
}
}
auto ret = Run();
if (RET_OK != ret) {
MS_LOG(ERROR) << "run kernel failed, name: " << this->name_;
return ret;
}
if (after != nullptr) {
if (!after(TensorVectorCast(this->in_tensors_), TensorVectorCast(this->out_tensors_),
{this->name_, this->type_str()})) {
MS_LOG(ERROR) << "run kernel after_callback failed, name: " << this->name_;
}
}
return RET_OK;
}
std::string LiteKernel::ToString() const {
std::ostringstream oss;
oss << "LiteKernel: " << this->name_;
oss << ", Type: " << this->type_str();
oss << std::endl << this->in_tensors_.size() << " InputTensors:";
for (auto tensor : in_tensors_) {
oss << " " << tensor << ":" << tensor->ToString();
}
oss << std::endl << this->out_tensors_.size() << " OutputTensors:";
for (auto tensor : out_tensors_) {
oss << " " << tensor << ":" << tensor->ToString();
}
oss << std::endl << this->in_kernels_.size() << " InputKernels:";
for (auto in_kernel : in_kernels_) {
oss << " " << in_kernel->name_;
}
oss << std::endl << this->out_kernels_.size() << " OutputKernels:";
for (auto out_kernel : out_kernels_) {
oss << " " << out_kernel->name_;
}
return oss.str();
}
std::vector<kernel::LiteKernel *> LiteKernelUtil::SubgraphInputKernels(
const std::vector<kernel::LiteKernel *> &kernels) {
std::vector<kernel::LiteKernel *> input_kernels;
@@ -87,10 +148,11 @@ std::vector<kernel::LiteKernel *> LiteKernelUtil::SubgraphInputKernels(
continue;
}
for (const auto &input : kernel->in_kernels()) {
auto iter = std::find(kernels.begin(), kernels.end(), input);
auto item = std::find(input_kernels.begin(), input_kernels.end(), kernel);
if (iter == kernels.end() && item == input_kernels.end()) {
auto in_kernel_in_graph = std::find(kernels.begin(), kernels.end(), input);
auto in_kernel_in_ret = std::find(input_kernels.begin(), input_kernels.end(), kernel);
if (in_kernel_in_graph == kernels.end() && in_kernel_in_ret == input_kernels.end()) {
input_kernels.emplace_back(kernel);
break;
}
}
}
@@ -106,10 +168,11 @@ std::vector<kernel::LiteKernel *> LiteKernelUtil::SubgraphOutputKernels(
continue;
}
for (const auto &output : kernel->out_kernels()) {
auto iter = std::find(kernels.begin(), kernels.end(), output);
auto item = std::find(output_kernels.begin(), output_kernels.end(), kernel);
if (iter == kernels.end() && item == output_kernels.end()) {
auto out_kernel_in_graph = std::find(kernels.begin(), kernels.end(), output);
auto out_kernel_in_ret = std::find(output_kernels.begin(), output_kernels.end(), kernel);
if (out_kernel_in_graph == kernels.end() && out_kernel_in_ret == output_kernels.end()) {
output_kernels.emplace_back(kernel);
break;
}
}
}
@@ -120,7 +183,8 @@ std::vector<lite::Tensor *> LiteKernelUtil::SubgraphInputTensors(const std::vect
std::vector<lite::Tensor *> input_tensors;
std::vector<lite::Tensor *> all_output_tensors;
for (const auto &kernel : kernels) {
all_output_tensors.insert(all_output_tensors.end(), kernel->out_tensors().begin(), kernel->out_tensors().end());
auto kernel_out_tensors = kernel->out_tensors();
all_output_tensors.insert(all_output_tensors.end(), kernel_out_tensors.begin(), kernel_out_tensors.end());
}
std::vector<kernel::LiteKernel *> input_kernels = SubgraphInputKernels(kernels);
for (const auto &kernel : input_kernels) {
@@ -139,7 +203,8 @@ std::vector<lite::Tensor *> LiteKernelUtil::SubgraphOutputTensors(const std::vec
std::vector<lite::Tensor *> output_tensors;
std::vector<lite::Tensor *> all_input_tensors;
for (const auto &kernel : kernels) {
all_input_tensors.insert(all_input_tensors.end(), kernel->in_tensors().begin(), kernel->in_tensors().end());
auto kernel_in_tensors = kernel->in_tensors();
all_input_tensors.insert(all_input_tensors.end(), kernel_in_tensors.begin(), kernel_in_tensors.end());
}
std::vector<kernel::LiteKernel *> output_kernels = SubgraphOutputKernels(kernels);
for (const auto &kernel : output_kernels) {
@@ -153,8 +218,12 @@ std::vector<lite::Tensor *> LiteKernelUtil::SubgraphOutputTensors(const std::vec
return output_tensors;
}
void LiteKernelUtil::TopologicalSortKernels(std::vector<kernel::LiteKernel *> &kernels) {
void LiteKernelUtil::InitIOKernels(std::vector<kernel::LiteKernel *> &kernels) {
for (auto *kernel : kernels) {
// clean io kernels
kernel->SetInKernel({});
kernel->SetOutKernel({});
// find io kernels
for (auto *search_kernel : kernels) {
if (search_kernel == kernel) {
continue;

@@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include <memory>
#include <utility>
#include "src/ops/primitive_c.h"
#include "src/common/utils.h"
#ifdef ENABLE_ARM
@@ -32,9 +33,7 @@
static constexpr int kPerTensor = 1;
namespace mindspore::kernel {
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
enum KERNEL_ARCH { kCPU, kGPU, kNPU, kKernelArch_MIN = kCPU, kKernelArch_MAX = kNPU };
enum KERNEL_ARCH { kCPU, kGPU, kAPU, kNPU, kKernelArch_MIN = kCPU, kKernelArch_MAX = kNPU };
struct KernelKey {
KERNEL_ARCH arch;
TypeId data_type;
@@ -51,16 +50,17 @@ struct KernelKey {
}
};
enum SubGraphType { kNotSubGraph = 0, kCpuFP32SubGraph, kCpuFP16SubGraph, kGpuSubGraph, kNpuSubGraph, kApuSubGraph };
class LiteKernel {
public:
LiteKernel() = default;
// parameter should be deleted or freed by caller, and should be deleted or freed after LiteKernel is deleted
LiteKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &in_tensors,
const std::vector<lite::Tensor *> &out_tensors, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
LiteKernel(OpParameter *parameter, std::vector<lite::Tensor *> in_tensors, std::vector<lite::Tensor *> out_tensors,
const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive)
: op_parameter_(parameter),
in_tensors_(in_tensors),
out_tensors_(out_tensors),
in_tensors_(std::move(in_tensors)),
out_tensors_(std::move(out_tensors)),
primitive_(primitive),
context_(ctx) {
if (op_parameter_ != nullptr && ctx != nullptr) {
@@ -77,15 +77,22 @@ class LiteKernel {
}
}
virtual int Prepare();
// called while compiling graph
virtual int Prepare() { return mindspore::lite::RET_OK; }
// called before Run
virtual int PreProcess();
virtual int Run() { return mindspore::lite::RET_ERROR; }
virtual int Init() { return -1; }
virtual int Run(const KernelCallBack &before, const KernelCallBack &after);
// called after Run
virtual int PostProcess() { return FreeWorkTensor(); }
virtual int ReSize() { return -1; }
virtual int ReSize() { return mindspore::lite::RET_ERROR; }
virtual int Run() { return -1; }
virtual int Init() { return mindspore::lite::RET_ERROR; }
std::string name() { return this->name_; }
std::string name() const { return this->name_; }
virtual void train() { train_mode_ = true; }
@@ -101,20 +108,20 @@ class LiteKernel {
bool is_model_output() const { return this->is_model_output_; }
schema::PrimitiveType Type() {
schema::PrimitiveType Type() const {
return (this->op_parameter_ != nullptr) ? schema::PrimitiveType(this->op_parameter_->type_)
: schema::PrimitiveType_NONE;
}
std::string type_str() { return schema::EnumNamePrimitiveType(this->Type()); }
std::string type_str() const { return schema::EnumNamePrimitiveType(this->Type()); }
void set_in_tensors(const std::vector<lite::Tensor *> &in_tensors) { this->in_tensors_ = in_tensors; }
void set_out_tensors(const std::vector<lite::Tensor *> &out_tensors) { this->out_tensors_ = out_tensors; }
std::vector<lite::Tensor *> &in_tensors() { return this->in_tensors_; }
std::vector<lite::Tensor *> in_tensors() const { return this->in_tensors_; }
std::vector<lite::Tensor *> &out_tensors() { return this->out_tensors_; }
std::vector<lite::Tensor *> out_tensors() const { return this->out_tensors_; }
void AddInKernel(LiteKernel *kernel) {
if (!lite::IsContain(this->in_kernels_, kernel)) {
@@ -132,14 +139,16 @@ class LiteKernel {
void SetOutKernel(const std::vector<LiteKernel *> &kernel) { this->out_kernels_ = kernel; }
std::vector<LiteKernel *> &in_kernels() { return this->in_kernels_; }
std::vector<LiteKernel *> in_kernels() const { return this->in_kernels_; }
std::vector<LiteKernel *> &out_kernels() { return this->out_kernels_; }
std::vector<LiteKernel *> out_kernels() const { return this->out_kernels_; }
void InitOutTensorRefCount();
int DecOutTensorRefCount();
int FreeWorkTensor() const;
KernelKey desc() const { return desc_; }
void set_desc(const KernelKey kernel_key) { desc_ = kernel_key; }
@@ -151,10 +160,14 @@ class LiteKernel {
static void FreeWorkspace();
void *GetWorkspace() { return workspace_; }
SubGraphType subgraph_type() const { return this->subgraph_type_; }
virtual std::string ToString() const;
protected:
bool InferShapeDone() { return !(primitive_ != nullptr && !primitive_->GetInferFlag()) && true; }
bool InferShapeDone() { return !(primitive_ != nullptr && !primitive_->GetInferFlag()); }
KernelKey desc_;
KernelKey desc_{};
std::string name_;
OpParameter *op_parameter_ = nullptr;
// tensor will free in ~lite_session()
@@ -168,27 +181,7 @@ class LiteKernel {
bool is_model_output_ = false;
size_t workspace_size_ = 0;
static void *workspace_;
};
class SubGraphKernel : public LiteKernel {
public:
explicit SubGraphKernel(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
const std::vector<kernel::LiteKernel *> &in_kernels,
const std::vector<kernel::LiteKernel *> &out_kernels,
const std::vector<kernel::LiteKernel *> &nodes, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(nullptr, inputs, outputs, ctx, primitive), nodes_(nodes) {
in_kernels_ = in_kernels;
out_kernels_ = out_kernels;
}
virtual int Init() { return -1; }
virtual int InferShape() { return -1; }
virtual int ReSize() { return -1; }
virtual int Run() { return -1; }
protected:
std::vector<LiteKernel *> nodes_;
SubGraphType subgraph_type_ = kNotSubGraph;
};
typedef LiteKernel *(*KernelCreator)(const std::vector<lite::Tensor *> &inputs,
@@ -198,7 +191,7 @@ typedef LiteKernel *(*KernelCreator)(const std::vector<lite::Tensor *> &inputs,
class LiteKernelUtil {
public:
static void TopologicalSortKernels(std::vector<kernel::LiteKernel *> &kernels);
static void InitIOKernels(std::vector<kernel::LiteKernel *> &kernels);
static std::vector<kernel::LiteKernel *> SubgraphInputKernels(const std::vector<kernel::LiteKernel *> &kernels);
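To illustrate the reworked kernel lifecycle (Prepare() while compiling the graph, then PreProcess(), Run(before, after) and PostProcess() at execution time), a hypothetical kernel could now be written as below; the IdentityKernel class and its body are an illustrative sketch, not part of this commit:

#include <cstring>
#include "src/lite_kernel.h"

// Hypothetical kernel against the reworked LiteKernel interface. The scheduler
// calls Prepare() at compile time; the executor drives PreProcess() ->
// Run(before, after) -> PostProcess() per node, so Run() only does the compute.
class IdentityKernel : public mindspore::kernel::LiteKernel {
 public:
  using LiteKernel::LiteKernel;  // reuse the (parameter, tensors, ctx, primitive) constructor
  int Init() override { return mindspore::lite::RET_OK; }
  int ReSize() override { return mindspore::lite::RET_OK; }
  int Run() override {
    // PreProcess() has already run InferShape and allocated the output data
    auto in = in_tensors().front();
    auto out = out_tensors().front();
    memcpy(out->MutableData(), in->MutableData(), in->Size());
    return mindspore::lite::RET_OK;
  }
  // PostProcess() is inherited and releases the input kernels' work tensors
  // through FreeWorkTensor(), as added in this commit.
};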

@@ -295,13 +295,13 @@ int LiteSession::CompileGraph(Model *model) {
std::vector<mindspore::tensor::MSTensor *> LiteSession::GetInputs() const { return this->input_vec_; }
int LiteSession::RunGraph(const session::KernelCallBack &before, const session::KernelCallBack &after) {
int LiteSession::RunGraph(const KernelCallBack &before, const KernelCallBack &after) {
bool expected = false;
if (!is_running_.compare_exchange_strong(expected, true)) {
MS_LOG(ERROR) << "Not support multi-threading";
return RET_ERROR;
}
STATUS ret = RET_ERROR;
STATUS ret;
MS_ASSERT(this->context_);
if (before == nullptr && after == nullptr) {
ret = executor->Run(this->inputs_, this->outputs_, this->kernels_, this->context_->allocator.get());
@@ -325,39 +325,12 @@ int LiteSession::Init(Context *context) {
return RET_NULL_PTR;
}
if (context->device_list_.empty()) {
MS_LOG(ERROR) << "Device list is empty.";
is_running_.store(false);
return RET_NOT_SUPPORT;
}
auto &device_type = context->device_list_[0].device_type_;
if (device_type == DT_NPU) {
MS_LOG(ERROR) << "NPU is not supported.";
is_running_.store(false);
return RET_NOT_SUPPORT;
}
#ifndef SUPPORT_GPU
if (device_type == DT_GPU) {
MS_LOG(ERROR) << "GPU is not supported.";
is_running_.store(false);
return RET_NOT_SUPPORT;
}
#endif
this->context_ = new (std::nothrow) InnerContext();
this->context_ = new (std::nothrow) InnerContext(context);
if (this->context_ == nullptr) {
MS_LOG(ERROR) << "New Context failed";
is_running_.store(false);
return RET_MEMORY_FAILED;
}
this->context_->allocator = context->allocator;
this->context_->thread_num_ = context->thread_num_;
this->context_->device_list_.clear();
for (auto &device_ctx : context->device_list_) {
this->context_->device_list_.push_back(device_ctx);
}
auto ret = this->context_->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init Context failed";
@@ -371,12 +344,11 @@ int LiteSession::Init(Context *context) {
return ret;
}
#if SUPPORT_GPU
if (device_type == DT_GPU) {
auto gpu_device_info = this->context_->device_list_[0].device_info_.gpu_device_info_;
if (this->context_->IsGpuEnabled()) {
auto gpu_device_info = this->context_->GetGpuInfo();
auto opencl_runtime = ocl_runtime_wrap_.GetInstance();
opencl_runtime->SetFp16Enable(gpu_device_info.enable_float16_);
if (opencl_runtime->Init() != RET_OK) {
device_type = DT_CPU;
MS_LOG(WARNING) << "Init OpenCL runtime failed, change to CPU mode.";
} else {
MS_LOG(INFO) << "Init OpenCL runtime success.";
@@ -398,14 +370,13 @@ void LiteSession::BindThread(bool if_bind) {
MS_LOG(ERROR) << "Device list is empty.";
return;
}
auto &device_ctx = this->context_->device_list_[0];
if (device_ctx.device_type_ != DT_CPU) {
MS_LOG(ERROR) << "Device is not CPU.";
if (this->context_->IsCpuEnabled()) {
return;
}
if (device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ != NO_BIND) {
auto cpu_device_info = this->context_->GetCpuInfo();
if (cpu_device_info.cpu_bind_mode_ != NO_BIND) {
MS_ASSERT(this->context_->thread_pool_ != NULL);
BindThreads(this->context_->thread_pool_, if_bind, device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_);
BindThreads(this->context_->thread_pool_, if_bind, cpu_device_info.cpu_bind_mode_);
}
}

@@ -52,8 +52,7 @@ class LiteSession : public session::LiteSession {
mindspore::tensor::MSTensor *GetInputsByTensorName(const std::string &name) const override;
int RunGraph(const session::KernelCallBack &before = nullptr,
const session::KernelCallBack &after = nullptr) override;
int RunGraph(const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr) override;
std::vector<mindspore::tensor::MSTensor *> GetOutputsByNodeName(const std::string &node_name) const override;

@@ -163,11 +163,6 @@ int RunPriorBox(void *cdata, int task_id) {
}
int PriorBoxCPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail! Ret error code[" << prepare_ret << "]";
return prepare_ret;
}
int error_code = ParallelLaunch(this->context_->thread_pool_, RunPriorBox, this, thread_count_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "PriorBox run error, error_code[" << error_code << "]";

@@ -140,12 +140,6 @@ int QuantDTypeCastRun(void *cdata, int task_id) {
}
int QuantDTypeCastCPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
return prepare_ret;
}
if (in_tensors_[0]->data_type() == TypeId::kNumberTypeInt8 &&
out_tensors_[0]->data_type() == TypeId::kNumberTypeFloat32) {
int8_ptr_ = reinterpret_cast<int8_t *>(in_tensors_[0]->data_c());

@@ -91,24 +91,18 @@ int StridedSliceCPUKernel::HandleMultiInputs() {
}
int StridedSliceCPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
return ret;
}
auto input = in_tensors_.at(0);
auto output = out_tensors_.at(0);
MS_ASSERT(input);
MS_ASSERT(output);
if (in_tensors().size() == kMultiInputsSize) {
ret = HandleMultiInputs();
auto ret = HandleMultiInputs();
if (ret != RET_OK) {
return ret;
}
}
ret = DoStridedSlice(input->MutableData(), output->MutableData(),
reinterpret_cast<StridedSliceParameter *>(op_parameter_));
auto ret = DoStridedSlice(input->MutableData(), output->MutableData(),
reinterpret_cast<StridedSliceParameter *>(op_parameter_));
if (ret != RET_OK) {
MS_LOG(ERROR) << "StridedSlice error error_code[" << ret << "]";
return RET_ERROR;

@@ -103,9 +103,9 @@ int ActivationFp16Run(void *cdata, int task_id) {
}
int ActivationFp16CPUKernel::Run() {
auto ret = Prepare();
auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
MS_LOG(ERROR) << "MallocTmpBuffer failed.";
return ret;
}

@@ -185,11 +185,6 @@ static int ArithmeticsRunFp16(void *cdata, int task_id) {
}
int ArithmeticFP16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
return ret;
}
auto output_tensor = out_tensors_.at(0);
is_input0_fp32_ = in_tensors_.at(0)->data_type() == kNumberTypeFloat32;
is_input1_fp32_ = in_tensors_.at(1)->data_type() == kNumberTypeFloat32;
@@ -203,7 +198,7 @@ int ArithmeticFP16CPUKernel::Run() {
FreeTmpBuffer();
return RET_ERROR;
}
ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticsRunFp16, this, context_->thread_num_);
auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticsRunFp16, this, context_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ArithmeticsRunFp16 run error error_code[" << ret << "]";
}

@@ -20,6 +20,8 @@
#include "nnacl/fp16/arithmetic_self_fp16.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
namespace mindspore::kernel {
namespace {
@@ -81,11 +83,6 @@ void ArithmeticSelfFp16CPUKernel::FreeInputAndOutput() {
}
int ArithmeticSelfFp16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail! ret: " << ret;
return ret;
}
auto input_tensor = in_tensors_.at(0);
auto output_tensor = out_tensors_.at(0);
input_fp16_ptr_ = ConvertInputFp32toFp16(input_tensor, context_);
@@ -95,7 +92,7 @@ int ArithmeticSelfFp16CPUKernel::Run() {
MS_LOG(ERROR) << "input or output is nullptr";
return RET_ERROR;
}
ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
}

@@ -21,6 +21,8 @@
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BatchNorm;
namespace mindspore::kernel {
@@ -47,11 +49,6 @@ int BatchnormFp16CPUKernel::InitConstTensor() {
}
int BatchnormFp16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail! Ret error code: " << ret;
return ret;
}
auto input_tensor = in_tensors_.at(0);
auto output_tensor = out_tensors_.at(0);
input_ = ConvertInputFp32toFp16(input_tensor, context_);
@@ -62,7 +59,7 @@ int BatchnormFp16CPUKernel::Run() {
return RET_ERROR;
}
ret = ParallelLaunch(this->context_->thread_pool_, BatchNormRun, this, op_parameter_->thread_num_);
auto ret = ParallelLaunch(this->context_->thread_pool_, BatchNormRun, this, op_parameter_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
}
@@ -76,7 +73,7 @@ int BatchnormFp16CPUKernel::Run() {
int BatchnormFp16CPUKernel::DoExecute(int task_id) {
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
BatchNormFp16(input_, mean_, variance_, param, task_id, output_);
return mindspore::lite::RET_OK;
return RET_OK;
}
void BatchnormFp16CPUKernel::FreeInputAndOutput() {

@@ -83,11 +83,6 @@ int CastFp16CPUKernel::DoCast(int thread_id) {
}
int CastFp16CPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
return prepare_ret;
}
if (data_num_ == 0) {
return RET_OK;
}

@@ -91,12 +91,6 @@ void ConcatFp16CPUKernel::FreeTmpBuffer() {
}
int ConcatFp16CPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
return prepare_ret;
}
auto ret = MallocTmpBuffer();
if (ret != RET_OK) {
FreeTmpBuffer();

@@ -218,13 +218,7 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
}
int Convolution1x1FP16CPUKernel::Run() {
auto ret = Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Prepare failed.";
return RET_ERROR;
}
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get executor tensor failed.";
return ret;
@@ -248,10 +242,14 @@ int Convolution1x1FP16CPUKernel::Run() {
}
if (multi_thread_by_hw_) {
ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
} else {
RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunOc, this, thread_count_);
ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunOc, this, thread_count_);
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParallelLaunch failed.";
return ret;
}
}

@@ -23,7 +23,8 @@
#include "src/runtime/runtime_api.h"
namespace mindspore::kernel {
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);

Some files were not shown because too many files have changed in this diff.
