!1131 invoke sub kernel with tiling_key in dynamic shape & all scene

From: @HW_KK
Reviewed-by: 
Signed-off-by:
pull/1131/MERGE
Committed by mindspore-ci-bot via Gitee
commit fbf9ece38e
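Summary (from the diff below): this change adds the RT_MODEL_TASK_ALL_KERNEL task type. Instead of registering one stub function per compiled kernel, the whole TBE binary is registered once through rtRegisterAllKernel; at launch time the concrete sub kernel is selected by the name "<original_kernel_key>_<tiling_key>" via rtKernelLaunchWithHandle. Both the dynamic-shape single-op path and the hybrid executor are covered.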

@@ -1199,6 +1199,8 @@ Status HybridModelBuilder::IndexTaskDefs() {
op_index = task_def.kernel_ex().op_index();
} else if (task_type == RT_MODEL_TASK_HCCL) {
op_index = task_def.kernel_hccl().op_index();
} else if (task_type == RT_MODEL_TASK_ALL_KERNEL) {
op_index = task_def.kernel_with_handle().context().op_index();
} else {
GELOGD("Skip task type: %d", static_cast<int>(task_type));
continue;
@@ -1211,7 +1213,7 @@ Status HybridModelBuilder::IndexTaskDefs() {
}
auto &node = iter->second;
- if (task_type == RT_MODEL_TASK_KERNEL) {
+ if (task_type == RT_MODEL_TASK_KERNEL || task_type == RT_MODEL_TASK_ALL_KERNEL) {
ge_model->GetTBEKernelStore().LoadTBEKernelBinToOpDesc(node->GetOpDesc());
}

File diff suppressed because it is too large.

@@ -28,6 +28,32 @@
namespace ge {
namespace hybrid {
class TbeHandleHolder {
public:
TbeHandleHolder(void *bin_handle);
~TbeHandleHolder();
void SetBinHandle(void *bin_handle) { bin_handle_ = bin_handle; }
void *GetBinHandle() { return bin_handle_; }
private:
friend class TbeHandleRegistry;
void *bin_handle_ = nullptr;
};
class TbeHandleRegistry {
public:
static TbeHandleRegistry &GetInstance() {
static TbeHandleRegistry instance;
return instance;
}
bool AddHandle(std::unique_ptr<TbeHandleHolder> &&holder);
private:
std::set<std::unique_ptr<TbeHandleHolder>> registered_handles_;
};
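For context, a minimal usage sketch of the new registry (hedged: "binary" stands for an rtDevBinary_t describing the TBE kernel object, and the handle is assumed to come from rtRegisterAllKernel as wired up in aicore_op_task.cc; the registry keeps the holder alive for later cleanup):
void *bin_handle = nullptr;
// Register the whole binary once; the single handle serves every tiling variant.
GE_CHK_RT_RET(rtRegisterAllKernel(&binary, &bin_handle));
auto holder = std::unique_ptr<TbeHandleHolder>(new TbeHandleHolder(bin_handle));
if (!TbeHandleRegistry::GetInstance().AddHandle(std::move(holder))) {
  GELOGE(INTERNAL_ERROR, "Failed to add registered handle.");
}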
class AiCoreOpTask {
public:
AiCoreOpTask() = default;
@@ -67,6 +93,9 @@ class AiCoreOpTask {
Status InitWithTaskDef(const OpDesc &node, const domi::TaskDef &task_def);
Status InitTilingInfo(const OpDesc &op_desc);
Status RegisterTbeHandle(const OpDesc &op_desc);
Status RegisterKernelHandle(const OpDesc &op_desc);
Status InitWithKernelDef(const OpDesc &op_desc, const domi::TaskDef &task_def);
Status InitWithKernelDefWithHandle(const OpDesc &node, const domi::TaskDef &task_def);
std::string stub_name_;
void *stub_func_ = nullptr;
@@ -76,6 +105,11 @@ class AiCoreOpTask {
bool clear_atomic_ = true;
bool is_single_op_ = false;
std::vector<int> output_indices_to_skip_;
string original_kernel_key_;
string node_info_;
uint32_t tiling_key_ = 0;
void *handle_ = nullptr;
bool is_dynamic_ = false;
};
class AtomicAddrCleanOpTask : public AiCoreOpTask {

@@ -261,7 +261,7 @@ Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &s
if (kernel_type == ccKernelType::TE) {
GELOGD("Building TBE task");
TbeOpTask *tbe_task = nullptr;
- auto ret = BuildKernelTask(task_def.kernel(), &tbe_task);
+ auto ret = BuildKernelTask(task_def, &tbe_task);
if (ret != SUCCESS) {
return ret;
}
@@ -332,9 +332,11 @@ void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) {
}
}
- Status SingleOpModel::BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task) {
+ Status SingleOpModel::BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task) {
GE_CHECK_NOTNULL(task);
- const auto &context = kernel_def.context();
+ auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
+ const auto &context = task_type == RT_MODEL_TASK_KERNEL ? task_def.kernel().context() :
+                                                           task_def.kernel_with_handle().context();
auto iter = op_list_.find(context.op_index());
if (iter == op_list_.end()) {
GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "op desc not found. op index = %u", context.op_index());
@@ -347,7 +349,7 @@ Status SingleOpModel::BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTa
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
- auto builder = TbeTaskBuilder(model_name_, iter->second, kernel_def);
+ auto builder = TbeTaskBuilder(model_name_, iter->second, task_def);
auto ret = builder.BuildTask(*tbe_task, model_params_);
if (ret != SUCCESS) {
delete tbe_task;
@@ -418,13 +420,15 @@ Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
}
Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {
- const domi::KernelDef &kernel_def = task_def.kernel();
- const auto &context = kernel_def.context();
+ auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
+ const auto &context = task_type == RT_MODEL_TASK_KERNEL ? task_def.kernel().context() :
+                                                           task_def.kernel_with_handle().context();
auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
if (kernel_type == ccKernelType::TE) {
GELOGD("Building TBE task");
TbeOpTask *tbe_task = nullptr;
- GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task));
+ GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def, &tbe_task));
tbe_task->SetModelArgs(model_name_, model_id_);
single_op.op_task_.reset(tbe_task);
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
@@ -453,7 +457,7 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
GELOGI("[%s] Task[%d], type = %u, DebugString = %s", model_name_.c_str(), i, task_def.type(),
task_def.DebugString().c_str());
auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
- if (task_type == RT_MODEL_TASK_KERNEL) {
+ if (task_type == RT_MODEL_TASK_KERNEL || task_type == RT_MODEL_TASK_ALL_KERNEL) {
if (single_op.op_task_ != nullptr) {
GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, "Do not support dynamic op with multiple tasks.");
return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;

@@ -67,7 +67,7 @@ class SingleOpModel {
Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
- Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
+ Status BuildKernelTask(const domi::TaskDef &task_def, TbeOpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
bool dynamic_flag, bool& depend_compute_flag, uint64_t kernel_id);
Status BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTask **task, uint64_t kernel_id);

@@ -93,6 +93,14 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size
op_desc_ = op_desc;
}
void TbeOpTask::SetKernelWithHandleArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
const OpDescPtr &op_desc,
const domi::KernelDefWithHandle &kernel_def_with_handle) {
SetKernelArgs(std::move(args), arg_size, block_dim, op_desc);
original_kernel_key_ = kernel_def_with_handle.original_kernel_key();
node_info_ = kernel_def_with_handle.node_info();
}
void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }
void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
@@ -165,6 +173,10 @@ const std::string &TbeOpTask::GetStubName() const { return stub_name_; }
uint32_t TbeOpTask::GetTaskType() const { return kTaskTypeAicore; }
void TbeOpTask::SetHandle(void *handle) {
this->handle_ = handle;
}
Status TbeOpTask::LaunchKernel(rtStream_t stream) {
GELOGD("To invoke rtKernelLaunch. task = %s, block_dim = %u", this->stub_name_.c_str(), block_dim_);
auto *sm_desc = reinterpret_cast<rtSmDesc_t *>(sm_desc_);
@@ -204,8 +216,9 @@ Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const ve
}
block_dim_ = run_info.block_dim;
tiling_data_ = run_info.tiling_data.str();
GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_,
tiling_data_.size());
tiling_key_ = run_info.tiling_key;
GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu, tiling_key = %u", block_dim_,
tiling_data_.size(), tiling_key_);
GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces");
return SUCCESS;
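For reference, a hedged sketch of an op tiling callback that produces the tiling_key consumed above (OpRunInfo field names are taken from this diff and the unit test below; the values are illustrative):
OpTilingFunc op_tiling_func = [](const TeOpParas &, const OpCompileInfo &, OpRunInfo &run_info) -> bool {
  run_info.block_dim = 32;  // copied into block_dim_
  run_info.tiling_key = 1;  // copied into tiling_key_; selects sub kernel "<original_kernel_key>_1"
  return true;
};
OpTilingRegistryInterf("Add", op_tiling_func);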
@@ -329,8 +342,17 @@ Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
}
GELOGD("[%s] Start to invoke rtKernelLaunch", node_->GetName().c_str());
- GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, nullptr, stream));
- GELOGD("[%s] Done invoking rtKernelLaunch successfully", node_->GetName().c_str());
+ if (handle_ == nullptr) {
+   GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), arg_size_, nullptr, stream));
+   GELOGD("[%s] Done invoking rtKernelLaunch successfully", node_->GetName().c_str());
+ } else {
+   std::string dev_func = original_kernel_key_ + "_" + std::to_string(tiling_key_);
+   std::string kernel_info = node_info_ + "/" + std::to_string(tiling_key_);
+   GE_CHK_RT_RET(rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, args_.get(), arg_size_, nullptr,
+                                          stream, kernel_info.c_str()));
+   GELOGD("[%s] Done invoking rtKernelLaunchWithHandle successfully", node_->GetName().c_str());
+ }
return SUCCESS;
}
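The sub kernel naming convention used above, with hypothetical values for illustration (the real original_kernel_key_ and node_info_ are copied from KernelDefWithHandle in SetKernelWithHandleArgs):
std::string original_kernel_key = "te_add_kernel0";  // hypothetical key from KernelDefWithHandle
std::string node_info = "model/Add";                 // hypothetical node info
uint32_t tiling_key = 2;                             // chosen by UpdateRunInfo for the current shapes
std::string dev_func = original_kernel_key + "_" + std::to_string(tiling_key);  // "te_add_kernel0_2"
std::string kernel_info = node_info + "/" + std::to_string(tiling_key);         // "model/Add/2"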

@@ -78,6 +78,8 @@ class TbeOpTask : public OpTask {
void SetSmDesc(void *sm_desc);
void SetStubFunc(const std::string &name, const void *stub_func);
void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc);
void SetKernelWithHandleArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim,
const OpDescPtr &op_desc, const domi::KernelDefWithHandle& kernel_def_with_handle);
Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc) override;
@@ -87,6 +89,7 @@ class TbeOpTask : public OpTask {
const std::string &GetStubName() const;
void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size);
uint32_t GetTaskType() const override;
void SetHandle(void *handle);
private:
friend class SingleOpModel;
@@ -107,6 +110,11 @@ class TbeOpTask : public OpTask {
std::string tiling_data_;
std::vector<void *> workspaces_;
NodePtr node_;
uint32_t tiling_key_ = 0;
void* handle_ = nullptr;
std::string original_kernel_key_;
std::string node_info_;
};
class AiCpuBaseTask : public OpTask {

File diff suppressed because it is too large.

@@ -42,6 +42,19 @@ class KernelHolder {
std::shared_ptr<ge::OpKernelBin> kernel_bin_;
};
class HandleHolder {
public:
HandleHolder(void *bin_handle);
~HandleHolder();
void SetBinHandle(void *bin_handle) { bin_handle_ = bin_handle; }
void *GetBinHandle() { return bin_handle_; }
private:
friend class HandleRegistry;
void *bin_handle_ = nullptr;
};
class KernelBinRegistry {
public:
static KernelBinRegistry &GetInstance() {
@@ -61,9 +74,22 @@ class KernelBinRegistry {
std::mutex mutex_;
};
class HandleRegistry {
public:
static HandleRegistry &GetInstance() {
static HandleRegistry instance;
return instance;
}
bool AddHandle(std::unique_ptr<HandleHolder> &&holder);
private:
std::set<std::unique_ptr<HandleHolder>> registered_handles_;
};
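HandleRegistry mirrors TbeHandleRegistry from the hybrid executor above; a minimal usage sketch (assuming bin_handle was produced by rtRegisterAllKernel):
auto holder = std::unique_ptr<HandleHolder>(new HandleHolder(bin_handle));
bool added = HandleRegistry::GetInstance().AddHandle(std::move(holder));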
class TbeTaskBuilder {
public:
- TbeTaskBuilder(const std::string &model_name, const NodePtr &node, const domi::KernelDef &kernel_def);
+ TbeTaskBuilder(const std::string &model_name, const NodePtr &node, const domi::TaskDef &task_def);
~TbeTaskBuilder() = default;
Status BuildTask(TbeOpTask &task, const SingleOpModelParam &param);
@@ -71,9 +97,11 @@ class TbeTaskBuilder {
private:
Status InitTilingInfo(TbeOpTask &task);
Status SetKernelArgs(TbeOpTask &task, const SingleOpModelParam &param, const OpDescPtr &op_desc);
Status SetKernelWithHandleArgs(TbeOpTask &task, const SingleOpModelParam &param, const OpDescPtr &op_desc);
Status GetSmDesc(void **sm_desc, const SingleOpModelParam &param) const;
Status RegisterKernel(TbeOpTask &task, const SingleOpModelParam &param);
Status RegisterKernelWithHandle(TbeOpTask &task, const SingleOpModelParam &param);
Status DoRegisterKernel(const OpKernelBin &kernel_bin, const char *bin_file_key, void **bin_handle,
const SingleOpModelParam &param);
Status DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, const SingleOpModelParam &param) const;
@@ -83,8 +111,11 @@
const NodePtr node_;
const OpDescPtr op_desc_;
const domi::TaskDef &task_def_;
const domi::KernelDef &kernel_def_;
const domi::KernelDefWithHandle &kernel_def_with_handle_;
const std::string stub_name_;
void *handle_ = nullptr;
};
} // namespace ge

@@ -131,8 +131,15 @@ rtError_t rtFunctionRegister(void *bin_handle, const void *stub_func, const char
rtError_t rtDevBinaryRegister(const rtDevBinary_t *bin, void **handle) { return RT_ERROR_NONE; }
rtError_t rtRegisterAllKernel(const rtDevBinary_t *bin, void **handle) { return RT_ERROR_NONE; }
rtError_t rtKernelConfigTransArg(const void *ptr, uint64_t size, uint32_t flag, void **arg) { return RT_ERROR_NONE; }
rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, uint32_t argsSize,
rtSmDesc_t *smDesc, rtStream_t stream, const void *kernelInfo) {
return RT_ERROR_NONE;
}
rtError_t rtKernelLaunch(const void *stub_func, uint32_t block_dim, void *args, uint32_t args_size, rtSmDesc_t *sm_desc,
rtStream_t stream) {
return RT_ERROR_NONE;

@@ -763,12 +763,17 @@ set(SINGLE_OP_TEST_FILES
#"single_op/single_op_model_unittest.cc"
"single_op/single_op_manager_unittest.cc"
"single_op/stream_resource_unittest.cc"
"single_op/single_op_task_unittest.cc"
)
set(PROFILING_MNG_TEST_FILES
"profiling/ge_profiling_manager_unittest.cc"
)
set(HYBRID_TEST_FILES
"hybrid/ge_hybrid_unittest.cc"
)
set(OTHERS_TEST_FILES
"plugin_manager/ge_util_unittest.cc"
)
@@ -1064,6 +1069,7 @@ add_executable(ut_libge_distinct_load_utest
${DISTINCT_GRAPH_LOAD_SRC_FILES}
${SINGLE_OP_TEST_FILES}
${PROFILING_MNG_TEST_FILES}
${HYBRID_TEST_FILES}
)
target_compile_options(ut_libge_distinct_load_utest PRIVATE

@@ -0,0 +1,101 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <vector>
#include "runtime/rt.h"
#define protected public
#define private public
#include "hybrid/model/hybrid_model_builder.h"
#include "hybrid/model/hybrid_model.h"
#include "model/ge_model.h"
#include "model/ge_root_model.h"
#include "hybrid/node_executor/aicore/aicore_op_task.h"
#include "framework/common/taskdown_common.h"
#include "framework/common/debug/log.h"
#include "graph/ge_context.h"
#include "hybrid/executor/hybrid_execution_context.h"
#include "hybrid/node_executor/aicore/aicore_task_builder.h"
#include "graph/load/model_manager/tbe_handle_store.h"
#include "graph/types.h"
#undef private
#undef protected
using namespace std;
using namespace testing;
using namespace ge;
class UtestGeHybrid : public testing::Test {
protected:
void SetUp() {}
void TearDown() {}
};
static ge::OpDescPtr CreateOpDesc(string name = "", string type = "") {
auto op_desc = std::make_shared<ge::OpDesc>(name, type);
op_desc->SetStreamId(0);
op_desc->SetId(0);
op_desc->SetWorkspace({});
op_desc->SetWorkspaceBytes({});
op_desc->SetInputOffset({});
op_desc->SetOutputOffset({});
ge::AttrUtils::SetStr(op_desc, ge::TVM_ATTR_NAME_MAGIC, "RT_DEV_BINARY_MAGIC_ELF_AIVEC");
bool support_dynamic = true;
ge::AttrUtils::SetBool(op_desc, "support_dynamicshape", support_dynamic);
return op_desc;
}
TEST_F(UtestGeHybrid, aicore_op_task_init_success) {
// build aicore task
auto aicore_task = std::unique_ptr<hybrid::AiCoreOpTask>(new(std::nothrow)hybrid::AiCoreOpTask());
domi::TaskDef task_def;
task_def.set_type(RT_MODEL_TASK_ALL_KERNEL);
domi::KernelDefWithHandle *kernel_with_handle = task_def.mutable_kernel_with_handle();
kernel_with_handle->set_original_kernel_key("");
kernel_with_handle->set_node_info("");
kernel_with_handle->set_block_dim(32);
kernel_with_handle->set_args_size(64);
string args(64, '1');
kernel_with_handle->set_args(args.data(), 64);
domi::KernelContext *context = kernel_with_handle->mutable_context();
context->set_op_index(1);
context->set_kernel_type(2); // ccKernelType::TE
uint16_t args_offset[9] = {0};
context->set_args_offset(args_offset, 9 * sizeof(uint16_t));
OpDescPtr op_desc = CreateOpDesc("Add", "Add");
std::vector<char> kernelBin;
TBEKernelPtr tbe_kernel = std::make_shared<ge::OpKernelBin>("name/Add", std::move(kernelBin));
op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, tbe_kernel);
std::string kernel_name("kernel/Add");
AttrUtils::SetStr(op_desc, op_desc->GetName() + "_kernelname", kernel_name);
ASSERT_EQ(aicore_task->InitWithTaskDef(*op_desc.get(), task_def), SUCCESS);
rtStream_t stream = nullptr;
rtStreamCreate(&stream, 0);
ASSERT_EQ(aicore_task->LaunchKernel(stream), SUCCESS);
char *handle = "";
aicore_task->handle_ = handle;
aicore_task->tiling_key_ = 1;
ASSERT_EQ(aicore_task->LaunchKernel(stream), SUCCESS);
}

@@ -0,0 +1,117 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <vector>
#include "graph/load/model_manager/model_utils.h"
#include "graph/utils/graph_utils.h"
#include "runtime/rt.h"
#define protected public
#define private public
#include "single_op/single_op_model.h"
#include "single_op/task/tbe_task_builder.h"
#include "single_op/task/op_task.h"
#include "single_op/task/tbe_task_builder.h"
#include "external/register/op_tiling_registry.h"
#undef private
#undef protected
using namespace std;
using namespace testing;
using namespace ge;
using namespace optiling;
class UtestSingleOpTask : public testing::Test {
protected:
void SetUp() {}
void TearDown() {}
};
TEST_F(UtestSingleOpTask, test_build_kernel_task) {
string model_data_str = "123456789";
SingleOpModel model("model", model_data_str.c_str(), model_data_str.size());
model.input_offset_list_.push_back(0);
model.input_sizes_.push_back(16);
model.output_offset_list_.push_back(0);
model.output_sizes_.push_back(16);
auto graph = make_shared<ComputeGraph>("graph");
auto op_desc = make_shared<OpDesc>("Add", "Add");
std::vector<char> kernelBin;
TBEKernelPtr tbe_kernel = std::make_shared<ge::OpKernelBin>("name/Add", std::move(kernelBin));
op_desc->SetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, tbe_kernel);
std::string kernel_name("kernel/Add");
AttrUtils::SetStr(op_desc, op_desc->GetName() + "_kernelname", kernel_name);
vector<int64_t> shape{16, 16};
GeShape ge_shape(shape);
GeTensorDesc desc(ge_shape);
op_desc->AddInputDesc(desc);
op_desc->AddOutputDesc(desc);
auto node = graph->AddNode(op_desc);
std::mutex stream_mu_;
rtStream_t stream_ = nullptr;
StreamResource stream_resource(0);
SingleOp single_op(&stream_resource, &stream_mu_, stream_);
domi::TaskDef task_def;
task_def.set_type(RT_MODEL_TASK_ALL_KERNEL);
domi::KernelDefWithHandle *kernel_with_handle = task_def.mutable_kernel_with_handle();
kernel_with_handle->set_original_kernel_key("");
kernel_with_handle->set_node_info("");
kernel_with_handle->set_block_dim(32);
kernel_with_handle->set_args_size(64);
string args(64, '1');
kernel_with_handle->set_args(args.data(), 64);
domi::KernelContext *context = kernel_with_handle->mutable_context();
context->set_op_index(1);
context->set_kernel_type(2); // ccKernelType::TE
uint16_t args_offset[9] = {0};
context->set_args_offset(args_offset, 9 * sizeof(uint16_t));
model.op_list_[1] = node;
TbeOpTask task_tmp;
TbeOpTask *task = &task_tmp;
ASSERT_EQ(model.BuildKernelTask(task_def, &task), SUCCESS);
vector<GeTensorDesc> input_desc;
vector<DataBuffer> input_buffers;
vector<GeTensorDesc> output_desc;
vector<DataBuffer> output_buffers;
task->node_ = node;
OpTilingFunc op_tiling_func = [](const TeOpParas &, const OpCompileInfo &, OpRunInfo &) -> bool {return true;};
OpTilingRegistryInterf("Add", op_tiling_func);
ge::AttrUtils::SetStr(op_desc, "compile_info_key", "op_compile_info_key");
ge::AttrUtils::SetStr(op_desc, "compile_info_json", "op_compile_info_json");
char buffer[64] = {0};  // at least max_tiling_size_ (64) bytes
task->tiling_buffer_ = buffer;
task->max_tiling_size_ = 64;
task->tiling_data_ = "tiling_data";
task->arg_size_ = 64;
task->args_.reset(new uint8_t[task->arg_size_]());  // heap-allocated so the unique_ptr's delete[] is safe
ASSERT_EQ(task->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_), SUCCESS);
char handle_tmp = '0';
char *handle = &handle_tmp;
task->SetHandle(handle);
ASSERT_EQ(task->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_), SUCCESS);
}

@@ -191,6 +191,14 @@ typedef void (*rtCallback_t)(void *fnData);
#define RT_FUSION_KERNEL_DUMPFLAG (0x04)
#define RT_KERNEL_CUSTOM_AICPU (0x08)
/**
* @ingroup rt_kernel
* @brief kernel mode
*/
#define RT_DEFAULT_KERNEL_MODE (0x00)
#define RT_NORMAL_KERNEL_MODE (0x01)
#define RT_ALL_KERNEL_MODE (0x02)
/**
* @ingroup rt_kernel
* @brief kernel L1 Fusion Dump bit flags
@@ -207,6 +215,16 @@ typedef void (*rtCallback_t)(void *fnData);
*/
RTS_API rtError_t rtDevBinaryRegister(const rtDevBinary_t *bin, void **handle);
/**
* @ingroup rt_kernel
* @brief register device binary with all kernels
* @param [in] bin device binary description
* @param [out] handle device binary handle
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtRegisterAllKernel(const rtDevBinary_t *bin, void **handle);
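A hedged usage sketch (the rtDevBinary_t field values are assumptions for illustration; bin_data/bin_size stand for a loaded TBE kernel binary):
rtDevBinary_t binary = {};
binary.magic = RT_DEV_BINARY_MAGIC_ELF;  // assumption: plain AI Core ELF
binary.version = 0;
binary.data = bin_data;
binary.length = bin_size;
void *handle = nullptr;
rtError_t rt_ret = rtRegisterAllKernel(&binary, &handle);  // one handle for the whole binary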
/**
* @ingroup rt_kernel
* @brief register fast memory device binary
@@ -314,6 +332,23 @@ RTS_API rtError_t rtKernelConfigDump(uint32_t kind, uint32_t dumpSizePerBlock, u
RTS_API rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *args, uint32_t argsSize,
rtSmDesc_t *smDesc, rtStream_t stream);
/**
* @ingroup rt_kernel
* @brief launch kernel with handle to device
* @param [in] handle program handle (returned by rtRegisterAllKernel)
* @param [in] devFunc device function description
* @param [in] blockDim block dimensions
* @param [in] args arguments address for kernel function
* @param [in] argsSize arguments size
* @param [in] smDesc shared memory description
* @param [in] stream associated stream
* @param [in] kernelInfo kernel info
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, uint32_t argsSize,
rtSmDesc_t *smDesc, rtStream_t stream, const void *kernelInfo);
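Continuing the sketch above, the matching launch call (args, args_size and stream are assumed to be prepared elsewhere; the name composition mirrors TbeOpTask::LaunchKernel):
std::string dev_func = std::string("te_add_kernel0") + "_" + std::to_string(2);  // "<original_kernel_key>_<tiling_key>"
std::string kernel_info = std::string("model/Add") + "/" + std::to_string(2);    // "<node_info>/<tiling_key>"
rt_ret = rtKernelLaunchWithHandle(handle, dev_func.c_str(), 32, args, args_size,
                                  nullptr,  // no shared memory descriptor
                                  stream, kernel_info.c_str());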
/**
* @ingroup rt_kernel
* @brief launch kernel to device

@@ -50,6 +50,7 @@ typedef enum tagModelTaskType {
RT_MODEL_TASK_STREAM_LABEL_SWITCH_BY_INDEX,
RT_MODEL_TASK_STREAM_LABEL_GOTO,
RT_MODEL_TASK_MODEL_EXIT,
RT_MODEL_TASK_ALL_KERNEL,
} rtModelTaskType_t;
typedef enum tagModelStreamType {
@@ -127,6 +128,17 @@ typedef struct tagKernelTaskInfo {
uint16_t *argsOffset;
} rtKernelTaskInfo_t;
typedef struct tagAllKernelTaskInfo {
uint16_t blockDim;
uint16_t argsCount;
uint16_t argsSize;
uint16_t reserved;
const void *dev_func;
void *handle;
uint8_t *smDesc;
uint8_t *args;
uint16_t *argsOffset;
} rtAllKernelTaskInfo_t;
typedef struct tagKernelTaskInfoEx {
uint32_t flags;
uint32_t argsSize;
@@ -251,6 +263,7 @@ typedef struct tagTaskInfo {
union {
rtKernelTaskInfoEx_t kernelTaskEx;
rtKernelTaskInfo_t kernelTask;
rtAllKernelTaskInfo_t allkernelTask;
rtEventTaskInfo_t eventTask;
rtStreamSwitchTaskInfo_t streamSwitchTask;
rtStreamActiveTaskInfo_t streamActiveTask;
