!718 Synchronize latest Ascend software suite 24 Dec 2020

From: @nicholas_yhr
Reviewed-by: @liujunzhu, @youui
Signed-off-by: @youui
pull/718/MERGE
Committed by mindspore-ci-bot via Gitee
commit c762dd5dcc

@ -607,7 +607,7 @@ set(INFER_SRC_LIST
if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES)
############ libge_runner.so ############
add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS} $<TARGET_OBJECTS:msprofiler_fwk>)
add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS})
target_compile_definitions(ge_runner PRIVATE
PROTOBUF_INLINE_NOT_IN_HEADERS=0
@ -648,11 +648,14 @@ target_include_directories(ge_runner PRIVATE
${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain
)
target_link_libraries(ge_runner
target_link_libraries(ge_runner PRIVATE
$<BUILD_INTERFACE:intf_pub>
ge_memory
adump_server
static_mmpa
-Wl,--whole-archive
msprofiler_fwk
-Wl,--no-whole-archive
-Wl,--no-as-needed
graph
ge_common
@ -712,7 +715,7 @@ target_include_directories(ge_compiler PRIVATE
${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain
)
target_link_libraries(ge_compiler
target_link_libraries(ge_compiler PRIVATE
$<BUILD_INTERFACE:intf_pub>
ge_memory
static_mmpa
@ -766,7 +769,14 @@ target_link_options(opensrc_ascendcl PRIVATE
-Wl,--allow-multiple-definition
-Wl,-z,muldefs
-Wl,-Bsymbolic
-Wl,--exclude-libs,ALL
-Wl,--exclude-libs,libascend_protobuf.a
-Wl,--exclude-libs,libge_executor.a
-Wl,--exclude-libs,libge_common.a
-Wl,--exclude-libs,libgraph.a
-Wl,--exclude-libs,libmmpa.a
-Wl,--exclude-libs,libregister.a
-Wl,--exclude-libs,liberror_manager.a
-Wl,--exclude-libs,libadump_server.a
)
target_link_libraries(opensrc_ascendcl PRIVATE
-Wl,--whole-archive

@ -94,6 +94,9 @@ Status DumpOp::DumpOutput(aicpu::dump::Task &task) {
for (auto dim : output_descs.at(i).GetShape().GetDims()) {
output.mutable_shape()->add_dim(dim);
}
for (auto dim : output_descs.at(i).GetOriginShape().GetDims()) {
output.mutable_origin_shape()->add_dim(dim);
}
int64_t output_size = 0;
if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) {
GELOGE(PARAM_INVALID, "Get output size failed");
@ -118,6 +121,9 @@ Status DumpOp::DumpInput(aicpu::dump::Task &task) {
for (auto dim : input_descs.at(i).GetShape().GetDims()) {
input.mutable_shape()->add_dim(dim);
}
for (auto dim : input_descs.at(i).GetOriginShape().GetDims()) {
input.mutable_origin_shape()->add_dim(dim);
}
int64_t input_size = 0;
if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) {
GELOGE(PARAM_INVALID, "Get input size failed");
@ -214,8 +220,15 @@ Status DumpOp::LaunchDumpOp() {
SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info);
GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(),
dump_path.c_str());
uint32_t task_id = 0;
uint32_t stream_id = 0;
rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("call rtGetTaskIdAndStreamID failed, ret = 0x%X", rt_ret);
}
aicpu::dump::Task task;
task.set_task_id(task_id);
task.set_stream_id(stream_id);
task.mutable_op()->set_op_name(op_desc_->GetName());
task.mutable_op()->set_op_type(op_desc_->GetType());
if (dump_properties_.GetDumpMode() == kDumpOutput) {

@ -181,12 +181,19 @@ void TBEPluginManager::GetCustomOpPath(std::string &customop_path) {
void TBEPluginManager::LoadCustomOpLib() {
LoadPluginSo(options_);
std::string fmk_type = std::to_string(domi::TENSORFLOW);
auto it = options_.find(ge::FRAMEWORK_TYPE);
if (it != options_.end()) {
fmk_type = it->second;
}
std::vector<OpRegistrationData> registration_datas = domi::OpRegistry::Instance()->registrationDatas;
GELOGI("The size of registration_datas is: %zu", registration_datas.size());
for (OpRegistrationData reg_data : registration_datas) {
GELOGD("Begin to register optype: %s, imply_type: %s", reg_data.GetOmOptype().c_str(),
TypeUtils::ImplyTypeToSerialString(reg_data.GetImplyType()).c_str());
domi::OpRegistry::Instance()->Register(reg_data);
if (std::to_string(reg_data.GetFrameworkType()) == fmk_type) {
GELOGD("Begin to register optype: %s, imply_type: %s", reg_data.GetOmOptype().c_str(),
TypeUtils::ImplyTypeToSerialString(reg_data.GetImplyType()).c_str());
(void)domi::OpRegistry::Instance()->Register(reg_data);
}
}
}

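For reference, a minimal standalone sketch (not part of the diff) of the pattern the LoadCustomOpLib hunk above introduces: read the framework type from the options map with a TensorFlow default, then register only the op registration data whose framework type matches. RegData, the option key, and the registry call are illustrative stand-ins, not the GE API.

#include <map>
#include <string>
#include <vector>

struct RegData { int framework_type; std::string op_type; };  // stand-in for domi::OpRegistrationData

void RegisterMatching(const std::map<std::string, std::string> &options,
                      const std::vector<RegData> &registration_datas) {
  int fmk_type = 3;  // illustrative default, standing in for domi::TENSORFLOW
  auto it = options.find("ge.frameworkType");  // key name is an assumption
  if (it != options.end()) {
    fmk_type = std::stoi(it->second);
  }
  for (const RegData &reg_data : registration_datas) {
    if (reg_data.framework_type != fmk_type) {
      continue;  // skip registrations that belong to another framework
    }
    // ... hand reg_data to the op registry here ...
  }
}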
@ -112,7 +112,6 @@ ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) {
if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) {
GELOGW("Msprof ctrl callback is exist, just ignore it.");
} else {
GELOGI("GE register Msprof ctrl callback.");
ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func);
}
return ge::SUCCESS;
@ -124,7 +123,6 @@ ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) {
return ge::PARAM_INVALID;
}
// Pass MsprofSetDeviceCallback to runtime
GELOGI("GE pass setdevice callback to runtime.");
ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast<rtDeviceStateCallback>(func));
if (rt_ret != ge::SUCCESS) {
GELOGE(rt_ret, "Pass MsprofSetDeviceCallback to runtime failed!");
@ -158,7 +156,7 @@ ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t le
if (type != kProfCommandhandleFinalize) {
GE_CHECK_NOTNULL(data);
}
ProfCommandHandleData *prof_config_param = (ProfCommandHandleData *)data;
ProfCommandHandleData *prof_config_param = reinterpret_cast<ProfCommandHandleData *>(data);
auto iter = kProfCommandTypeMap.find(type);
if (iter == kProfCommandTypeMap.end()) {
GELOGW("The prof command type is invalid.");
@ -183,7 +181,8 @@ ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t le
if (type != kProfCommandhandleFinalize) {
command.module_index = prof_config_param->profSwitch;
}
GELOGI("GE commandhandle execute, Command Type: %d, data type config: 0x%llx", type, command.module_index);
GELOGI("GE commandhandle execute, Command Type: %s, data type config: 0x%llx", iter->second.c_str(),
command.module_index);
if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) {
GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str());
}

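For reference, a small sketch (illustrative only, not part of the diff) of the logging improvement in ProfCommandHandle above: instead of printing the raw command type with %d, the type is looked up in a map and logged by name. The enum values and names below are invented for the example.

#include <map>
#include <string>

enum class ProfCommand { kInit, kStart, kStop, kFinalize };  // illustrative values

std::string CommandName(ProfCommand type) {
  static const std::map<ProfCommand, std::string> kNames = {
      {ProfCommand::kInit, "init"}, {ProfCommand::kStart, "start"},
      {ProfCommand::kStop, "stop"}, {ProfCommand::kFinalize, "finalize"}};
  auto iter = kNames.find(type);
  return (iter == kNames.end()) ? "unknown" : iter->second;  // mirrors the kProfCommandTypeMap lookup
}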
@ -38,10 +38,8 @@ const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe";
} // namespace
namespace ge {
ProfilingManager::ProfilingManager() : is_load_profiling_(false),
is_execute_profiling_(false),
is_training_trace_(false),
subscribe_count_(0) {
ProfilingManager::ProfilingManager()
: is_load_profiling_(false), is_execute_profiling_(false), is_training_trace_(false), subscribe_count_(0) {
prof_cb_.msprofCtrlCallback = nullptr;
prof_cb_.msprofReporterCallback = nullptr;
}
@ -102,8 +100,8 @@ ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOpt
return INTERNAL_ERROR;
}
is_execute_profiling_ = true;
GELOGI("The profiling in options is %s, %s. origin option: %s", options.profiling_mode.c_str(),
prof_conf.options, options.profiling_options.c_str());
GELOGI("The profiling in options is %s, %s. origin option: %s", options.profiling_mode.c_str(), prof_conf.options,
options.profiling_options.c_str());
} else {
(void)mmGetEnv("PROFILING_MODE", env_profiling_mode, MMPA_MAX_PATH);
(void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX);
@ -143,6 +141,9 @@ ge::Status ProfilingManager::ParseOptions(const std::string &options) {
}
try {
Json prof_options = Json::parse(options);
if (options.find(kTrainingTrace) == std::string::npos) {
return ge::SUCCESS;
}
const std::string training_trace = prof_options[kTrainingTrace];
if (training_trace.empty()) {
GELOGI("Training trace will not take effect.");
@ -802,32 +803,46 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpP
if (!fp_point_.empty() && !bp_point_.empty()) {
fp_point = fp_point_;
bp_point = bp_point_;
GELOGI("Bp Fp have been initialized in env or options. bp_point: %s, fp_point: %s", bp_point.c_str(), fp_point.c_str());
GELOGI("Bp Fp have been initialized in env or options. bp_point: %s, fp_point: %s", bp_point.c_str(),
fp_point.c_str());
return;
}
// ProfApi mode and training trace is set
try {
char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 };
// Parse options first
char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 };
bool is_profiling_valid = false;
std::string profiling_options;
if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_OPTIONS, profiling_options) == SUCCESS &&
!profiling_options.empty()) {
is_profiling_valid = true;
} else {
INT32 ret = mmGetEnv("PROFILING_OPTIONS", env_profiling_options, MSPROF_OPTIONS_DEF_LEN_MAX);
if (ret != EN_OK) {
GELOGI("PROFILING_OPTIONS env is not exist.");
return;
}
GELOGI("Parse env PROFILING_OPTIONS:%s.", env_profiling_options);
Json prof_options = Json::parse(env_profiling_options);
profiling_options = env_profiling_options;
is_profiling_valid = true;
}
if (is_profiling_valid) {
try {
Json prof_options = Json::parse(profiling_options);
fp_point_ = prof_options[kFpPoint];
bp_point_ = prof_options[kBpPoint];
fp_point_ = prof_options[kFpPoint];
bp_point_ = prof_options[kBpPoint];
fp_point = fp_point_;
bp_point = bp_point_;
if (!fp_point_.empty() && !bp_point_.empty()) {
GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str());
fp_point = fp_point_;
bp_point = bp_point_;
if (!fp_point_.empty() && !bp_point_.empty()) {
GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str());
}
} catch (...) {
GELOGW("Json prof options is invalid.");
return;
}
} catch (...) {
GELOGE(FAILED, "Json prof options is invalid.");
return;
}
return;
}

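For reference, a minimal sketch (not part of the diff) of the restructured GetFpBpPoint flow above: take the profiling options from the GE option if present, otherwise fall back to the PROFILING_OPTIONS environment variable, and only then parse the string once as JSON inside a try/catch. The JSON keys "fp_point"/"bp_point" are assumptions standing in for kFpPoint/kBpPoint.

#include <cstdlib>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using Json = nlohmann::json;

void ParseFpBp(const std::string &option_value) {
  std::string profiling_options = option_value;
  if (profiling_options.empty()) {
    const char *env = std::getenv("PROFILING_OPTIONS");  // fallback source
    if (env == nullptr) {
      return;  // nothing to parse
    }
    profiling_options = env;
  }
  try {
    Json prof_options = Json::parse(profiling_options);
    std::string fp_point = prof_options["fp_point"];  // key name assumed
    std::string bp_point = prof_options["bp_point"];  // key name assumed
    if (!fp_point.empty() && !bp_point.empty()) {
      std::cout << "fp_point=" << fp_point << ", bp_point=" << bp_point << "\n";
    }
  } catch (...) {
    std::cout << "profiling options are not valid JSON\n";  // warn instead of failing
  }
}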
@ -36,21 +36,21 @@ using Json = nlohmann::json;
namespace {
const std::string GE_PROFILING_MODULE = "Framework";
// DataTypeConfig MASK
#define PROF_ACL_API_MASK 0x0001
#define PROF_TASK_TIME_MASK 0x0002
#define PROF_AICORE_METRICS_MASK 0x0004
#define PROF_AICPU_TRACE_MASK 0x0008
#define PROF_MODEL_EXECUTE_MASK 0x0010
#define PROF_RUNTIME_API_MASK 0x0020
#define PROF_RUNTIME_TRACE_MASK 0x0040
#define PROF_SCHEDULE_TIMELINE_MASK 0x0080
#define PROF_SCHEDULE_TRACE_MASK 0x0100
#define PROF_AIVECTORCORE_METRICS_MASK 0x0200
#define PROF_SUBTASK_TIME_MASK 0x0400
#define PROF_TRAINING_TRACE_MASK 0x0800
#define PROF_HCCL_TRACE_MASK 0x1000
#define PROF_DATA_PROCESS_MASK 0x2000
#define PROF_MODEL_LOAD_MASK 0x8000000000000000
const uint64_t PROF_ACL_API_MASK = 0x0001;
const uint64_t PROF_TASK_TIME_MASK = 0x0002;
const uint64_t PROF_AICORE_METRICS_MASK = 0x0004;
const uint64_t PROF_AICPU_TRACE_MASK = 0x0008;
const uint64_t PROF_MODEL_EXECUTE_MASK = 0x0010;
const uint64_t PROF_RUNTIME_API_MASK = 0x0020;
const uint64_t PROF_RUNTIME_TRACE_MASK = 0x0040;
const uint64_t PROF_SCHEDULE_TIMELINE_MASK = 0x0080;
const uint64_t PROF_SCHEDULE_TRACE_MASK = 0x0100;
const uint64_t PROF_AIVECTORCORE_METRICS_MASK = 0x0200;
const uint64_t PROF_SUBTASK_TIME_MASK = 0x0400;
const uint64_t PROF_TRAINING_TRACE_MASK = 0x0800;
const uint64_t PROF_HCCL_TRACE_MASK = 0x1000;
const uint64_t PROF_DATA_PROCESS_MASK = 0x2000;
const uint64_t PROF_MODEL_LOAD_MASK = 0x8000000000000000;
} // namespace
namespace ge {
@ -80,7 +80,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager {
bool ProfilingTrainingTraceOn() const { return is_training_trace_; }
bool ProfilingModelLoadOn() const { return is_load_profiling_; }
bool ProfilingModelExecuteOn() const;
bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // is_execute_profiling_ only used by ge option and env
// is_execute_profiling_ only used by ge option and env
bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; }
void ReportProfilingData(uint32_t model_id, const std::vector<TaskDescInfo> &task_desc_info,
const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info);
void ProfilingTaskDescInfo(uint32_t model_id, const std::vector<TaskDescInfo> &task_desc_info,

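For reference, a short sketch (not part of the diff) of how the typed mask constants above are used: the profiling DataTypeConfig word is tested bit by bit, which is why the masks are declared as scoped, typed uint64_t constants rather than unscoped #define macros.

#include <cstdint>
#include <cstdio>

const uint64_t PROF_TASK_TIME_MASK = 0x0002;       // values copied from the diff above
const uint64_t PROF_TRAINING_TRACE_MASK = 0x0800;

void PrintEnabled(uint64_t module_index) {
  if (module_index & PROF_TASK_TIME_MASK) {
    std::printf("task time profiling enabled\n");
  }
  if (module_index & PROF_TRAINING_TRACE_MASK) {
    std::printf("training trace enabled\n");
  }
}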
@ -15,6 +15,7 @@ message Output {
int32 original_output_data_type = 7;
int32 original_output_format = 8;
uint64 size = 9;
Shape origin_shape = 10;
}
message Input {
@ -23,6 +24,7 @@ message Input {
Shape shape = 3;
uint64 address = 4;
uint64 size = 5;
Shape origin_shape = 6;
}
enum BufferType {

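For reference, a minimal sketch (not part of the diff) of how the new origin_shape fields are filled from the C++ side, matching the dump_op.cc and data_dumper.cc hunks in this PR; the generated header name below is an assumption.

#include <cstdint>
#include <vector>
#include "dump_task.pb.h"  // assumed name of the header generated from the proto above

void FillOutputShapes(aicpu::dump::Output &output, const std::vector<int64_t> &dims,
                      const std::vector<int64_t> &origin_dims) {
  for (int64_t dim : dims) {
    output.mutable_shape()->add_dim(dim);          // runtime shape
  }
  for (int64_t dim : origin_dims) {
    output.mutable_origin_shape()->add_dim(dim);   // original (pre-format-transform) shape
  }
}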
@ -209,19 +209,6 @@ bool IsDynmaicDimsSizeMatchModel(const vector<uint64_t> cur_dynamic_dims,
namespace ge {
bool GeExecutor::isInit_ = false;
class ModelListenerAdapter : public ModelListener {
public:
domi::Status OnComputeDone(uint32_t model_id, uint32_t dataIndex, uint32_t resultCode,
std::vector<ge::OutputTensorInfo> &outputs) {
if (listener == nullptr) {
GELOGE(ge::FAILED, "listener is null.");
return FAILED;
}
return listener->OnComputeDone(model_id, dataIndex, resultCode, outputs);
}
std::shared_ptr<ge::ModelListener> listener;
};
static void InitOpsProtoManger() {
string opsproto_path;
@ -573,60 +560,6 @@ Status GeExecutor::SetDynamicAippData(uint32_t model_id, void *dynamic_input_add
return SUCCESS;
}
// Load model
Status GeExecutor::LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key,
int32_t priority, std::shared_ptr<ge::ModelListener> listener) {
GELOGI("load model offline begin.");
if (!isInit_) {
GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
return ACL_ERROR_GE_EXEC_NOT_INIT;
}
string filePath = RealPath(path.c_str());
if (filePath.empty()) {
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID,
"File path is invalid. please check your text file '%s'.", path.c_str());
return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID;
}
std::shared_ptr<ModelListenerAdapter> listener_adapter = MakeShared<ModelListenerAdapter>();
if (listener_adapter == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelListenerAdapter make shared failed!");
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
listener_adapter->listener = listener;
Status ret = GraphLoader::LoadModelFromFile(path, key, priority, listener_adapter, model_id);
if (ret != SUCCESS) {
GELOGE(ret, "[GeExecutor] LoadModelFromFile failed");
return ACL_ERROR_GE_LOAD_MODEL;
}
return SUCCESS;
}
Status GeExecutor::LoadModel(uint32_t &model_id, const ModelData &model_data,
std::shared_ptr<ge::ModelListener> listener) {
GELOGI("Load model begin.");
if (!isInit_) {
GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
return ACL_ERROR_GE_EXEC_NOT_INIT;
}
std::shared_ptr<ModelListenerAdapter> listener_adapter = MakeShared<ModelListenerAdapter>();
if (listener_adapter == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelListenerAdapter make shared failed!");
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
listener_adapter->listener = listener;
Status ret = GraphLoader::LoadModel(model_data, listener_adapter, model_id);
if (ret != SUCCESS) {
GELOGE(ret, "[GeExecutor] LoadModel failed.");
return ACL_ERROR_GE_LOAD_MODEL;
}
return ret;
}
Status GeExecutor::UnloadModel(uint32_t model_id) {
GELOGD("unload model %u begin.", model_id);
if (!isInit_) {
@ -659,21 +592,6 @@ Status GeExecutor::UnloadModel(uint32_t model_id) {
return SUCCESS;
}
Status GeExecutor::RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data) {
GELOGI("run model begin.");
if (!isInit_) {
GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
return ACL_ERROR_GE_EXEC_NOT_INIT;
}
InputData inputs;
GetDomiInputData(input_data, inputs);
OutputData outputs;
GetDomiOutputData(output_data, outputs);
return GraphExecutor::DataInput(inputs, outputs);
}
// Get input and output descriptor
Status GeExecutor::GetModelDescInfo(uint32_t model_id, std::vector<ge::TensorDesc> &input_desc,
std::vector<ge::TensorDesc> &output_desc, bool new_model_desc) {

@ -15,6 +15,7 @@ message Output {
int32 original_output_data_type = 7;
int32 original_output_format = 8;
uint64 size = 9;
Shape origin_shape = 10;
}
message Input {
@ -23,6 +24,7 @@ message Input {
Shape shape = 3;
uint64 address = 4;
uint64 size = 5;
Shape origin_shape = 6;
}
enum BufferType {

@ -39,7 +39,7 @@ namespace {
} \
ge_tensor = MakeShared<GeTensor>(out_desc); \
GE_CHECK_NOTNULL(ge_tensor); \
GELOGI("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE));\
GELOGD("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE));\
if (ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)) != GRAPH_SUCCESS) { \
GELOGE(MEMALLOC_FAILED, "Set data for output %zu of node %s failed.", i, op_desc->GetName().c_str()); \
return MEMALLOC_FAILED; \
@ -50,8 +50,7 @@ namespace {
} else { \
ge_tensor = outputs[i]; \
GE_CHECK_NOTNULL(ge_tensor); \
GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \
reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \
GELOGD("node:%s existed output %zu", op_desc->GetName().c_str(), i); \
} \
auto tensor = TensorAdapter::AsTensor(*ge_tensor); \
auto tensor_name = op_desc->GetOutputNameByIndex(i); \

@ -563,6 +563,19 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
GE_CHECK_NOTNULL(ge_root_model);
GE_CHECK_NOTNULL(ge_root_model->GetRootGraph());
ModelHelper model_helper;
string model_name = "";
Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(),
model_name);
if (name_ret != SUCCESS) {
ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"});
GELOGE(FAILED, "Get model_name failed. Param --output is invalid.");
return PARAM_INVALID;
}
map<string, GeModelPtr> name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel();
GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()];
GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model cannot be null");
ge_model->SetName(model_name);
ret = impl_->SaveRootModel(file_name_prefix, ge_root_model, model);
if (ret != SUCCESS) {
GELOGE(ret, "Save model failed");

@ -99,7 +99,7 @@ Status GraphMemoryAssigner::AssignMemory() {
MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner->GetMemOffset());
memory_offset_.emplace(RT_MEMORY_HBM, memory_offset);
if (mem_assigner->GetP2PMemOffset() > 0) {
if (mem_assigner->GetP2PMemOffset() >= 0) {
MemoryOffset p2p_memory_offset(RT_MEMORY_P2P_DDR, mem_assigner->GetP2PMemOffset());
memory_offset_.emplace(RT_MEMORY_P2P_DDR, p2p_memory_offset);
}

@ -48,26 +48,41 @@ void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Grap
}
}
bool StreamGraphOptimizer::IsSameStreamId(const ComputeGraphPtr &comp_graph) {
bool StreamGraphOptimizer::IsSameStreamIdOrBatchLabel(const ComputeGraphPtr &comp_graph) {
if (comp_graph == nullptr) {
return false;
}
std::set<int64_t> stream_set;
std::set<std::string> label_set;
for (const ge::NodePtr &cur_node : comp_graph->GetDirectNode()) {
GE_IF_BOOL_EXEC(cur_node->GetOpDesc() == nullptr, continue);
int64_t stream_id = cur_node->GetOpDesc()->GetStreamId();
if (stream_id == kInvalidStream) {
continue;
}
GELOGD("Node %s in subgraph %s stream id is: %ld, node num: %zu", cur_node->GetName().c_str(),
comp_graph->GetName().c_str(), stream_id, comp_graph->GetDirectNodesSize());
stream_set.insert(stream_id);
std::string batch_label;
if (AttrUtils::GetStr(cur_node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) {
label_set.insert(batch_label);
} else {
GELOGD("Node %s[%s] has no batch label, subgraph %s, stream id: %ld", cur_node->GetName().c_str(),
cur_node->GetType().c_str(), comp_graph->GetName().c_str(), stream_id);
continue;
}
GELOGD("Node %s in subgraph %s stream id: %ld, node num: %zu", cur_node->GetName().c_str(),
comp_graph->GetName().c_str(), stream_id, comp_graph->GetDirectNodesSize());
}
if (stream_set.size() > 1) {
GELOGI("Nodes of graph: %s have different stream id, node num: %zu, different stream num: %zu.",
if (stream_set.size() > 1 || label_set.size() > 1) {
GELOGI("Nodes of graph: %s have different stream id or batch_label, node num: %zu, different stream num: %zu.",
comp_graph->GetName().c_str(), comp_graph->GetDirectNodesSize(), stream_set.size());
return false;
}
if (!label_set.empty()) {
(void)AttrUtils::SetStr(comp_graph, ATTR_NAME_BATCH_LABEL, *label_set.begin());
}
return true;
}
@ -99,8 +114,8 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com
continue;
}
if (!IsSameStreamId(subgraph)) {
GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str());
if (!IsSameStreamIdOrBatchLabel(subgraph)) {
GELOGI("There are more than one stream or batch_label in subgraph %s", subgraph->GetName().c_str());
continue;
}
OpDescPtr op_desc = nodes.at(0)->GetOpDesc();
@ -112,9 +127,11 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com
return FAILED;
}
run_context.stream = run_context.graphStreamList[stream_id];
GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.",
subgraph->GetName().c_str(), engine_name.c_str(), stream_id,
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(run_context.stream)));
std::string batch_label;
(void)AttrUtils::GetStr(subgraph, ATTR_NAME_BATCH_LABEL, batch_label);
GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu, "
"batch_label: %s", subgraph->GetName().c_str(), engine_name.c_str(), stream_id,
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(run_context.stream)), batch_label.c_str());
for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) {
GE_CHECK_NOTNULL(*iter);
Status ret = (*iter)->OptimizeStreamGraph(*subgraph, run_context);

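For reference, a simplified sketch (not part of the diff) of the uniformity check IsSameStreamIdOrBatchLabel above performs: collect the stream ids and, where present, the batch labels of all nodes into sets, and treat the subgraph as optimizable only when each set holds at most one value. Node here is a stand-in for the GE node/op-desc types.

#include <set>
#include <string>
#include <vector>

struct Node { long stream_id; std::string batch_label; };  // simplified stand-in

bool IsSameStreamIdOrBatchLabel(const std::vector<Node> &nodes) {
  std::set<long> stream_set;
  std::set<std::string> label_set;
  for (const Node &node : nodes) {
    stream_set.insert(node.stream_id);
    if (!node.batch_label.empty()) {
      label_set.insert(node.batch_label);  // unlabelled nodes are skipped, as in the hunk
    }
  }
  return stream_set.size() <= 1 && label_set.size() <= 1;
}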
@ -41,7 +41,7 @@ class StreamGraphOptimizer {
private:
void RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map);
bool IsSameStreamId(const ComputeGraphPtr &comp_graph);
bool IsSameStreamIdOrBatchLabel(const ComputeGraphPtr &comp_graph);
};
} // namespace ge
#endif // GE_GRAPH_BUILD_OPTIMIZE_STREAM_GRAPH_H_

@ -567,7 +567,7 @@ Status TaskGenerator::MarkFirstAndLastOps(const vector<OpDescPtr> &ops, bool is_
continue;
}
string op_type = op_desc->GetType();
if (!is_single_stream && (!op_desc->GetSubgraphInstanceNames().empty() || separator_types.count(op_type) != 0)) {
if (!op_desc->GetSubgraphInstanceNames().empty() || separator_types.count(op_type) != 0) {
continuous_op_lists.emplace_back(vector<OpDescPtr>());
} else {
continuous_op_lists.back().emplace_back(op_desc);

@ -122,14 +122,14 @@ Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string
ModelData &model_data) {
Status ret;
if (!CheckInputPathValid(path)) {
GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str());
return GE_EXEC_MODEL_PATH_INVALID;
GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str());
return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID;
}
GELOGI("Load model begin, model path is: %s", path.c_str());
if (!key_path.empty() && !CheckInputPathValid(key_path)) {
GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str());
return GE_EXEC_MODEL_KEY_PATH_INVALID;
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str());
return ACL_ERROR_GE_PARAM_INVALID;
}
ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data);
@ -144,63 +144,6 @@ Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string
return SUCCESS;
}
Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority,
const std::shared_ptr<ModelListener> &listener, uint32_t &model_id) {
Status ret;
ModelData model_data;
ret = LoadDataFromFile(path, key_path, priority, model_data);
if (ret != SUCCESS) {
GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret);
if (model_data.model_data != nullptr) {
delete[] static_cast<char *>(model_data.model_data);
model_data.model_data = nullptr;
}
return ret;
}
ret = LoadModel(model_data, listener, model_id);
if (ret != SUCCESS) {
GELOGE(ret, "LoadModel: Load failed. ret = %u", ret);
if (model_data.model_data != nullptr) {
delete[] static_cast<char *>(model_data.model_data);
model_data.model_data = nullptr;
}
}
if (model_data.model_data != nullptr) {
delete[] static_cast<char *>(model_data.model_data);
model_data.model_data = nullptr;
}
return ret;
}
Status GraphLoader::LoadModel(const ModelData &model_data, const std::shared_ptr<ModelListener> &listener,
uint32_t &model_id) {
GELOGI("Load model begin, model_id:%u.", model_id);
// For GeOp, Open Device 0 here.
GE_CHK_RT_RET(rtSetDevice(0));
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->LoadModelOffline(model_id, model_data, listener);
if (ret != SUCCESS) {
GE_CHK_RT(rtDeviceReset(0));
GELOGE(ret, "LoadModel: Load failed.");
return ret;
}
ret = model_manager->Start(model_id);
if (ret != SUCCESS) {
if (model_manager->Unload(model_id) != SUCCESS) {
GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start.");
}
GELOGE(ret, "LoadModel: Start failed.");
return ret;
}
GELOGI("LoadModel: Start model success, model_id:%u.", model_id);
return SUCCESS;
}
Status GraphLoader::CommandHandle(const Command &command) {
try {
auto model_manager = ModelManager::GetInstance();
@ -225,16 +168,16 @@ Status GraphLoader::CommandHandle(const Command &command) {
}
Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model_data, void *dev_ptr,
size_t memsize, void *weight_ptr, size_t weightsize) {
size_t mem_size, void *weight_ptr, size_t weight_size) {
GELOGI("Load model begin, model_id:%u.", model_id);
// For ACL, Open Device from App.
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->LoadModelOffline(
model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize);
model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size);
if (ret != SUCCESS) {
GELOGE(ret, "Load model failed, model_id:%u.", model_id);
return ret;
GELOGE(ACL_ERROR_GE_LOAD_MODEL, "Load model failed, model_id:%u.", model_id);
return ACL_ERROR_GE_LOAD_MODEL;
}
GELOGI("Load model success, model_id:%u.", model_id);
return SUCCESS;
@ -259,8 +202,8 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da
GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids);
if (ret != SUCCESS) {
GELOGE(ret, "Load model with queue failed, model_id:%u.", model_id);
return ret;
GELOGE(ACL_ERROR_GE_LOAD_MODEL, "Load model with queue failed, model_id:%u.", model_id);
return ACL_ERROR_GE_LOAD_MODEL;
}
GELOGI("Load model with queue success, model_id:%u.", model_id);

@ -44,12 +44,6 @@ class GraphLoader {
static Status GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size);
static Status LoadModel(const ModelData &model_data, const std::shared_ptr<ModelListener> &listener,
uint32_t &model_id);
static Status LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority,
const std::shared_ptr<ModelListener> &listener, uint32_t &model_id);
static Status CommandHandle(const Command &command);
static Status GetMemoryInfo(int64_t &free);

@ -319,6 +319,9 @@ Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vis
for (auto dim : tensor_descs.at(index).GetShape().GetDims()) {
output.mutable_shape()->add_dim(dim);
}
for (auto dim : tensor_descs.at(index).GetOriginShape().GetDims()) {
output.mutable_origin_shape()->add_dim(dim);
}
int64_t output_size = 0;
if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), output_size) != SUCCESS) {
GELOGE(PARAM_INVALID, "Get output size failed");
@ -476,6 +479,9 @@ Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor
for (auto dim : tensor_descs.at(index).GetShape().GetDims()) {
input.mutable_shape()->add_dim(dim);
}
for (auto dim : tensor_descs.at(index).GetOriginShape().GetDims()) {
input.mutable_origin_shape()->add_dim(dim);
}
int64_t input_size = 0;
if (AttrUtils::GetInt(tensor_descs.at(index), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) {
GELOGI("Get aipp input size according to attr is %ld", input_size);

File diff suppressed because it is too large.

@ -76,6 +76,20 @@ struct timeInfo {
int64_t dumpEndTime;
};
struct TaskMemInfo {
int64_t input_size{0};
int64_t output_size{0};
int64_t weight_size{0};
int64_t workspace_size{0};
int64_t total_size{0};
};
struct ProfileInfo {
FusionOpInfo fusion_info;
TaskMemInfo memory_info;
uint32_t task_count{0};
};
enum ExecuteMode {
INITIALIZATION,
SYNCHRONIZATION,
@ -226,8 +240,6 @@ class DavinciModel {
const vector<OpDescPtr> &GetDataList() const { return data_op_list_; }
// get Op
const map<uint32_t, OpDescPtr> &GetOpList() const { return op_list_; }
OpDescPtr GetOpByIndex(uint32_t index) const {
if (op_list_.find(index) == op_list_.end()) {
return nullptr;
@ -436,10 +448,6 @@ class DavinciModel {
int64_t GetLoadEndTime() { return load_end_time_; }
Status SinkModelProfile();
Status SinkTimeProfile(const InputData &current_data);
Status ReportProfilingData();
void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) {
@ -476,6 +484,14 @@ class DavinciModel {
void SetTotalIOAddrs(vector<void *> &io_addrs) {
total_io_addrs_.insert(total_io_addrs_.end(), io_addrs.begin(), io_addrs.end());
}
void SetHybridArgsSize(uint32_t args_size) { total_hybrid_args_size_ += args_size; }
uint32_t GetHybridArgsSize() {
return total_hybrid_args_size_;
}
void *GetCurrentHybridArgsAddr(uint32_t offset) {
void *cur_args = static_cast<char *>(hybrid_addrs_) + offset;
return cur_args;
}
void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size);
int64_t GetFixedAddrsSize(string tensor_name);
void *GetCurrentFixedAddr(int64_t offset) const {
@ -494,7 +510,7 @@ class DavinciModel {
Status MallocKnownArgs();
Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
Status UpdateKnownZeroCopyAddr();
Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs);
void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }
Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info);
@ -529,15 +545,6 @@ class DavinciModel {
struct timeInfo time_info_;
int32_t dataInputTid;
///
/// @ingroup ge
/// @brief Save Batch label Info.
/// @param [in] const OpDescPtr &op_desc
/// @param [in] uintptr_t addr: address value in args block.
/// @return None.
///
void SetBatchLabelAddr(const OpDescPtr &op_desc, uintptr_t addr);
///
/// @ingroup ge
/// @brief Copy Check input size and model op size.
@ -649,14 +656,6 @@ class DavinciModel {
///
void AdjustDataOpList(const map<uint32_t, OpDescPtr> &data_by_index);
///
/// @ingroup ge
/// @brief input zero copy node Initialize.
/// @param [in] NodePtr: Data Op.
/// @return Status
///
Status InitInputZeroCopyNodes(const NodePtr &node);
///
/// @ingroup ge
/// @brief NetOutput Op Initialize.
@ -665,30 +664,6 @@ class DavinciModel {
///
Status InitNetOutput(const NodePtr &node);
///
/// @ingroup ge
/// @brief output zero copy node Initialize.
/// @param [in] NodePtr: Data Op.
/// @return Status
///
Status InitOutputZeroCopyNodes(const NodePtr &node);
///
/// @ingroup ge
/// @brief input zero copy node Initialize for Case.
/// @param [in] NodePtr: Data Op.
/// @return Status
///
Status InitInputBatchLabel(const NodePtr &node);
///
/// @ingroup ge
/// @brief output zero copy node Initialize for Case.
/// @param [in] NodePtr: netoutput Op.
/// @return Status
///
Status InitOutputBatchLabel(const NodePtr &node);
///
/// @ingroup ge
/// @brief Constant Op Init.
@ -837,6 +812,11 @@ class DavinciModel {
void SetDataDumperArgs(const ComputeGraphPtr &compute_graph);
Status InitModelProfile();
Status SinkModelProfile();
Status SinkTimeProfile(const InputData &current_data);
Status GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data,
std::vector<ge::OutputTensorInfo> &outputs);
@ -914,11 +894,6 @@ class DavinciModel {
std::vector<ZeroCopyTask> zero_copy_tasks_; // Task used Data or NetOutput addr.
std::set<const void *> copy_only_addrs_; // Address need copy to original place.
// {op_id, batch_label}
std::map<int64_t, std::string> zero_copy_op_id_batch_label_;
// {batch_label, addrs}
std::map<std::string, std::set<uintptr_t>> zero_copy_batch_label_addrs_;
std::vector<TaskInfoPtr> task_list_;
// rt_moodel_handle
rtModel_t rt_model_handle_;
@ -977,6 +952,8 @@ class DavinciModel {
void *args_ = nullptr;
void *args_host_ = nullptr;
void *fixed_addrs_ = nullptr;
void *hybrid_addrs_ = nullptr;
uint32_t total_hybrid_args_size_ = 0;
int64_t total_fixed_addr_size_ = 0;
std::map<const void *, void *> knonw_input_data_info_;
std::map<const void *, void *> knonw_output_data_info_;
@ -1016,6 +993,9 @@ class DavinciModel {
// key: input_index: input is merge node; value: each gear info and each output shape
std::map<size_t, std::map<vector<int64_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
std::vector<std::vector<int64_t>> all_gears_info_;
std::multimap<uint32_t, uint32_t> op_id_map_;
std::vector<ProfileInfo> profile_list_;
};
} // namespace ge
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_

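For reference, a minimal sketch (not part of the diff) of the accounting pattern behind the new hybrid-args members above (total_hybrid_args_size_, GetHybridArgsSize, SetHybridArgsSize, GetCurrentHybridArgsAddr): each task first reserves an offset while the running total grows, one buffer of the final total is allocated, and each task later resolves its address as base plus its offset. The class and names below are illustrative.

#include <cstdint>

class ArgsPool {
 public:
  uint32_t Reserve(uint32_t args_size) {   // pass 1: called once per task
    uint32_t offset = total_size_;
    total_size_ += args_size;
    return offset;                         // task keeps this as its args offset
  }
  uint32_t TotalSize() const { return total_size_; }
  void SetBase(void *base) { base_ = base; }   // buffer of TotalSize() bytes
  void *AddrAt(uint32_t offset) const {        // pass 2: resolve per-task address
    return static_cast<char *>(base_) + offset;
  }

 private:
  void *base_ = nullptr;
  uint32_t total_size_ = 0;
};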
@ -89,6 +89,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u
if (op_type == aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY) {
std::vector<uint64_t> v_aicpu_kernel;
std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto iter = model_aicpu_kernel_.find(model_key);
if (iter != model_aicpu_kernel_.end()) {
GELOGD("kernel destroy session_id %lu, model_id %u.", session_id, model_id);
@ -176,7 +177,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u
}
void ModelManager::DestroyAicpuSession(uint64_t session_id) {
std::lock_guard<std::mutex> lock(sess_ids_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto it = sess_ids_.find(session_id);
if (it == sess_ids_.end()) {
GELOGI("The session: %lu not created.", session_id);
@ -205,7 +206,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
}
ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) {
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto hybrid_davinci_model = hybrid_model_map_.find(model_id);
if (hybrid_davinci_model != hybrid_model_map_.end()) {
uint64_t session_id = hybrid_davinci_model->second->GetSessionId();
@ -215,8 +216,8 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) {
auto it = model_map_.find(model_id);
if (it == model_map_.end()) {
GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id);
return GE_EXEC_MODEL_ID_INVALID;
GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id);
return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID;
}
uint64_t session_id = it->second->GetSessionId();
DestroyAicpuSession(session_id);
@ -225,7 +226,7 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) {
ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
GELOGD("destroy aicpu kernel in session_id %lu, model_id %u.", session_id, model_id);
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) {
Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id);
@ -238,7 +239,7 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_
}
ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) {
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
std::vector<uint64_t> v_aicpu_kernel;
std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) {
@ -250,7 +251,7 @@ ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_i
}
ModelManager::~ModelManager() {
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
model_map_.clear();
model_aicpu_kernel_.clear();
cust_aicpu_so_.clear();
@ -358,18 +359,18 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::Ge
void ModelManager::InsertModel(uint32_t id, std::shared_ptr<DavinciModel> &davinci_model) {
GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", id);
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
model_map_[id] = davinci_model;
}
void ModelManager::InsertModel(uint32_t id, shared_ptr<hybrid::HybridDavinciModel> &hybrid_model) {
GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", id);
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
hybrid_model_map_[id] = hybrid_model;
}
Status ModelManager::DeleteModel(uint32_t id) {
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto it = model_map_.find(id);
auto hybrid_model_it = hybrid_model_map_.find(id);
@ -384,22 +385,22 @@ Status ModelManager::DeleteModel(uint32_t id) {
} else if (hybrid_model_it != hybrid_model_map_.end()) {
(void)hybrid_model_map_.erase(hybrid_model_it);
} else {
GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id);
return GE_EXEC_MODEL_ID_INVALID;
GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id);
return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID;
}
return SUCCESS;
}
std::shared_ptr<DavinciModel> ModelManager::GetModel(uint32_t id) {
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto it = model_map_.find(id);
return (it == model_map_.end()) ? nullptr : it->second;
}
std::shared_ptr<hybrid::HybridDavinciModel> ModelManager::GetHybridModel(uint32_t id) {
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto it = hybrid_model_map_.find(id);
return (it == hybrid_model_map_.end()) ? nullptr : it->second;
@ -902,7 +903,7 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector<Inpu
}
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, GE_EXEC_MODEL_ID_INVALID,
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"GetInputOutputDescInfo Failed, Invalid model id %u!", model_id);
davinci_model->SetModelDescVersion(new_model_desc);
@ -970,8 +971,9 @@ Status ModelManager::GetUserDesignateShapeOrder(const uint32_t model_id,
}
Status ModelManager::GetCurShape(const uint32_t model_id, std::vector<int64_t> &batch_info, int32_t &dynamic_type) {
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHECK_NOTNULL(davinci_model);
auto davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"GetCurShape Failed, Invalid Model ID %u!", model_id);
davinci_model->GetCurShape(batch_info, dynamic_type);
return SUCCESS;
}
@ -984,7 +986,8 @@ Status ModelManager::GetModelAttr(uint32_t model_id, std::vector<string> &dynami
}
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHECK_NOTNULL(davinci_model);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"GetModelAttr Failed, Invalid Model ID %u!", model_id);
davinci_model->GetModelAttr(dynamic_output_shape_info);
return SUCCESS;
}
@ -994,9 +997,8 @@ Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id,
std::vector<uint32_t> &inputFormats,
std::vector<uint32_t> &outputFormats) {
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetInputOutputDescInfo Failed, Invalid model id %u!",
model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"GetInputOutputDescInfo Failed, Invalid model id %u!", model_id);
return davinci_model->GetInputOutputDescInfoForZeroCopy(input_desc, output_desc, inputFormats, outputFormats);
}
@ -1011,18 +1013,14 @@ Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id,
Status ModelManager::GetAIPPInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) {
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"GetAIPPInfo failed, invalid model_id is %u.",
model_id);
"GetAIPPInfo failed, invalid model_id is %u.", model_id);
return davinci_model->GetAIPPInfo(index, aipp_info);
}
Status ModelManager::GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index) {
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"GetAIPPInfo failed, invalid model_id is %u.",
model_id);
"GetAIPPInfo failed, invalid model_id is %u.", model_id);
return davinci_model->GetAippType(index, type, aipp_index);
}
@ -1055,7 +1053,15 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
mmTimespec timespec = mmGetTickCount();
ModelHelper model_helper;
Status ret = model_helper.LoadModel(model);
Status ret = model_helper.LoadRootModel(model);
if (model_helper.GetModelType()) {
bool is_shape_unknown = false;
GE_CHK_STATUS_RET(model_helper.GetGeRootModel()->CheckIsUnknownShape(is_shape_unknown),
"CheckIsUnknownShape failed, model id:%u", model_id);
if (is_shape_unknown || GetContext().GetHostExecFlag()) {
return DoLoadHybridModelOnline(model_id, model_helper.GetGeRootModel(), listener);
}
}
if (ret != SUCCESS) {
GELOGE(ret, "load model failed.");
return ret;
@ -1069,8 +1075,8 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed");
return ACL_ERROR_GE_MEMORY_ALLOCATION;
} catch (...) {
GELOGE(INTERNAL_ERROR, "Make shared failed since other exception raise");
return INTERNAL_ERROR;
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed since other exception raise");
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
ret = davinci_model->Assign(ge_model);
if (ret != SUCCESS) {
@ -1082,7 +1088,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
int32_t device_id = 0;
rtError_t rt_ret = rtGetDevice(&device_id);
if (rt_ret != RT_ERROR_NONE || device_id < 0) {
GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id);
GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
davinci_model->SetDeviceId(device_id);
@ -1214,7 +1220,7 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy
std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
"Invalid model id %u, check weather model has been loaded or not.", model_id);
"Invalid model id %u, check whether model has been loaded or not.", model_id);
if (davinci_model->NeedDestroyAicpuKernel()) {
GELOGI("Start to destroy specified aicpu kernel.");
@ -1237,7 +1243,7 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy
}
Status ModelManager::CreateAicpuSession(uint64_t session_id) {
std::lock_guard<std::mutex> lock(sess_ids_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
auto it = sess_ids_.find(session_id);
// never been created by any model
if (it == sess_ids_.end()) {
@ -1456,8 +1462,7 @@ void ModelManager::GenModelId(uint32_t *id) {
if (id == nullptr) {
return;
}
std::lock_guard<std::mutex> lock(map_mutex_);
std::lock_guard<std::recursive_mutex> lock(map_mutex_);
*id = ++max_model_id_;
}

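For reference, a simplified sketch (not part of the diff) of why map_mutex_ becomes a std::recursive_mutex in the hunks above: DestroyAicpuSessionForInfer locks the mutex and then calls DestroyAicpuSession, which locks the same mutex again on the same thread; with a plain std::mutex that second lock would deadlock. Types and members below are reduced stand-ins.

#include <cstdint>
#include <map>
#include <mutex>

class Manager {
 public:
  void DestroyAicpuSession(uint64_t session_id) {
    std::lock_guard<std::recursive_mutex> lock(map_mutex_);
    sessions_.erase(session_id);
  }
  void DestroyAicpuSessionForInfer(uint64_t session_id) {
    std::lock_guard<std::recursive_mutex> lock(map_mutex_);
    DestroyAicpuSession(session_id);  // re-enters the same mutex safely
  }

 private:
  std::recursive_mutex map_mutex_;
  std::map<uint64_t, int> sessions_;
};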
@ -353,8 +353,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
std::map<uint32_t, std::shared_ptr<hybrid::HybridDavinciModel>> hybrid_model_map_;
std::map<std::string, std::vector<uint64_t>> model_aicpu_kernel_;
uint32_t max_model_id_;
std::mutex map_mutex_;
std::mutex sess_ids_mutex_;
std::recursive_mutex map_mutex_;
std::mutex session_id_create_mutex_;
static ::std::mutex exeception_infos_mutex_;
uint64_t session_id_bias_;

@ -90,20 +90,18 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
fusion_op_info_.op_index = context.op_index(); fusion_op_info_.original_op_names = original_op_names;
fusion_op_info_.op_name = op_desc_->GetName());
string session_graph_model_id;
davinci_model_->GetUniqueId(op_desc_, session_graph_model_id);
// get bin_file_key
const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id);
// new aicpu kernel(rtCpuKernelLaunch) no need to check function
if (kernel_type_ == ccKernelType::CCE_AI_CORE) {
rtError_t rt_ret;
rt_ret = rtGetFunctionByName(const_cast<char *>(kernel_def.stub_func().c_str()), &stub_func_);
rtError_t rt_ret = rtGetFunctionByName(const_cast<char *>(kernel_def.stub_func().c_str()), &stub_func_);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. stub_func: %s",
kernel_def.stub_func().c_str());
return RT_ERROR_TO_GE_STATUS(rt_ret););
} else if (kernel_type_ == ccKernelType::TE) {
rtError_t rt_ret;
rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_);
// get bin_file_key
string session_graph_model_id;
davinci_model_->GetUniqueId(op_desc_, session_graph_model_id);
const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id);
rtError_t rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE,
GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. bin_file_key: %s", bin_file_key);
return RT_ERROR_TO_GE_STATUS(rt_ret););
@ -372,7 +370,11 @@ Status KernelTaskInfo::SuperKernelDistribute() {
Status KernelTaskInfo::Distribute() {
GELOGD("KernelTaskInfo Distribute Start.");
if (davinci_model_->IsKnownNode()) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
if (kernel_type_ == ccKernelType::TE) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_);
}
GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_);
}
rtError_t rt_ret = RT_ERROR_NONE;
@ -428,36 +430,31 @@ Status KernelTaskInfo::UpdateArgs() {
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_);
vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_);
vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_);
vector<void *> io_addrs;
if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
if (kernel_type_ == ccKernelType::TE) {
vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_);
io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
} else {
string peer_input_name;
if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) {
uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name);
if (output_index > output_data_addrs.size()) {
GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.",
output_data_addrs.size(), output_index);
return FAILED;
}
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
for (size_t i = 0; i < output_data_addrs.size(); ++i) {
if (i == output_index) {
void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_);
io_addrs.emplace_back(fixed_addr);
continue;
}
io_addrs.emplace_back(output_data_addrs[i]);
}
io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
davinci_model_->SetTotalIOAddrs(io_addrs);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
davinci_model_->UpdateKnownZeroCopyAddr(io_addrs);
uintptr_t io_addr = reinterpret_cast<uintptr_t>(args_addr.get()) + sizeof(aicpu::AicpuParamHead);
auto addrs_size = sizeof(uint64_t) * io_addrs.size();
errno_t sec_ret = memcpy_s(reinterpret_cast<void *>(io_addr), addrs_size, io_addrs.data(), addrs_size);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
// copy args to device
rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
}
davinci_model_->SetTotalIOAddrs(io_addrs);
GELOGI("KernelTaskInfo::UpdateArgs success.");
return SUCCESS;
}
@ -533,33 +530,18 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
}
Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
domi::KernelDef kernel_def = task_def.kernel();
uint32_t args_size = kernel_def.args_size();
args_offset_ = davinci_model->GetTotalArgsSize();
davinci_model->SetTotalArgsSize(args_size);
GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_);
// get opcontext stored in model
const domi::KernelDef &kernel_def = task_def.kernel();
const domi::KernelContext &context = kernel_def.context();
// get opdesc
op_desc_ = davinci_model->GetOpByIndex(context.op_index());
GE_CHECK_NOTNULL(op_desc_);
// alloc fixed addr
string peer_input_name;
if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) {
uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name);
if (output_index > op_desc_->GetOutputsSize()) {
GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", op_desc_->GetOutputsSize(),
output_index);
return FAILED;
}
fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name);
auto tensor_desc = op_desc_->GetOutputDesc(output_index);
int64_t tensor_size = 0;
GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size));
davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size);
GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr offset %ld", tensor_size,
fixed_addr_offset_);
kernel_type_ = static_cast<ccKernelType>(context.kernel_type());
if (kernel_type_ == ccKernelType::TE) {
uint32_t args_size = kernel_def.args_size();
args_offset_ = davinci_model->GetTotalArgsSize();
davinci_model->SetTotalArgsSize(args_size);
GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
hybrid_args_offset_ = davinci_model->GetHybridArgsSize();
davinci_model->SetHybridArgsSize(kernel_def.args_size());
GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_);
}
return SUCCESS;
}
@ -888,7 +870,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
}
// copy args to new host memory
std::unique_ptr<uint8_t[]> args_addr(new (std::nothrow) uint8_t[args_size_]);
args_addr = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[args_size_]);
GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_)
errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_);
if (sec_ret != EOK) {
@ -896,8 +878,23 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
return FAILED;
}
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(args_addr.get());
const auto &ext_info = kernel_def.kernel_ext_info();
auto init_ret = InitAicpuTaskExtInfo(ext_info);
if (init_ret != SUCCESS) {
GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size());
return init_ret;
}
GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(),
op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_);
aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(aicpu_ext_info_addr_);
aicpu_param_head->extInfoLength = static_cast<uintptr_t>(ext_info.size());
if (davinci_model_->IsKnownNode()) {
return SUCCESS;
}
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
vector<void *> input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc);
vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc);
vector<void *> io_addrs;
@ -914,19 +911,6 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
}
}
auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(args_addr.get());
const auto &ext_info = kernel_def.kernel_ext_info();
auto init_ret = InitAicpuTaskExtInfo(ext_info);
if (init_ret != SUCCESS) {
GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size());
return init_ret;
}
GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(),
op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_);
aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(aicpu_ext_info_addr_);
aicpu_param_head->extInfoLength = static_cast<uintptr_t>(ext_info.size());
// malloc device memory for args
rtError_t rt_ret = rtMalloc(static_cast<void **>(&args_), args_size_, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {

@ -159,7 +159,9 @@ class KernelTaskInfo : public TaskInfo {
OpDescPtr op_desc_;
DavinciModel *davinci_model_;
uint32_t args_offset_ = 0;
uint32_t hybrid_args_offset_ = 0;
int64_t fixed_addr_offset_ = 0;
std::unique_ptr<uint8_t[]> args_addr = nullptr;
bool call_save_dump_ = false;
// aicpu ext_info device mem

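For reference, a minimal sketch (not part of the diff) of the AICPU args layout that the reworked UpdateArgs/InitAicpuTask above maintain: the host-side args block begins with an AicpuParamHead, the packed 64-bit I/O addresses follow it immediately, and the whole block is then copied to the device args buffer (rtMemcpy in the diff). ParamHead is a simplified stand-in for aicpu::AicpuParamHead.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct ParamHead { uint32_t length; uint32_t io_addr_num; };  // stand-in for aicpu::AicpuParamHead

bool PackIoAddrs(uint8_t *args_host, size_t args_size, const std::vector<uint64_t> &io_addrs) {
  size_t addrs_size = sizeof(uint64_t) * io_addrs.size();
  if (sizeof(ParamHead) + addrs_size > args_size) {
    return false;  // args block too small for the packed addresses
  }
  std::memcpy(args_host + sizeof(ParamHead), io_addrs.data(), addrs_size);
  // ... then copy args_host to the device args buffer, e.g. via rtMemcpy ...
  return true;
}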
@ -183,22 +183,18 @@ void ZeroCopyOffset::SetOutputOutsideAddrs(const int64_t &input_offset, const bo
addr_count_ = out_count;
}
bool ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset) {
void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset) {
const auto addr_val = reinterpret_cast<uintptr_t>(outside_addr);
bool set_batch_label_flag = false;
for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) {
auto &addrs_mapping_list = GetOutsideAddrs();
auto args_addrs = addrs_mapping_list[out_count].find(outside_addr);
if (args_addrs != addrs_mapping_list[out_count].end()) {
auto args_addrs = outside_addrs_[out_count].find(outside_addr);
if (args_addrs != outside_addrs_[out_count].end()) {
GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid.");
void *args_val = static_cast<uint8_t *>(args) + offset;
args_addrs->second.push_back(args_val);
GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val,
args, offset);
set_batch_label_flag = true;
}
}
return set_batch_label_flag;
}
} // namespace ge

Some files were not shown because too many files have changed in this diff.
