Update GraphEngine to synchronize with latest Ascend driver software suite 13 Apr 2020

5 years ago · 3f3c41fd04
parent 40e9f6f834
commit 3f3c41fd04
61 changed files with 1075 additions and 629 deletions
--- a/inc/common/opskernel/ops_kernel_info_types.h
+++ b/inc/common/opskernel/ops_kernel_info_types.h
@ -26,7 +26,6 @@
 using std::string;

 namespace ge {
-/*lint -e148*/
 struct RunContext {
  rtModel_t model;
  rtStream_t stream;
@ -40,8 +39,6 @@ struct RunContext {
  std::vector<rtEvent_t> graphEventList;    // all events of graph, order by ge event id(0,1,...)
 };

-/*lint +e148*/
-
 struct Task {
  uint32_t id;
  uint16_t type;
@ -50,8 +47,7 @@ struct Task {
 };

 struct OpInfo {
-  string engine;  // which engin
-  /*lint -e148*/
+  string engine;       // which engin
  string opKernelLib;  // which opsKernelStore
  int computeCost;     // compute cost
  bool flagPartial;    // whether to support is related to shape
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@ -98,7 +98,7 @@ const std::string OUTPUT_NODE_NAME = "ge.outputNodeName";
 // its value should be "0" or "1", default value is "0"
 const std::string COMPRESS_FLAG = "ge.compressFlag";

-const std::string ATUO_PRECISION_FLAG = "ge.exec.auto_mix_precision";
+const std::string PRECISION_MODE = "ge.exec.precision_mode";

 // Configure single op flag for FE
 // its value should be "0" or "1", default value is "0"
--- a/inc/framework/common/debug/ge_log.h
+++ b/inc/framework/common/debug/ge_log.h
@ -44,8 +44,6 @@ inline bool IsLogEnable(int module_name, int log_level) noexcept {
  return false;
 }

-/*lint --emacro((773),GE_TIMESTAMP_START)*/
-/*lint -esym(773,GE_TIMESTAMP_START)*/
 #define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestap()

 #define GE_TIMESTAMP_END(stage, stage_name)                                           \
--- a/inc/framework/common/ge_inner_error_codes.h
+++ b/inc/framework/common/ge_inner_error_codes.h
@ -14,7 +14,6 @@
 * limitations under the License.
 */

-/*lint -e* */
 #ifndef INC_FRAMEWORK_COMMON_GE_INNER_ERROR_CODES_H_
 #define INC_FRAMEWORK_COMMON_GE_INNER_ERROR_CODES_H_

--- a/inc/framework/common/helper/om_file_helper.h
+++ b/inc/framework/common/helper/om_file_helper.h
@ -88,5 +88,4 @@ class OmFileSaveHelper {
  OmFileContext context_;
 };
 }  // namespace ge
-/*lint +e148*/
 #endif  // INC_FRAMEWORK_COMMON_HELPER_OM_FILE_HELPER_H_
--- a/inc/graph/debug/ge_attr_define.h
+++ b/inc/graph/debug/ge_attr_define.h
@ -774,4 +774,3 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DYNAMIC_
 }  // namespace ge

 #endif  // INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_
-/*lint +e618*/
--- a/inc/graph/model.h
+++ b/inc/graph/model.h
@ -31,8 +31,6 @@ using std::map;
 using std::string;
 using std::vector;

-/*lint -e148*/
-
 class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Model : public AttrHolder {
 public:
  Model();
@ -91,7 +89,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Model : public AttrHolder {
  std::string platform_version_{""};
  Graph graph_;
 };
-/*lint +e148*/
 }  // namespace ge
 using ModelPtr = std::shared_ptr<ge::Model>;

--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@ -124,7 +124,7 @@ const std::string ATTR_NAME_BROACAST_REAL_DIM_CNT = "broacast_real_dim_cnt";
 const std::string ATTR_NAME_DIM_ALIGN = "dim_align";
 const std::string ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE = "original_type";

-const std::string ATTR_NAME_SESSION_GRAPH_ID = "session_graph_id";
+const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";

 const std::string ATTR_NAME_AUTOMIC_ADD_START = "automic_add_addr_start";
 const std::string ATTR_NAME_AUTOMIC_ADD_MEM_SIZE = "automic_add_mem_size";
--- a/src/common/graph/ge_attr_value.cc
+++ b/src/common/graph/ge_attr_value.cc
@ -34,7 +34,7 @@ namespace ge {
 GeAttrValue::NamedAttrs::NamedAttrs() { named_attrs_.InitDefault(); }

 GeAttrValue::NamedAttrs::NamedAttrs(const ProtoMsgOwner &owner, proto::NamedAttrs *proto_msg)
-    : named_attrs_(owner, proto_msg) {}  // lint !e1744
+    : named_attrs_(owner, proto_msg) {}

 void GeAttrValue::NamedAttrs::SetName(const std::string &name) {
  auto proto_msg = named_attrs_.GetProtoMsg();
@ -239,7 +239,7 @@ ATTR_VALUE_SET_GET_IMP(GeAttrValue::STR)
 ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::STR>)
 ATTR_VALUE_SET_GET_IMP(GeAttrValue::INT)
 ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::INT>)
-ATTR_VALUE_SET_GET_IMP(GeAttrValue::FLOAT)  // lint !e524
+ATTR_VALUE_SET_GET_IMP(GeAttrValue::FLOAT)
 ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::FLOAT>)
 ATTR_VALUE_SET_GET_IMP(GeAttrValue::BOOL)
 ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::BOOL>)
@ -253,11 +253,9 @@ ATTR_VALUE_SET_GET_IMP(GeAttrValue::BYTES)
 ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::BYTES>)
 ATTR_VALUE_SET_GET_IMP(GeAttrValue::NAMED_ATTRS)
 ATTR_VALUE_SET_GET_IMP(vector<GeAttrValue::NAMED_ATTRS>)
-/*lint -e665*/
 ATTR_VALUE_SET_GET_IMP(vector<vector<int64_t>>)
-/*lint +e665*/
-ATTR_VALUE_SET_GET_IMP(vector<DataType>)        // lint !e665
-ATTR_VALUE_SET_GET_IMP(GeAttrValue::DATA_TYPE)  // lint !e665
+ATTR_VALUE_SET_GET_IMP(vector<DataType>)
+ATTR_VALUE_SET_GET_IMP(GeAttrValue::DATA_TYPE)

 #undef ATTR_VALUE_SET_GET_IMP

--- a/src/common/graph/model_serialize.cc
+++ b/src/common/graph/model_serialize.cc
@ -265,13 +265,13 @@ bool ModelSerializeImp::HandleNodeNameRef() {
               item.dst_node_name.c_str(), item.dst_in_index);
        return false;
      }
-      GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed.");  // lint !e737
+      GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed.");
    } else {
      // Control edge
      auto src_anchor = src_node_it->second->GetOutControlAnchor();
      auto dst_anchor = item.dst_node->GetInControlAnchor();
      if (src_anchor != nullptr && dst_anchor != nullptr) {
-        GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed.");  // lint !e737
+        GE_CHK_BOOL_ONLY_LOG((src_anchor->LinkTo(dst_anchor) == GRAPH_SUCCESS), " linkTo failed.");
      }
    }
  }
--- a/src/common/graph/op_desc.cc
+++ b/src/common/graph/op_desc.cc
@ -32,7 +32,6 @@ using std::shared_ptr;
 using std::string;
 using std::vector;

-/*lint -save -e521 -e681 -e732 -e737*/
 namespace ge {
 const std::string ATTR_NAME_ID = "id";

--- a/src/common/graph/operator.cc
+++ b/src/common/graph/operator.cc
@ -421,7 +421,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator OpDescUtils::CreateOpera
    return Operator("default");
  }
  OperatorKeeper::GetInstance().CheckInOperator(operator_impl_ptr);
-  return operator_impl_ptr->ToOperator();  // lint !e514
+  return operator_impl_ptr->ToOperator();
 }

 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr OpDescUtils::GetOpDescFromOperator(const Operator &oprt) {
--- a/src/common/graph/opsproto/opsproto_manager.cc
+++ b/src/common/graph/opsproto/opsproto_manager.cc
@ -33,9 +33,7 @@ OpsProtoManager *OpsProtoManager::Instance() {
 }

 bool OpsProtoManager::Initialize(const std::map<std::string, std::string> &options) {
-  /*lint -e1561*/
  auto proto_iter = options.find("ge.opsProtoLibPath");
-  /*lint +e1561*/
  if (proto_iter == options.end()) {
    GELOGW("ge.opsProtoLibPath option not set, return.");
    return false;
--- a/src/common/graph/utils/op_desc_utils.cc
+++ b/src/common/graph/utils/op_desc_utils.cc
@ -30,7 +30,6 @@

 using std::vector;

-/*lint -e512 -e737 -e752*/
 namespace ge {
 const char OP_DESC_QUANT_PARAMS[] = "quantize_factor";
 static const int CONST_OP_NORMAL_WEIGHT_SIZE = 1;
@ -135,11 +134,11 @@ graphStatus OpDescUtils::GetQuantizeFactorParams(const OpDesc &op_desc, Quantize
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
 OpDescUtils::SetQuantizeFactorParams(const OpDescPtr &op_desc, const QuantizeFactorParams &quant) {
  GE_CHK_BOOL_EXEC_INFO(op_desc != nullptr, return GRAPH_FAILED, "op_desc is nullptr");
-  return op_desc->SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant));  // lint !e732
+  return op_desc->SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant));
 }

 graphStatus OpDescUtils::SetQuantizeFactorParams(OpDesc &op_desc, const QuantizeFactorParams &quant) {
-  return op_desc.SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant));  // lint !e732
+  return op_desc.SetAttr(OP_DESC_QUANT_PARAMS, GeAttrValue::CreateFrom<QuantizeFactorParams>(quant));
 }

 GeTensorPtr OpDescUtils::MutableWeights(OpDesc &op_desc) {
@ -164,7 +163,7 @@ graphStatus OpDescUtils::SetWeights(OpDesc &op_desc, const GeTensorPtr weight) {
    GELOGE(GRAPH_FAILED, "weight is null");
    return GRAPH_FAILED;
  }
-  return AttrUtils::SetTensor(&op_desc, ATTR_NAME_WEIGHTS, weight) ? GRAPH_SUCCESS : GRAPH_FAILED;  // lint !e737
+  return AttrUtils::SetTensor(&op_desc, ATTR_NAME_WEIGHTS, weight) ? GRAPH_SUCCESS : GRAPH_FAILED;
 }

 graphStatus OpDescUtils::SetWeights(OpDescPtr op_desc, const GeTensorPtr weight) {
@ -230,7 +229,7 @@ size_t OpDescUtils::GetNonConstInputsSize(const ge::Node &node) {
        continue;
      }
    }
-    return input_num;  // lint !e712
+    return input_num;
  } else {
    GE_IF_BOOL_EXEC(
      node.GetInDataNodes().size() < GetConstInputs(node).size(),
@ -335,7 +334,7 @@ bool OpDescUtils::IsNonConstInput(const ge::Node &node, const size_t index) {
  bool ret = false;
  if (index < node.GetAllInDataAnchors().size()) {
    if (NodeUtils::IsAnchorStatusSet(node)) {
-      ret = (ge::AnchorUtils::GetStatus(node.GetInDataAnchor(static_cast<int>(index))) == ANCHOR_DATA);  // lint !e712
+      ret = (ge::AnchorUtils::GetStatus(node.GetInDataAnchor(static_cast<int>(index))) == ANCHOR_DATA);
    } else {
      for (const auto &anchor : node.GetAllInDataAnchors()) {
        if (anchor->GetIdx() != static_cast<int>(index)) {
@ -574,4 +573,3 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDescUtils::ClearWei
  return GRAPH_SUCCESS;
 }
 }  // namespace ge
-/*lint +e512 +e737 +e752*/
--- a/src/common/graph/utils/tensor_utils.cc
+++ b/src/common/graph/utils/tensor_utils.cc
@ -286,10 +286,10 @@ static graphStatus CalcTensorElementCnt(const std::vector<int64_t> &dims, Format

  const string type_str = TypeUtils::DataTypeToSerialString(data_type);
  if (graph_status == GRAPH_SUCCESS) {
-    GELOGI(
-        "CalcTensorElementCnt end, format=%d(%s),"
-        " data_type=%d(%s), element_cnt=%ld.",
-        format, format_str.c_str(), data_type, type_str.c_str(), element_cnt);
+    GELOGD(
+      "CalcTensorElementCnt end, format=%d(%s),"
+      " data_type=%d(%s), element_cnt=%ld.",
+      format, format_str.c_str(), data_type, type_str.c_str(), element_cnt);
  } else {
    GELOGE(GRAPH_FAILED, "CalcTensorElementCnt failed, format=%d(%s), data_type=%d(%s).", format, format_str.c_str(),
           data_type, type_str.c_str());
@ -329,10 +329,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus TensorUtils::CalcTens
  // Support unknown shape
  if (element_cnt < 0) {
    mem_size = kMemSizeUnknownShape;
-    GELOGI(
-        "element_cnt is unknown. "
-        "format=%d(%s), data_type=%d(%s), mem_size=%ld",
-        format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
+    GELOGD(
+      "element_cnt is unknown. "
+      "format=%d(%s), data_type=%d(%s), mem_size=%ld",
+      format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
    return GRAPH_SUCCESS;
  }
  auto type_size_int64 = static_cast<int64_t>(type_size);
@ -343,10 +343,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus TensorUtils::CalcTens
  }
  mem_size = element_cnt * type_size_int64;

-  GELOGI(
-      "CalcTensorMemSize end, "
-      "format=%d(%s), data_type=%d(%s), mem_size=%ld",
-      format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
+  GELOGD(
+    "CalcTensorMemSize end, "
+    "format=%d(%s), data_type=%d(%s), mem_size=%ld",
+    format, format_str.c_str(), data_type, type_str.c_str(), mem_size);
  return GRAPH_SUCCESS;
 }

--- a/src/ge/common/op/attr_define.cc
+++ b/src/ge/common/op/attr_define.cc
@ -108,7 +108,7 @@ const std::string ATTR_NAME_NAN_OPT = "nan_opt";
 const std::string ATTR_NAME_AIPP = "aipp";
 const std::string NEW_AIPP_CONV_OP = "new_conv_op_for_aipp";

-const std::string ATTR_NAME_SESSION_GRAPH_ID = "session_graph_id";
+const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";

 const std::string ATTR_NAME_MULTISHAPE_BATCHLIST = "multi_shape_batchlist";
 const std::string ATTR_NAME_MULTISHAPE_BATCHLIST_SIZE = "multi_shape_batchlist_size";
--- a/src/ge/graph/build/memory/block_mem_assigner.cc
+++ b/src/ge/graph/build/memory/block_mem_assigner.cc
@ -402,6 +402,31 @@ bool IsOutputBlock(const ge::InDataAnchorPtr &in_data_anchor) {
  return false;
 }

+// Current node's output uses previous node's output memory.
+//
+// Returns true only when the node's op_desc carries ATTR_NAME_REFERENCE set to true and the
+// name of the output at output_index equals the name of one of the node's non-empty inputs.
+// Returns false for a null node, a null op_desc, a non-reference node, or when no input
+// name matches.
+bool IsReferencePreviousNodeOutputMemory(const ge::NodePtr &node, uint32_t output_index) {
+  // Dereferencing a null node pointer is undefined behaviour, so reject it up front
+  // in the same way a null op_desc is rejected below.
+  if (node == nullptr) {
+    return false;
+  }
+  auto op_desc = node->GetOpDesc();
+  if (op_desc == nullptr) {
+    return false;
+  }
+  // Get the reference type of the node, default is false.
+  // The GetBool result is deliberately ignored: if it fails, is_ref keeps its default.
+  bool is_ref = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_REFERENCE, is_ref);
+  if (!is_ref) {
+    return false;
+  }
+  const string &output_name = op_desc->GetOutputNameByIndex(output_index);
+  for (const auto &input_name : op_desc->GetAllInputNames()) {
+    // Empty input names never count as a match, even if the output name is empty too.
+    if (!input_name.empty() && output_name == input_name) {
+      // input_index is looked up for the log message only.
+      int input_index = op_desc->GetInputIndexByName(input_name);
+      GELOGI("Reference memory:name[%s] output[%s][%u] ref to input[%s][%d] ", op_desc->GetName().c_str(),
+             output_name.c_str(), output_index, input_name.c_str(), input_index);
+      return true;
+    }
+  }
+  return false;
+}
+
 void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) {
  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null.");
  GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory");
@ -489,7 +514,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
      if (output_op_desc != nullptr) {
        GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS, GELOGI("Get size failed"));
      }
-      if ((size == 0) || CheckIsZeroMemNodeType(n->GetType())) {
+      if ((size == 0) || CheckIsZeroMemNodeType(n->GetType()) || IsReferencePreviousNodeOutputMemory(n, i)) {
        zero_memory_list_.emplace_back(n, kOutput, i);
        continue;
      }
@ -607,11 +632,11 @@ void BlockMemAssigner::MergeDynamicBatchBlocks() {
    std::sort(it->second.begin(), it->second.end(), CompareBlockMaxSize);
  }
  if (it_max != dynamic_batch_blocks.end()) {
-    GELOGI("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size());
+    GELOGD("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size());
  }
  for (it = dynamic_batch_blocks.begin(); it != dynamic_batch_blocks.end(); ++it) {
    if (it != it_max) {
-      GELOGI("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str());
+      GELOGD("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str());
      MergeBlocks(it_max->second, it->second);
    }
  }
--- a/src/ge/graph/build/memory/graph_mem_assigner.cc
+++ b/src/ge/graph/build/memory/graph_mem_assigner.cc
@ -296,7 +296,7 @@ Status GraphMemoryAssigner::ReAssignVirtualConcatMemory() {
      }
      output_list.at(0) = memory_offset_[0].mem_offset_;
      n->GetOpDesc()->SetOutputOffset(output_list);
-      GELOGI("Set Concat %s output offset to %zu.", n->GetOpDesc()->GetName().c_str(), memory_offset_[0].mem_offset_);
+      GELOGD("Set Concat %s output offset to %zu.", n->GetOpDesc()->GetName().c_str(), memory_offset_[0].mem_offset_);

      size_t extra_memory_size = 0;
      for (const auto &in_data_anchor : n->GetAllInDataAnchors()) {
@ -401,7 +401,7 @@ Status GraphMemoryAssigner::ReAssignMergeMemory() {
        data_output_offset = output_list[index];
        max_output_size = tmp_output_size;
      }
-      GELOGI("merge=%s, input=%s, size=%ld, offset=%ld, max_size=%ld", n->GetName().c_str(),
+      GELOGD("merge=%s, input=%s, size=%ld, offset=%ld, max_size=%ld", n->GetName().c_str(),
             src_node->GetName().c_str(), tmp_output_size, data_output_offset, max_output_size);
    }

@ -541,7 +541,7 @@ Status GraphMemoryAssigner::AssignReferenceMemory(const ge::NodePtr &node) {
      GE_CHECK_NOTNULL(peer_out_op_desc);
      output_list[out_data_anchor->GetIdx()] = peer_out_op_desc->GetOutputOffset()[peer_out_anchor_index];
    } else {
-      GELOGI("Reference output : origin %s name[%s] output[%d] offset is [%ld] stream_id[%ld]",
+      GELOGD("Reference output : origin %s name[%s] output[%d] offset is [%ld] stream_id[%ld]",
             node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
             output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId());
    }
@ -576,7 +576,7 @@ bool GraphMemoryAssigner::CheckInputIsSupportAtomic(const ge::NodePtr &node) {
 Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) {
  auto op_desc = node->GetOpDesc();
  GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(ge::FAILED, "op_desc is null."); return ge::FAILED);
-  GELOGI("Begin to assign atomic output memory, node = %s.", op_desc->GetName().c_str());
+  GELOGD("Begin to assign atomic output memory, node = %s.", op_desc->GetName().c_str());

  vector<int64_t> atomic_output_index;
  // If GetListInt fail, atomic_output_index is empty.
@ -620,7 +620,7 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) {

    // If you have already assigned an atomic address, skip it, and you don't need to reassign it.
    if (is_assigned_mem) {
-      GELOGI(
+      GELOGD(
        "[IMAS]Atomic output : we have assigned atomic memory as the input of next node in "
        "ReAssignContinuousMemory function.");
      continue;
@ -822,7 +822,7 @@ Status GraphMemoryAssigner::SetLoopGraphAtomicAttr(const ge::NodePtr &node, int6
        continue;
      }

-      GELOGI("SetLoopGraphAtomicAttr,  node is %s, op type is %s.", peer_out_node_desc->GetName().c_str(),
+      GELOGD("SetLoopGraphAtomicAttr,  node is %s, op type is %s.", peer_out_node_desc->GetName().c_str(),
             peer_out_node_desc->GetType().c_str());

      if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) {
--- a/src/ge/graph/build/task_generator.cc
+++ b/src/ge/graph/build/task_generator.cc
@ -398,23 +398,26 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi
    if (op_kernel_lib_name.empty()) {
      continue;
    }
-    if (op_desc->GetName() == bp_point_str) {
-      last_bp = current_idx;
-      GELOGI("Last bp name %s, idx %u", op_desc->GetName().c_str(), last_bp);
-    }
+
    if (op_desc->GetType() == NETOUTPUT) {
      iter_end = current_idx;
      GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
    }
-    if (op_desc->GetName() == fp_point_str) {
-      first_fp = current_idx;
-      GELOGI("First fp name %s, idx %u", op_desc->GetName().c_str(), first_fp);
-    }

    if (op_desc->GetType() == HCOMALLREDUCE) {
      ar_ppoint.emplace_back(current_idx);
      GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
    }
+
+    if (first_fp == 0 && IsProfPoint(op_desc, fp_point_str)) {
+      first_fp = current_idx;
+      GELOGI("First fp name %s, idx %u", op_desc->GetName().c_str(), first_fp);
+    }
+
+    if (IsProfPoint(op_desc, bp_point_str)) {
+      last_bp = current_idx;
+      GELOGI("Last bp name %s, idx %u", op_desc->GetName().c_str(), last_bp);
+    }
  }
  ppoint.fp_index = first_fp;
  ppoint.bp_index = last_bp;
@ -526,4 +529,29 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
  }
  return SUCCESS;
 }
+
+// Tells whether `op` is the profiling point called `name`.
+// A match is either the op's own name, or any entry of the string list stored on the op
+// under ge::ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES. A null op never matches.
+bool TaskGenerator::IsProfPoint(const OpDescPtr &op, const std::string &name) {
+  if (op == nullptr) {
+    return false;
+  }
+  if (op->GetName() == name) {
+    return true;
+  }
+
+  // Fall back to the attribute list; if it cannot be read there is nothing to compare.
+  std::vector<std::string> origin_names;
+  if (!AttrUtils::GetListStr(op, ge::ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, origin_names)) {
+    return false;
+  }
+
+  // Stop scanning as soon as one entry equals the requested name.
+  bool matched = false;
+  for (size_t idx = 0; idx < origin_names.size() && !matched; ++idx) {
+    matched = (origin_names[idx] == name);
+  }
+  return matched;
+}
+
 }  // namespace ge
--- a/src/ge/graph/build/task_generator.h
+++ b/src/ge/graph/build/task_generator.h
@ -99,6 +99,8 @@ class TaskGenerator {
                                  std::vector<uint32_t> &ar_ppoint, uint32_t node_index,
                                  std::vector<domi::TaskDef> &task_def_list);

+  static bool IsProfPoint(const OpDescPtr &op, const std::string &name);
+
  uint8_t *var_mem_base_ = nullptr;
  uint64_t var_mem_size_ = 0;
 };
--- a/src/ge/graph/load/graph_loader.cc
+++ b/src/ge/graph/load/graph_loader.cc
@ -336,7 +336,7 @@ Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model
    auto model_manager = ModelManager::GetInstance();
    GE_CHECK_NOTNULL(model_manager);
    Status ret =
-        model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize);
+      model_manager->LoadModelOffline(model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize);
    if (ret != SUCCESS) {
      GELOGE(ret, "Load model failed, model_id:%u.", model_id);
      return ret;
@ -428,4 +428,15 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) {
  GELOGI("GetMemoryInfo free[%zu], total[%zu], return free[%ld]", free_mem, total_mem, free);
  return SUCCESS;
 }
+
+// Forwards the destroy request for (session_id, model_id) to the ModelManager singleton.
+// Returns SUCCESS when the manager reports SUCCESS; otherwise logs the error and returns
+// the manager's status unchanged. A null manager instance is handled by GE_CHECK_NOTNULL.
+Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
+  auto model_manager = ModelManager::GetInstance();
+  GE_CHECK_NOTNULL(model_manager);
+  const Status destroy_status = model_manager->DestroyAicpuKernel(session_id, model_id);
+  if (destroy_status == SUCCESS) {
+    return SUCCESS;
+  }
+  GELOGE(destroy_status, "Destroy aicpu kernel failed.");
+  return destroy_status;
+}
 }  // namespace ge
--- a/src/ge/graph/load/graph_loader.h
+++ b/src/ge/graph/load/graph_loader.h
@ -73,6 +73,8 @@ class GraphLoader {
  static Status ExecuteModel(uint32_t model_id, rtStream_t stream, bool async_mode, const InputData &input_data,
                             OutputData &output_data);

+  static Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id);
+
 private:
  static Status LoadModelOnline(uint32_t &model_id, std::shared_ptr<ge::Model> &model,
                                const std::shared_ptr<ModelListener> &listener);
--- a/src/ge/graph/load/new_model_manager/model_manager.cc
+++ b/src/ge/graph/load/new_model_manager/model_manager.cc
@ -18,7 +18,6 @@

 #include <string>

-#include "cce/aicpu_engine_struct.h"
 #include "common/l2_cache_optimize.h"
 #include "common/profiling/profiling_manager.h"
 #include "common/properties_manager.h"
@ -41,17 +40,43 @@ std::shared_ptr<ModelManager> ModelManager::GetInstance() {

 ModelManager::ModelManager() { max_model_id_ = 0; }

-static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t session_id) {
+Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id) {
  STR_FWK_OP_KERNEL param_base = {};
  void *devicebase = nullptr;
+  void *aicpu_kernel_addr = nullptr;
  const uint32_t kKernelType = 0;
  param_base.fwkKernelType = kKernelType;
-  param_base.fwkKernelBase.fwk_kernel.opType = opType;
+  param_base.fwkKernelBase.fwk_kernel.opType = op_type;
  param_base.fwkKernelBase.fwk_kernel.sessionID = session_id;
+  if (op_type == aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY) {
+    std::vector<uint64_t> v_aicpu_kernel;
+    std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+    auto iter = model_aicpu_kernel_.find(model_key);
+    if (iter != model_aicpu_kernel_.end()) {
+      GELOGD("kernel destroy session_id %lu, model_id %u.", session_id, model_id);
+      v_aicpu_kernel = model_aicpu_kernel_.at(model_key);
+      // Insert size of aicpu kernel vector in the first element
+      v_aicpu_kernel.insert(v_aicpu_kernel.begin(), v_aicpu_kernel.size());
+
+      auto kernel_size = sizeof(uint64_t) * (v_aicpu_kernel.size());
+      rtError_t rt_ret = rtMalloc(&aicpu_kernel_addr, kernel_size, RT_MEMORY_HBM);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
+                      return RT_FAILED;)
+
+      rt_ret = rtMemcpy(aicpu_kernel_addr, kernel_size, v_aicpu_kernel.data(), kernel_size, RT_MEMCPY_HOST_TO_DEVICE);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret);
+                      GE_CHK_RT(rtFree(aicpu_kernel_addr)); return FAILED;)
+      uint64_t kernel_id_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(aicpu_kernel_addr));
+      param_base.fwkKernelBase.fwk_kernel.kernelID = kernel_id_addr;
+      // Remove model key from map
+      model_aicpu_kernel_.erase(iter);
+    }
+  }

  rtError_t rt_ret = rtMalloc(&(devicebase), sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "malloc device memory failed.");
+    GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
    return FAILED;
  }

@ -59,6 +84,7 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
    rtMemcpy(devicebase, sizeof(STR_FWK_OP_KERNEL), &param_base, sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "memory copy to device failed.");
+    GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
    GE_CHK_RT(rtFree(devicebase));
    return FAILED;
  }
@ -67,6 +93,7 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
  rt_ret = rtStreamCreate(&stream, 0);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "create stream failed.");
+    GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
    GE_CHK_RT(rtFree(devicebase));
    return FAILED;
  }
@ -74,6 +101,7 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
  rt_ret = rtKernelLaunchEx(devicebase, sizeof(STR_FWK_OP_KERNEL), 0, stream);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "rtKernelLaunchEx failed.");
+    GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
    GE_CHK_RT(rtFree(devicebase));
    GE_CHK_RT(rtStreamDestroy(stream));
    return FAILED;
@ -81,11 +109,20 @@ static Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType opType, uint64_t
  rt_ret = rtStreamSynchronize(stream);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "rtStreamSynchronize failed.");
+    GE_IF_BOOL_EXEC(aicpu_kernel_addr != nullptr, GE_CHK_RT(rtFree(aicpu_kernel_addr)));
    GE_CHK_RT(rtFree(devicebase));
    GE_CHK_RT(rtStreamDestroy(stream));
    return FAILED;
  }
-
+  if (aicpu_kernel_addr != nullptr) {
+    rt_ret = rtFree(aicpu_kernel_addr);
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGE(rt_ret, "free memory failed.");
+      GE_CHK_RT(rtFree(devicebase));
+      GE_CHK_RT(rtStreamDestroy(stream));
+      return FAILED;
+    }
+  }
  rt_ret = rtFree(devicebase);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "free memory failed.");
@ -107,7 +144,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
    GELOGI("The session: %lu not created.", session_id);
    return;
  } else {
-    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id);
+    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0);
    if (ret != SUCCESS) {
      GELOGW("The session: %lu destroy failed.", session_id);
    } else {
@ -117,9 +154,36 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
  }
 }

+// Launches FWK_ADPT_KERNEL_DESTROY for the aicpu kernels recorded under the key
+// "<session_id>_<model_id>". If no entry exists for that key nothing is launched and
+// SUCCESS is returned. KernelLaunchEx is invoked while sess_ids_mutex_ is held.
+// Returns FAILED when the launch does not report SUCCESS.
+ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
+  GELOGD("destroy aicpu kernel in session_id %lu, model_id %u.", session_id, model_id);
+  std::lock_guard<std::mutex> guard(sess_ids_mutex_);
+  const std::string kernel_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+  // Nothing was registered for this session/model pair, so there is nothing to destroy.
+  if (model_aicpu_kernel_.count(kernel_key) == 0) {
+    return SUCCESS;
+  }
+  if (KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id) != SUCCESS) {
+    GELOGE(FAILED, "Destroy aicpu kernel failed.");
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+// Records kernel_id in the list of aicpu kernels kept under the key
+// "<session_id>_<model_id>", creating the list on first use.
+// The map is updated while sess_ids_mutex_ is held. Always returns SUCCESS.
+ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) {
+  std::lock_guard<std::mutex> lock(sess_ids_mutex_);
+  const std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+  // operator[] value-initialises an empty vector when the key is new, so the id is appended
+  // in place instead of copying the whole vector out of the map and back on every call.
+  model_aicpu_kernel_[model_key].push_back(kernel_id);
+  return SUCCESS;
+}
+
 ModelManager::~ModelManager() {
+  // Drop all loaded models and the recorded aicpu kernel ids while holding map_mutex_.
+  // NOTE(review): CreateAicpuKernel/DestroyAicpuKernel access model_aicpu_kernel_ under
+  // sess_ids_mutex_, not map_mutex_ - confirm that clearing it under map_mutex_ is intended.
   std::lock_guard<std::mutex> lock(map_mutex_);
   model_map_.clear();
+  model_aicpu_kernel_.clear();

+  // Reset device 0 only when device_count is positive.
   GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0)));
 }
@ -687,7 +751,7 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) {
  auto it = sess_ids_.find(session_id);
  // never been created by any model
  if (it == sess_ids_.end()) {
-    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_CREATE, session_id);
+    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_CREATE, session_id, 0);
    if (ret == SUCCESS) {
      (void)sess_ids_.insert(session_id);
      GELOGI("The session: %lu create success.", session_id);
--- a/src/ge/graph/load/new_model_manager/model_manager.h
+++ b/src/ge/graph/load/new_model_manager/model_manager.h
@ -24,6 +24,7 @@
 #include <memory>
 #include <set>
 #include <vector>
+#include "cce/aicpu_engine_struct.h"
 #include "common/types.h"
 #include "common/ge_types.h"
 #include "common/ge_inner_error_codes.h"
@ -199,12 +200,18 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
  ///
  std::shared_ptr<DavinciModel> GetModel(uint32_t id);

+  ge::Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id);
+
  ge::Status CreateAicpuSession(uint64_t session_id);

  static ge::Status GetModelMemAndWeightSize(const ModelData &model, size_t &mem_size, size_t &weight_size);

  void DestroyAicpuSession(uint64_t session_id);

+  ge::Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id);
+
+  ge::Status CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id);
+
 private:
  ///
  /// @ingroup domi_ome
@ -233,6 +240,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
  void GenModelId(uint32_t *id);

  std::map<uint32_t, std::shared_ptr<DavinciModel>> model_map_;
+  std::map<std::string, std::vector<uint64_t>> model_aicpu_kernel_;
  std::vector<uint32_t> free_model_id_;
  uint32_t max_model_id_;
  std::mutex map_mutex_;
--- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
+++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
@ -120,9 +120,13 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
  GELOGI("session_id: %lu", session_id);
  GE_CHECK_NOTNULL(ModelManager::GetInstance());
  GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS,
-                  GELOGE(ret, "CreateAicpuSession error.");
-                  return ret;)
-
+                  GELOGE(FAILED, "CreateAicpuSession error.");
+                  return FAILED;)
+  // 4.1 Collect aicpu kernel
+  uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID;
+  GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), kernel_id) != SUCCESS,
+                  GELOGE(FAILED, "CreateAicpuKernel error.");
+                  return FAILED;)
  // 5. Return result
  rtError_t rt_ret = rtMalloc(&kernel_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM);
  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc error: 0x%X", rt_ret); return FAILED;)
--- a/Show More
+++ b/Show More