Merge branch 'development' of gitee.com:dong-duo/graphengine into development

pull/309/head
dongduo 5 years ago
commit c42b6da60b

@@ -26,12 +26,17 @@
namespace ge {
LabelAllocator::LabelAllocator(const ComputeGraphPtr &graph) : compute_graph_(graph) {}
Status LabelAllocator::AssignFunctionalLabels(uint32_t &label_index) {
Status LabelAllocator::AssignFunctionalLabels() {
if (compute_graph_ == nullptr) {
GELOGE(INTERNAL_ERROR, "ComputeGraph not set, Assign labels failed.");
return INTERNAL_ERROR;
}
if (compute_graph_->GetGraphUnknownFlag()) {
GELOGD("Graph[%s] is unknown graph, skip label allocator.", compute_graph_->GetName().c_str());
return SUCCESS;
}
// Add label task for sub graph.
GELOGI("AssignFunctionalLabels start: %s.", compute_graph_->GetName().c_str());
std::set<NodePtr> functional_nodes;
@@ -42,7 +47,7 @@ Status LabelAllocator::AssignFunctionalLabels(uint32_t &label_index) {
}
// Add label for functional op.
label_index = 0;
uint32_t label_index = 0;
for (auto node : functional_nodes) {
LabelMakerPtr maker = LabelMakerFactory::Instance().Create(node->GetType(), compute_graph_, node);
if (maker == nullptr) {
@@ -56,6 +61,7 @@ Status LabelAllocator::AssignFunctionalLabels(uint32_t &label_index) {
}
}
(void)AttrUtils::SetInt(*compute_graph_, ATTR_MODEL_LABEL_NUM, label_index);
GELOGI("AssignFunctionalLabels success.");
return SUCCESS;
}

@@ -28,7 +28,7 @@ class LabelAllocator {
explicit LabelAllocator(const ComputeGraphPtr &graph);
~LabelAllocator() = default;
Status AssignFunctionalLabels(uint32_t &label_index);
Status AssignFunctionalLabels();
private:
bool CollectFunctionalNode(ComputeGraphPtr &graph, std::set<NodePtr> &functional_nodes);

@@ -348,7 +348,11 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr graph, const vector<SubgraphPtr
auto compute_graph = subgraph->subgraph_info.GetSubGraph();
for (NodePtr &node : compute_graph->GetDirectNode()) {
GE_CHECK_NOTNULL(node->GetOpDesc());
if (IsEngineSkip(*subgraph) && node->GetInNodes().empty()) {
if (node->GetOpDesc()->HasAttr(ATTR_NAME_RTS_LABEL_NODE)) {
node->GetOpDesc()->SetStreamId(context.default_stream);
GELOGD("Node %s of type %s in subgraph %s is assigned parent stream %ld (engine: %s).", node->GetName().c_str(),
node->GetType().c_str(), subgraph->name.c_str(), context.default_stream, engine_name.c_str());
} else if (IsEngineSkip(*subgraph) && node->GetInNodes().empty()) {
GELOGD("Node %s of type %s in subgraph %s doesn't need to assign a stream (engine: %s).",
node->GetName().c_str(), node->GetType().c_str(), subgraph->name.c_str(), engine_name.c_str());
} else {

@@ -23,7 +23,6 @@
#include "graph/anchor.h"
#include "graph/attr_value.h"
#include "graph/buffer.h"
#include "graph/build/label_allocator.h"
#include "graph/build/stream_allocator.h"
#include "graph/common/omg_util.h"
#include "graph/common/ge_call_wrapper.h"
@@ -42,7 +41,6 @@
#include "graph/utils/op_desc_utils.h"
#include "graph/utils/tensor_utils.h"
#include "graph/utils/type_utils.h"
#include "graph/passes/memcpy_addr_async_pass.h"
#include "init/gelib.h"
#include "memory/memory_assigner.h"
#include "omg/version.h"
@@ -692,25 +690,8 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) {
GE_TIMESTAMP_END(AssignLogicalStreams, "GraphBuilder::AssignLogicalStreams");
// Assign functional op labels.
GE_TIMESTAMP_START(AssignFunctionalLabels);
LabelAllocator label_allocator(compute_graph_);
GE_CHK_STATUS_RET(label_allocator.AssignFunctionalLabels(label_num_), "Assign label failed.");
GE_TIMESTAMP_END(AssignFunctionalLabels, "ModelBuilder::AssignFunctionalLabels");
// Add memcpy_addr_async node.
rtFeatureType_t feature_type = FEATURE_TYPE_MEMCPY;
int32_t feature_info = MEMCPY_INFO_SUPPORT_ZEROCOPY;
int64_t value = 0;
rtError_t rt_ret = rtGetRtCapability(feature_type, feature_info, &value);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtGetRtCapability failed.");
return RT_FAILED;
} else {
GE_TIMESTAMP_START(AddMemcpyAddrAsyncNode);
MemcpyAddrAsyncPass memcpy_addr;
GE_CHK_STATUS_RET(memcpy_addr.Run(compute_graph_), "Add memcpy_addr_async node failed.");
GE_TIMESTAMP_END(AddMemcpyAddrAsyncNode, "MemcpyAddrAsyncPass::Run.");
}
label_num_ = 0;
(void)AttrUtils::GetInt(*compute_graph_, ATTR_MODEL_LABEL_NUM, label_num_);
GE_TIMESTAMP_START(AssignMemory);
MemoryAssigner mem_assigner(compute_graph_);
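Taken together with the label_allocator hunks above, this change moves the label count out of AssignFunctionalLabels' signature and publishes it as a graph attribute instead. A minimal sketch of the resulting write/read pattern, using only names visible in this diff:

// Writer side (LabelAllocator::AssignFunctionalLabels): count labels locally,
// then store the total on the graph.
uint32_t label_index = 0;
// ... create one label per functional node ...
(void)AttrUtils::SetInt(*compute_graph_, ATTR_MODEL_LABEL_NUM, label_index);

// Reader side (ModelBuilder::BuildModelForGetTask): fetch the count back
// from the same attribute.
label_num_ = 0;
(void)AttrUtils::GetInt(*compute_graph_, ATTR_MODEL_LABEL_NUM, label_num_);

Decoupling the two sides through the attribute is what allows the allocator call to move into GraphManager::OptimizeStage2 later in this diff.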

File diff suppressed because it is too large

@@ -60,9 +60,8 @@ class LabelMaker {
ComputeGraphPtr parent_graph_;
private:
void SetStreamIdEnter(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
void SetStreamIdLeave(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
void SetStreamIdOwner(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
void LinkToGraphHead(const ComputeGraphPtr &graph, const NodePtr &node);
void LinkToGraphTail(const ComputeGraphPtr &graph, const NodePtr &node);
};
} // namespace ge
#endif // GE_GRAPH_PASSES_LABEL_MAKER_H_

@@ -100,6 +100,8 @@
#include "graph/passes/subgraph_const_migration_pass.h"
#include "graph/passes/unused_args_clean_pass.h"
#include "graph/passes/global_step_insert_pass.h"
#include "graph/passes/memcpy_addr_async_pass.h"
#include "graph/build/label_allocator.h"
#include "graph/utils/tensor_adapter.h"
#include "graph/utils/type_utils.h"
#include "graph/graph_util.h"
@@ -634,6 +636,13 @@ Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node,
GM_RUN_AND_DUMP_PERF("OptimizeGraphBeforeBuildForRts",
GetCompilerStages(graph_node->GetGraphId()).optimizer.OptimizeGraphBeforeBuildForRts,
compute_graph);
Status ret = compute_graph->TopologicalSorting();
if (ret != SUCCESS) {
GELOGE(ret, "Graph topological sort failed, ret:%d.", ret);
return ret;
}
GM_RUN_AND_DUMP_PERF("Build", Build, graph_node, compute_graph, ge_root_model, session_id);
GELOGI("PreRun:PreRunAfterOptimizeSubGraph success.");
return SUCCESS;
@@ -2180,6 +2189,18 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) {
return ret;
}
// Assign functional op labels.
GE_TIMESTAMP_START(AssignFunctionalLabels);
LabelAllocator label_allocator(compute_graph);
GE_CHK_STATUS_RET(label_allocator.AssignFunctionalLabels(), "Assign label failed.");
GE_TIMESTAMP_END(AssignFunctionalLabels, "ModelBuilder::AssignFunctionalLabels");
// Add memcpy_addr_async node.
GE_TIMESTAMP_START(AddMemcpyAddrAsyncNode);
MemcpyAddrAsyncPass memcpy_addr;
GE_CHK_STATUS_RET(memcpy_addr.Run(compute_graph), "Add memcpy_addr_async node failed.");
GE_TIMESTAMP_END(AddMemcpyAddrAsyncNode, "MemcpyAddrAsyncPass::Run.");
// After while-subgraph handling, mark read/write type for all nodes.
auto result = GetCompilerStages(compute_graph->GetGraphID()).optimizer.HandleMemoryRWConflict(compute_graph);
if (result != SUCCESS) {
@@ -2190,11 +2211,6 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) {
ChangeConstTypeWhenTraining(compute_graph);
ret = compute_graph->TopologicalSorting();
if (ret != SUCCESS) {
GELOGE(ret, "Graph topological sort failed, ret:%d.", ret);
return ret;
}
GELOGI("End optimize after merge sub graph.");
return SUCCESS;
}
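Taken together, the GraphManager hunks above move work rather than add it: label allocation and MemcpyAddrAsyncPass, previously run by ModelBuilder::BuildModelForGetTask, now run inside OptimizeStage2, while the final TopologicalSorting moves from the end of OptimizeStage2 into PreRunAfterOptimizeSubGraph, directly before Build. A sketch of the resulting order, assembled from these hunks (wrapper macros elided):

// OptimizeStage2:
//   ... -> AssignFunctionalLabels -> MemcpyAddrAsyncPass
//       -> HandleMemoryRWConflict -> ChangeConstTypeWhenTraining
// PreRunAfterOptimizeSubGraph:
//   OptimizeGraphBeforeBuildForRts -> TopologicalSorting -> Build

Sorting once, directly before Build, presumably ensures that nodes inserted by the relocated passes and by the RTS optimization all reach the builder in topological order.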

@@ -202,7 +202,7 @@ Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) {
GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr.");
return INTERNAL_ERROR;
}
base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_));
base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_));
mem_size = rdma_mem_size_;
return SUCCESS;
}
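The one-line fix above replaces an integer-to-integer reinterpret_cast with static_cast. A standalone sketch of the corrected idiom (the helper name is illustrative):

#include <cstdint>

// Pointer-to-integer conversion: reinterpret_cast maps the pointer to
// uintptr_t, then static_cast widens it to uint64_t. A reinterpret_cast
// between two distinct integer types (uintptr_t -> uint64_t) is ill-formed.
uint64_t PointerToU64(const void *ptr) {
  return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ptr));
}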

@@ -21,7 +21,7 @@ namespace ge {
Status MarkAgnosticPass::Run(ComputeGraphPtr graph) {
for (const auto &node : graph->GetDirectNode()) {
auto node_type = NodeUtils::GetNodeType(*node);
if (node_type == SWITCH || node_type == REFSWITCH || node_type == SWITCHN) {
if (node_type == SWITCH || node_type == SWITCHN) {
GELOGD("Mark format agnostic and continuous for switch node %s", node->GetName().c_str());
const OpDescPtr op_desc = node->GetOpDesc();
const GeTensorDescPtr op_tensor = op_desc->MutableInputDesc(0);
@@ -37,10 +37,15 @@ Status MarkAgnosticPass::Run(ComputeGraphPtr graph) {
if (node_type == IDENTITY) {
GELOGD("Mark format agnostic for identity node %s", node->GetName().c_str());
AttrUtils::SetInt(node->GetOpDesc(), "_format_agnostic", 1);
continue;
}
if (node_type == REFMERGE || node_type == REFSWITCH) {
GELOGD("Mark format agnostic for regmerge and refswitch node %s", node->GetName().c_str());
AttrUtils::SetInt(node->GetOpDesc(), "_format_agnostic", 1);
AttrUtils::SetListInt(node->GetOpDesc(), "_format_agnostic_except_input", std::vector<int64_t>({1}));
continue;
}
if (node_type == MERGE || node_type == REFMERGE) {
if (node_type == MERGE) {
GELOGD("Mark format agnostic and continuous for merge node %s", node->GetName().c_str());
const OpDescPtr op_desc = node->GetOpDesc();
const GeTensorDescPtr op_tensor = op_desc->MutableOutputDesc(0);

@@ -25,6 +25,14 @@
namespace ge {
Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) {
GE_CHECK_NOTNULL(graph);
int64_t value = 0;
rtError_t rt_ret = rtGetRtCapability(FEATURE_TYPE_MEMCPY, MEMCPY_INFO_SUPPORT_ZEROCOPY, &value);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtGetRtCapability failed, error=0x%x.", rt_ret);
return RT_FAILED;
}
for (auto &node : graph->GetAllNodes()) {
auto op_desc = node->GetOpDesc();
GE_IF_BOOL_EXEC(op_desc == nullptr, continue);
@@ -210,9 +218,18 @@ NodePtr MemcpyAddrAsyncPass::CreateMemcpyAddrAsyncNode(const ComputeGraphPtr &gr
return nullptr;
}
int64_t stream_id = out_of_user_data->GetOpDesc()->GetStreamId();
op_desc->SetStreamId(stream_id);
GELOGI("SetStreamId: Node %s assign stream is %ld.", op_desc->GetName().c_str(), stream_id);
string stream_label;
if (AttrUtils::GetStr(out_of_user_data->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label)) {
(void)AttrUtils::SetStr(op_desc, ATTR_NAME_STREAM_LABEL, stream_label);
GELOGD("Node %s set stream label: %s", op_desc->GetName().c_str(), stream_label.c_str());
}
bool rts_label_node = false;
if (AttrUtils::GetBool(out_of_user_data->GetOpDesc(), ATTR_NAME_RTS_LABEL_NODE, rts_label_node)) {
(void)AttrUtils::SetBool(op_desc, ATTR_NAME_RTS_LABEL_NODE, rts_label_node);
GELOGD("Node %s set rts label node attribute", op_desc->GetName().c_str());
}
bool labeled_input = false;
(void)ge::AttrUtils::GetBool(out_of_user_data->GetOpDesc(), ATTR_NAME_NODE_CONNECT_INPUT, labeled_input);
if (labeled_input) {

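The block added above pins the new MemcpyAddrAsync node to the same placement as out_of_user_data by copying its stream id, stream label, and rts-label attribute. A sketch of that propagation factored into a helper (the helper itself is illustrative, not part of the diff):

// Copy stream placement from src to the newly created memcpy op so it runs
// on the same stream and under the same label as the node it serves.
void CopyStreamInfo(const OpDescPtr &src, const OpDescPtr &dst) {
  dst->SetStreamId(src->GetStreamId());
  std::string stream_label;
  if (AttrUtils::GetStr(src, ATTR_NAME_STREAM_LABEL, stream_label)) {
    (void)AttrUtils::SetStr(dst, ATTR_NAME_STREAM_LABEL, stream_label);
  }
  bool rts_label_node = false;
  if (AttrUtils::GetBool(src, ATTR_NAME_RTS_LABEL_NODE, rts_label_node)) {
    (void)AttrUtils::SetBool(dst, ATTR_NAME_RTS_LABEL_NODE, rts_label_node);
  }
}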
@@ -79,6 +79,13 @@ Status MergePass::Run(NodePtr &node) {
return FAILED;
}
}
auto in_node = in_data_nodes.at(0);
if (IsMergeInputNeedOptimized(in_node)) {
if (IsolateAndDeleteNode(in_node, {0}) != SUCCESS) {
GELOGE(FAILED, "Isolate and delete node %s failed.", in_node->GetName().c_str());
return FAILED;
}
}
return IsolateAndDeleteNode(node, merge_io_map);
}
default: {
@@ -172,4 +179,27 @@ Status MergePass::CreateConstByValue(NodePtr &node, int value_index, OpDescPtr &
GE_CHK_STATUS_RET(op_desc->AddOutputDesc(original_out_tensor_desc), "add output desc failed");
return SUCCESS;
}
bool MergePass::IsMergeInputNeedOptimized(NodePtr &node) const {
if (node == nullptr) {
return false;
}
// Skip nodes that were not inserted by MergeInputMemcpyPass.
if ((node->GetType() != MEMCPYASYNC) && (node->GetType() != MEMCPYADDRASYNC)) {
return false;
}
if (node->GetInDataNodes().size() != 1) {
return false;
}
auto in_node = node->GetInDataNodes().at(0);
if (in_node == nullptr) {
return false;
}
// in_node may be the global_step variable.
if ((in_node->GetType() == VARIABLE) || (in_node->GetType() == VARIABLEV2)) {
return false;
}
return true;
}
} // namespace ge
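Reading the two merge_pass hunks together: when a Merge collapses to a single active input, the MemcpyAsync/MemcpyAddrAsync node that MergeInputMemcpyPass inserted on that input is deleted along with the Merge, since it has become a plain pass-through. The variable guard in IsMergeInputNeedOptimized keeps the copy when the producer is a variable such as global_step; this reading is inferred from the comments in the hunk above.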

@@ -28,6 +28,7 @@ class MergePass : public BaseNodePass {
bool IsNeedChangeIndexToConstant(NodePtr &node) const;
Status ChangeIndexToConstant(NodePtr &node, int &value_index);
Status CreateConstByValue(NodePtr &node, int value_index, OpDescPtr &op_desc);
bool IsMergeInputNeedOptimized(NodePtr &node) const;
};
} // namespace ge
#endif // GE_GRAPH_PASSES_MERGE_PASS_H_

@@ -173,14 +173,17 @@ Status NextIterationPass::FindWhileGroups() {
NodePtr next_node = nullptr;
if (FindTargetNode(out_node, NEXTITERATION, true, batch_label, next_node) != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Get NextIteration node failed.");
GELOGE(INTERNAL_ERROR,
"Get NextIteration node failed: inputs of Merge should be Enter/NextIteration, current_Merge=%s",
out_node->GetName().c_str());
return INTERNAL_ERROR;
}
batch_iter.second->merge_next_pairs.emplace_back(std::make_pair(out_node, next_node));
NodePtr switch_node = nullptr;
if (FindTargetNode(out_node, SWITCH, false, batch_label, switch_node) != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Get Switch node failed.");
GELOGE(INTERNAL_ERROR, "Get Switch node failed: output of Merge should be Switch, current_Merge=%s",
out_node->GetName().c_str());
return INTERNAL_ERROR;
}
if (switch_node == nullptr) {
@@ -189,7 +192,9 @@ Status NextIterationPass::FindWhileGroups() {
NodePtr loop_cond = nullptr;
if (FindTargetNode(switch_node, LOOPCOND, true, batch_label, loop_cond) != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Get LoopCond node failed.");
GELOGE(INTERNAL_ERROR,
"Get LoopCond node failed: pred input of Switch should be LoopCond, current_Switch=%s",
switch_node->GetName().c_str());
return INTERNAL_ERROR;
}
if (batch_iter.second->loop_cond == nullptr) {

@@ -117,6 +117,7 @@
#include "graph/passes/variable_op_pass.h"
#include "graph/passes/variable_prepare_op_pass.h"
#include "graph/passes/variable_ref_delete_op_pass.h"
#include "graph/passes/mark_agnostic_pass.h"
namespace ge {
@@ -1626,6 +1627,7 @@ Status GraphPrepare::PrepareOptimize() {
try {
(void)original_graph_passes.AddPass("PrepareOptimize::ShapeOperateOpRemovePass", new ShapeOperateOpRemovePass);
(void)original_graph_passes.AddPass("PrepareOptimize::ReplaceTransShapePass", new ReplaceTransShapePass);
(void)original_graph_passes.AddPass("PrepareOptimize::MarkAgnosticPass", new MarkAgnosticPass);
} catch (std::bad_alloc &e) {
GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs.");
return INTERNAL_ERROR;

@@ -40,8 +40,6 @@ using domi::AippOpParams;
namespace ge {
namespace {
const char *const kMbatchSwitchnName = "mbatch-switch-name";
const int64_t kFormatAgnosticSwitch = 1;
const int64_t kFormatDependInputIndex = 1;
} // namespace
static void ConvertShape2Nhwc(Format &format, vector<int64_t> &shape_vec) {
if ((format == FORMAT_NHWC) || (shape_vec.size() != static_cast<size_t>(NORMAL_TENSOR_SIZE))) {
@@ -269,23 +267,6 @@ Status InsertNewOpUtil::GetAippParams(const std::unique_ptr<domi::AippOpParams>
return SUCCESS;
}
Status InsertNewOpUtil::AddFormatAgnosticAttrToSwitchn(const NodePtr &aipp_node) {
GE_CHECK_NOTNULL(aipp_node);
auto next_nodes = aipp_node->GetOutDataNodes();
for (const auto next_node : next_nodes) {
GE_CHECK_NOTNULL(next_node);
auto op_desc = next_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
if (op_desc->GetType() == SWITCHN) {
GELOGI("Find switchn node [%s] after aipp [%s]", op_desc->GetName().c_str(), aipp_node->GetName().c_str());
(void)AttrUtils::SetInt(op_desc, "_format_agnostic", kFormatAgnosticSwitch);
(void)AttrUtils::SetListInt(op_desc, "_format_agnostic_except_input",
std::vector<int64_t>({kFormatDependInputIndex}));
}
}
return SUCCESS;
}
Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) {
std::map<std::string, NodePtr> switchn_names_to_data;
std::set<NodePtr> updated_switchn;
@@ -300,9 +281,6 @@ Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) {
}
if (node->GetType() == AIPP) {
GE_RETURN_IF_ERROR(UpdatePrevNodeByAipp(node, updated_switchn));
// In dynamic batch/HW and dynamic aipp scenes, switchn should be set format agnostic; otherwise transdata may be
// inserted between aipp and switchn, which increases time and memory cost.
GE_RETURN_IF_ERROR(AddFormatAgnosticAttrToSwitchn(node));
}
if (node->GetType() == CASE && node->GetOpDesc()->HasAttr(ATTR_NAME_BATCH_NUM)) {
multbatch_case = node;

@@ -68,7 +68,6 @@ class InsertNewOpUtil {
void UpdateMultiBatchInputDims(const OpDescPtr &data_opdesc, Format &old_format);
Status UpdatePrevNodeByAipp(NodePtr &node, std::set<NodePtr> &switchns);
Status UpdateDataBySwitchN(const NodePtr &switchn, const NodePtr &data);
Status AddFormatAgnosticAttrToSwitchn(const NodePtr &aipp_node);
Status GetDataRelatedNode(NodePtr &node, std::map<NodePtr, std::set<NodePtr>> &data_next_node_map);
Status GetAllAipps(const NodePtr &data_node, const NodePtr &node, std::vector<NodePtr> &aipps);
Status GetInputOutputInfo(NodePtr &data_node, NodePtr &aipp_node, std::string &input, std::string &output);

@@ -45,16 +45,9 @@ NpuMemoryAllocator *NpuMemoryAllocator::GetAllocator() {
NpuMemoryAllocator::NpuMemoryAllocator(uint32_t device_id) : device_id_(device_id) {}
void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
void *try_reuse_addr = nullptr;
size_t allocate_size = size;
MemStorageType mem_type = HBM;
if (attr != nullptr) {
try_reuse_addr = attr->try_reuse_addr_;
if (attr->padding_ != 0) {
// Round size up to a multiple of attr->padding_, and add one extra attr->padding_.
allocate_size = (size + 2 * attr->padding_ - 1) / attr->padding_ * attr->padding_;
GELOGD("Padding size %zu by %d, final size = %zu.", size, attr->padding_, allocate_size);
}
mem_type = attr->mem_type_;
}
@@ -69,6 +62,17 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
} else if (mem_type == HOST_DDR) {
buffer = malloc(allocate_size);
} else {
void *try_reuse_addr = nullptr;
int padding = kDefaultPadding;
if (attr != nullptr) {
try_reuse_addr = attr->try_reuse_addr_;
if (attr->padding_ > 0) {
padding = attr->padding_;
}
}
// Round size up to a multiple of padding, and add one extra padding.
allocate_size = (size + 2 * padding - 1) / padding * padding;
GELOGD("Padding size %zu by %d, final size = %zu.", size, padding, allocate_size);
buffer = MemManager::Instance()
.CachingInstance(RT_MEMORY_HBM)
.Malloc(allocate_size, reinterpret_cast<uint8_t *>(try_reuse_addr), device_id_);
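A standalone worked example of the padding formula above (values are illustrative):

#include <cstddef>
#include <cstdio>

// Round size up to a multiple of padding, then add one extra padding:
// integer division makes (size + 2 * padding - 1) / padding * padding do both.
static size_t PaddedSize(size_t size, size_t padding) {
  return (size + 2 * padding - 1) / padding * padding;
}

int main() {
  // size = 100, padding = 32: 100 rounds up to 128, plus 32 extra -> 160.
  std::printf("%zu\n", PaddedSize(100, 32));  // prints 160
  return 0;
}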

@@ -120,11 +120,13 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) {
node_item.NodeName().c_str(),
output_idx,
output_tensor->GetSize());
if (tensor_size > 0) {
GE_CHK_RT_RET(rtMemcpy(host_buffer.data(),
tensor_size,
output_tensor->GetData(),
tensor_size,
RT_MEMCPY_DEVICE_TO_HOST));
}
tensor.SetData(std::move(host_buffer));
string session_id = std::to_string(context_->GetSessionId());
RuntimeInferenceContext *runtime_infer_ctx = nullptr;

@@ -257,7 +257,7 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s
}
// The cond or branch input needs to be prepared before the execution of IF or CASE.
if (node_item.node_type == IF || node_item.node_type == CASE) {
if (node_item.node_type == IF || node_item.node_type == STATELESSIF || node_item.node_type == CASE) {
const auto &in_anchor = ge_node->GetInDataAnchor(0);
GE_CHECK_NOTNULL(in_anchor);
const auto &peer_anchor = in_anchor->GetPeerOutAnchor();
@@ -701,6 +701,9 @@ Status HybridModelBuilder::LoadGraph() {
GE_CHK_STATUS_RET(IdentifyVariableOutputs(*parent_node_item),
"[%s] Failed to identify ref outputs.",
parent_node_item->NodeName().c_str());
GE_CHK_STATUS_RET(IdentifySameInputs(*parent_node_item),
"[%s] Failed to identify same outputs.",
parent_node_item->NodeName().c_str());
// If the parent is a functional control op, a virtual PartitionedCall needs to be added.
if (parent_node_item->IsControlOp()) {
@@ -917,7 +920,7 @@ Status HybridModelBuilder::LoadGeModel(ComputeGraph &sub_graph, const GeModelPtr
auto parent_node = sub_graph.GetParentNode();
GE_CHECK_NOTNULL(parent_node);
auto op_type = parent_node->GetType();
if (op_type == IF || op_type == CASE || op_type == WHILE) {
if (IsControlOp(op_type)) {
GELOGD("Set ge_model for control op subgraph: [%s], task_size = %d",
sub_graph.GetName().c_str(),
ge_model->GetModelTaskDefPtr()->task_size());
@@ -1162,6 +1165,46 @@ Status HybridModelBuilder::InitRuntimeParams() {
return SUCCESS;
}
Status HybridModelBuilder::IdentifySameInputs(NodeItem &node_item) {
GELOGD("Start to parse same inputs on net output: %s", node_item.NodeName().c_str());
auto subgraph = NodeUtils::GetSubgraph(*node_item.node, kSubgraphIndex);
GE_CHECK_NOTNULL(subgraph);
auto net_output_node = subgraph->FindFirstNodeMatchType(NETOUTPUT);
if (net_output_node == nullptr) {
GELOGD("Subgraph [%s] does not have net output", subgraph->GetName().c_str());
return SUCCESS;
}
auto net_output_desc = net_output_node->GetOpDesc();
GE_CHECK_NOTNULL(net_output_desc);
std::map<std::string, int> connected_inputs;
for (const auto &in_data_anchor : net_output_node->GetAllInDataAnchors()) {
auto out_data_anchor = in_data_anchor->GetPeerOutAnchor();
if (out_data_anchor == nullptr) {
continue;
}
auto src_node = out_data_anchor->GetOwnerNode();
GE_CHECK_NOTNULL(src_node);
auto op_desc = src_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
std::string input_key = std::to_string(op_desc->GetId()) + "_" + std::to_string(out_data_anchor->GetIdx());
auto it = connected_inputs.find(input_key);
if (it == connected_inputs.end()) {
connected_inputs.emplace(input_key, in_data_anchor->GetIdx());
} else {
GELOGD("[%s] output [%d] reuse output [%d] input node = %s, idx = %d.", node_item.NodeName().c_str(),
in_data_anchor->GetIdx(),
it->second,
src_node->GetName().c_str(),
out_data_anchor->GetIdx());
node_item.reuse_outputs.emplace(in_data_anchor->GetIdx(), it->second);
}
}
return SUCCESS;
}
Status HybridModelBuilder::IdentifyVariableOutputs(NodeItem &node_item) {
GELOGD("Start to parse outputs of node: %s", node_item.NodeName().c_str());
auto subgraph = NodeUtils::GetSubgraph(*node_item.node, kSubgraphIndex);

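A minimal standalone sketch of the duplicate-input detection in IdentifySameInputs above, with (node id, output index) pairs standing in for the real OpDesc/anchor objects:

#include <cstdio>
#include <map>
#include <string>
#include <utility>

int main() {
  // NetOutput inputs, each fed by (source node id, source output index).
  const std::pair<int, int> inputs[] = {{7, 0}, {9, 1}, {7, 0}};
  std::map<std::string, int> connected_inputs;  // key -> first input index
  std::map<int, int> reuse_outputs;             // duplicate -> first index
  for (int idx = 0; idx < 3; ++idx) {
    std::string key = std::to_string(inputs[idx].first) + "_" +
                      std::to_string(inputs[idx].second);
    auto it = connected_inputs.find(key);
    if (it == connected_inputs.end()) {
      connected_inputs.emplace(key, idx);
    } else {
      reuse_outputs.emplace(idx, it->second);  // input 2 reuses input 0 here
    }
  }
  std::printf("reuse pairs: %zu\n", reuse_outputs.size());  // prints 1
  return 0;
}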
@@ -59,6 +59,7 @@ class HybridModelBuilder {
Status LoadGeModel(ComputeGraph &graph, const GeModelPtr &ge_model);
Status LoadTasks();
Status IdentifyVariableOutputs(NodeItem &node_item);
Status IdentifySameInputs(NodeItem &node_item);
Status BuildNodeItem(const NodePtr &node, NodeItem &node_item);
Status GetOrCreateNodeItem(const NodePtr &node, NodeItem **node_item);
Status ParseDependentInputNodes(NodeItem &node_item, const std::vector<string> &dependencies);

@@ -28,6 +28,9 @@ namespace hybrid {
namespace {
const char * const kAttrNameOriginalFusionGraph = "_original_fusion_graph";
const char * const kNodeTypeRetVal = "_RetVal";
std::set<std::string> kControlOpTypes {
IF, STATELESSIF, CASE, WHILE, STATELESSWHILE
};
Status ParseInputMapping(Node &node, OpDesc &op_desc, FusedSubgraph &fused_subgraph) {
uint32_t parent_index = 0;
@@ -102,6 +105,11 @@ Status ParseFusedSubgraph(NodeItem &node_item) {
return SUCCESS;
}
} // namespace
bool IsControlOp(const std::string &op_type) {
return kControlOpTypes.count(op_type) > 0;
}
NodeItem::NodeItem(NodePtr node): node(std::move(node)) {
this->op_desc = this->node->GetOpDesc().get();
this->node_id = this->op_desc->GetId();
@@ -153,8 +161,7 @@ Status NodeItem::Init() {
}
bool NodeItem::IsControlOp() const {
auto op_type = op_desc->GetType();
return op_type == IF || op_type == CASE || op_type == WHILE || op_type == FOR;
return ge::hybrid::IsControlOp(op_desc->GetType());
}
std::string NodeItem::DebugString() const {

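Two things are worth noting in the hunk above: NodeItem::IsControlOp now delegates to the shared set-based helper, and relative to the old inline check the set adds STATELESSIF and STATELESSWHILE while dropping FOR. A standalone sketch of the helper, with plain strings assumed in place of the GE type constants:

#include <set>
#include <string>

// Set-based control-op check; the literal values of IF, STATELESSIF, etc. are
// assumptions here (they are GE string constants in the real code).
static const std::set<std::string> kControlOpTypes{
    "If", "StatelessIf", "Case", "While", "StatelessWhile"};

bool IsControlOp(const std::string &op_type) {
  return kControlOpTypes.count(op_type) > 0;
}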
@@ -36,6 +36,8 @@ struct FusedSubgraph {
ComputeGraphPtr graph;
};
bool IsControlOp(const std::string &op_type);
// for caching static information across execution
struct NodeItem {
explicit NodeItem(NodePtr node);
@@ -83,6 +85,7 @@ struct NodeItem {
const NodeExecutor *node_executor = nullptr;
std::map<int, ge::NodePtr> ref_outputs;
std::map<int, int> reuse_inputs;
std::map<int, int> reuse_outputs;
std::vector<bool> is_input_shape_static;
bool is_output_shape_static = true;

@@ -156,6 +156,13 @@ Status AiCoreNodeExecutor::CompileTask(const HybridModel &model,
Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeTaskExecuteAsync] Start");
if (IsNoOp(context)) {
GELOGD("[%s] Skipping execution for op with empty outputs", context.GetNodeName());
auto ret = context.TryExecuteCallback(done_callback);
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeTaskExecuteAsync] End");
return ret;
}
auto op_desc = context.GetNodeItem().op_desc;
GE_CHECK_NOTNULL(op_desc);
GELOGI("[%s] ExecuteAsync Start.", op_desc->GetName().c_str());
@@ -219,5 +226,18 @@ bool AiCoreNodeTask::IsSupportDynamicShape() {
return true;
}
bool AiCoreNodeTask::IsNoOp(TaskContext &task_context) {
for (int i = 0; i < task_context.NumOutputs(); ++i) {
const auto &tensor_desc = task_context.MutableOutputDesc(i);
GE_CHECK_NOTNULL(tensor_desc);
const auto &shape = tensor_desc->MutableShape();
if (shape.IsScalar() || shape.GetShapeSize() > 0) {
return false;
}
}
return true;
}
} // namespace hybrid
} // namespace ge
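A standalone sketch of the empty-output test behind IsNoOp, with a plain dim vector standing in for GeShape:

#include <cstdint>
#include <vector>

// A tensor is "empty" only if it is non-scalar and has zero elements; a
// scalar always carries one element, so it counts as real output.
static bool IsEmptyTensor(const std::vector<int64_t> &dims) {
  if (dims.empty()) {
    return false;  // scalar
  }
  int64_t size = 1;
  for (int64_t d : dims) {
    size *= d;
  }
  return size == 0;  // some dimension is zero: nothing to compute
}

// The node is a no-op when every output tensor is empty; ExecuteAsync then
// skips the kernel launch and only runs the done callback.
static bool IsNoOp(const std::vector<std::vector<int64_t>> &output_dims) {
  for (const auto &dims : output_dims) {
    if (!IsEmptyTensor(dims)) {
      return false;
    }
  }
  return true;
}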

@@ -52,6 +52,7 @@ class AiCoreNodeTask : public NodeTask {
Status UpdateArgs(TaskContext &context) override;
Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
private:
static bool IsNoOp(TaskContext &task_context);
std::vector<std::unique_ptr<AiCoreOpTask>> tasks_;
};

@@ -404,11 +404,11 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model,
unique_ptr<ControlOpNodeTask> node_task;
auto node_type = node->GetType();
if (node_type == IF) {
if (node_type == IF || node_type == STATELESSIF) {
node_task.reset(new(std::nothrow) IfOpNodeTask());
} else if (node_type == CASE) {
node_task.reset(new(std::nothrow) CaseOpNodeTask());
} else if (node_type == WHILE) {
} else if (node_type == WHILE || node_type == STATELESSWHILE) {
node_task.reset(new(std::nothrow) WhileOpNodeTask());
} else {
GELOGE(PARAM_INVALID, "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str());

Some files were not shown because too many files have changed in this diff
