!1188 dynamic shape overflow

From: @zhou_chao1993
Reviewed-by: @xchu42, @ji_chen
Signed-off-by: @ji_chen
pull/1188/MERGE
Committed by mindspore-ci-bot via Gitee, 4 years ago
commit 1b845b9ac2

@ -103,6 +103,7 @@ set(TRAIN_SRC_LIST
"common/profiling/profiling_manager.cc"
"common/dump/dump_manager.cc"
"common/dump/dump_properties.cc"
"common/dump/opdebug_register.cc"
"common/dump/dump_op.cc"
"common/profiling/ge_profiling.cc"
"common/profiling/ge_runner_profiling.cc"
@ -427,6 +428,7 @@ set(INFER_SRC_LIST
"common/dump/dump_properties.cc"
"common/dump/dump_manager.cc"
"common/dump/dump_op.cc"
"common/dump/opdebug_register.cc"
"common/dump/dump_server.cc"
"common/helper/model_cache_helper.cc"
"ge_local_engine/engine/host_cpu_engine.cc"

@ -104,8 +104,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status DumpManager::SetDumpConf
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const DumpProperties &DumpManager::GetDumpProperties(
uint64_t session_id) {
std::lock_guard<std::mutex> lock(mutex_);
- // If session_id is not found in dump_properties_map_, operator[] will insert one.
- return dump_properties_map_[session_id];
+ auto iter = dump_properties_map_.find(session_id);
+ if (iter != dump_properties_map_.end()) {
+ return iter->second;
+ }
+ static DumpProperties default_properties;
+ return default_properties;
}
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void DumpManager::AddDumpProperties(
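
The hunk above is the behavioral fix in this file: `std::map::operator[]` default-constructs and inserts an entry for every key it does not find, so a read-only query for an unknown session_id silently grew dump_properties_map_ under the lock. The find-based version reads without mutating and falls back to a function-local static default, which also keeps the returned reference valid. A minimal standalone sketch of the difference (plain C++, not GE code; `Properties` stands in for `DumpProperties`):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Stand-in for ge::DumpProperties.
struct Properties {
  std::string path;
};

int main() {
  std::map<uint64_t, Properties> props;
  props[1] = {"/tmp/dump"};

  // operator[] default-constructs and inserts a value for a missing key,
  // so every lookup with an unknown session id grows the map.
  Properties &inserted = props[42];
  std::cout << "size after operator[]: " << props.size() << '\n';  // 2

  // find() only reads; misses fall back to a shared default instance,
  // the pattern the patched GetDumpProperties() uses.
  static Properties default_props;
  auto it = props.find(43);
  const Properties &found = (it != props.end()) ? it->second : default_props;
  std::cout << "size after find: " << props.size() << '\n';  // still 2
  (void)inserted;
  (void)found;
  return 0;
}
```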

@ -219,9 +219,9 @@ Status DumpOp::LaunchDumpOp() {
op_mapping_info.set_dump_path(dump_path);
op_mapping_info.set_flag(kAicpuLoadFlag);
op_mapping_info.set_dump_step(dump_properties_.GetDumpStep());
- if (!dynamic_model_name_.empty()) {
- op_mapping_info.set_model_name(dynamic_model_name_);
- op_mapping_info.set_model_id(dynamic_model_id_);
+ if (!dynamic_model_name_.empty() && dump_properties_.IsDumpOpen()) {
+ op_mapping_info.set_model_name(dynamic_model_name_);
}
SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info);
GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(),
@ -253,7 +253,7 @@ Status DumpOp::LaunchDumpOp() {
}
op_mapping_info.mutable_task()->Add(std::move(task));
}
- if (dump_properties_.GetDumpMode() == kDumpAll) {
+ if (dump_properties_.GetDumpMode() == kDumpAll || dump_properties_.IsOpDebugOpen()) {
auto ret = DumpOutput(task);
if (ret != SUCCESS) {
GELOGE(ret, "Dump output failed when in dumping all");

@ -81,11 +81,11 @@ class DumpProperties {
const std::string &GetEnableDumpDebug() const {return enable_dump_debug_;}
private:
void CopyFrom(const DumpProperties &other);
void SetDumpDebugOptions();
std::string enable_dump_;
std::string enable_dump_debug_;

@ -0,0 +1,148 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "opdebug_register.h"
namespace {
const size_t kOpDebugMemorySize = 2048UL;
const size_t kDebugP2pSize = 8UL;
} // namespace
namespace ge {
OpdebugRegister::~OpdebugRegister() {}
Status OpdebugRegister::RegisterDebugForModel(rtModel_t model_handle, uint32_t op_debug_mode, DataDumper &data_dumper) {
GELOGD("Start to register debug for model in overflow");
auto ret = MallocMemForOpdebug();
if (ret != SUCCESS) {
GELOGE(ret, "Malloc memory for opdebug in model overflow failed ,ret:0x%X", ret);
return ret;
}
uint32_t debug_stream_id = 0;
uint32_t debug_task_id = 0;
auto rt_ret = rtDebugRegister(model_handle, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtDebugRegister error, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
GELOGD("debug_task_id:%u, debug_stream_id:%u in model overflow", debug_task_id, debug_stream_id);
data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true);
return SUCCESS;
}
void OpdebugRegister::UnregisterDebugForModel(rtModel_t model_handle) {
rtError_t rt_ret = RT_ERROR_NONE;
if (model_handle != nullptr) {
GELOGD("start to call rtDebugUnRegister in model overflow.");
rt_ret = rtDebugUnRegister(model_handle);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("rtDebugUnRegister failed, ret: 0x%X", rt_ret);
}
}
if (op_debug_addr_ != nullptr) {
rt_ret = rtFree(op_debug_addr_);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("rtFree failed, ret: 0x%X", rt_ret);
}
op_debug_addr_ = nullptr;
}
if (p2p_debug_addr_ != nullptr) {
rt_ret = rtFree(p2p_debug_addr_);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("rtFree failed, ret: 0x%X", rt_ret);
}
p2p_debug_addr_ = nullptr;
}
return;
}
Status OpdebugRegister::RegisterDebugForStream(rtStream_t stream, uint32_t op_debug_mode, DataDumper &data_dumper) {
GELOGD("Start to register debug for stream in stream overflow");
auto ret = MallocMemForOpdebug();
if (ret != SUCCESS) {
GELOGE(ret, "Malloc memory for opdebug in stream overflow ,ret:0x%X", ret);
return ret;
}
uint32_t debug_stream_id = 0;
uint32_t debug_task_id = 0;
#ifdef ONLY_COMPILE_OPEN_SRC
auto rt_ret = rtDebugRegisterForStream(stream, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtDebugRegisterForStream error, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
#endif
GELOGD("debug_task_id:%u, debug_stream_id:%u in stream overflow.", debug_task_id, debug_stream_id);
data_dumper.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, true);
return SUCCESS;
}
void OpdebugRegister::UnregisterDebugForStream(rtStream_t stream) {
rtError_t rt_ret = RT_ERROR_NONE;
#ifdef ONLY_COMPILE_OPEN_SRC
if (stream != nullptr) {
GELOGD("start call rtDebugUnRegisterForStream in unknown shape over flow.");
rt_ret = rtDebugUnRegisterForStream(stream);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("rtDebugUnRegisterForStream failed, ret: 0x%X", rt_ret);
}
}
#endif
if (op_debug_addr_ != nullptr) {
rt_ret = rtFree(op_debug_addr_);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("rtFree failed, ret: 0x%X", rt_ret);
}
op_debug_addr_ = nullptr;
}
if (p2p_debug_addr_ != nullptr) {
rt_ret = rtFree(p2p_debug_addr_);
if (rt_ret != RT_ERROR_NONE) {
GELOGW("rtFree failed, ret: 0x%X", rt_ret);
}
p2p_debug_addr_ = nullptr;
}
return;
}
Status OpdebugRegister::MallocMemForOpdebug() {
rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
uint64_t debug_addrs_tmp = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(op_debug_addr_));
// For data dump, aicpu needs the pointer to pointer that saves the real debug address.
rt_ret = rtMalloc(&p2p_debug_addr_, kDebugP2pSize, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
rt_ret = rtMemcpy(p2p_debug_addr_, sizeof(uint64_t), &debug_addrs_tmp, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtMemcpy to p2p_addr error: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
return SUCCESS;
}
} // namespace ge
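
MallocMemForOpdebug wires up one level of indirection: a 2 KB buffer that receives the overflow records, plus an 8-byte cell holding that buffer's address, which aicpu dereferences to locate the records (hence the pointer-to-pointer comment above). A host-only sketch of the same wiring, with `malloc`/`memcpy` standing in for `rtMalloc`/`rtMemcpy` and ordinary heap memory standing in for DDR and HBM:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  // Stand-ins for the two rtMalloc'd regions: a 2 KB overflow record
  // buffer (DDR on device) and an 8-byte address cell (HBM on device).
  const std::size_t kOpDebugMemorySize = 2048;
  void *op_debug_addr = std::malloc(kOpDebugMemorySize);
  void *p2p_debug_addr = std::malloc(sizeof(uint64_t));

  // Store the record buffer's address into the cell, mirroring the
  // rtMemcpy(HOST_TO_DEVICE) in MallocMemForOpdebug; aicpu dereferences
  // the cell to find where the overflow records live.
  uint64_t addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(op_debug_addr));
  std::memcpy(p2p_debug_addr, &addr, sizeof(uint64_t));

  uint64_t read_back = 0;
  std::memcpy(&read_back, p2p_debug_addr, sizeof(uint64_t));
  std::printf("cell points at %p\n",
              reinterpret_cast<void *>(static_cast<uintptr_t>(read_back)));

  std::free(p2p_debug_addr);
  std::free(op_debug_addr);
  return 0;
}
```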

@ -0,0 +1,44 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GE_COMMON_DUMP_OPDEBUG_REGISTER_H_
#define GE_COMMON_DUMP_OPDEBUG_REGISTER_H_
#include <map>
#include "common/debug/ge_log.h"
#include "common/debug/log.h"
#include "graph/load/model_manager/data_dumper.h"
namespace ge {
class OpdebugRegister {
public:
OpdebugRegister() = default;
~OpdebugRegister();
Status RegisterDebugForModel(rtModel_t model_handle, uint32_t op_debug_mode, DataDumper &data_dumper);
void UnregisterDebugForModel(rtModel_t model_handle);
Status RegisterDebugForStream(rtStream_t stream, uint32_t op_debug_mode, DataDumper &data_dumper);
void UnregisterDebugForStream(rtStream_t stream);
private:
Status MallocMemForOpdebug();
void *op_debug_addr_ = nullptr;
void *p2p_debug_addr_ = nullptr;
};
} // namespace ge
#endif // GE_COMMON_DUMP_OPDEBUG_REGISTER_H_
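
Worth noting: `~OpdebugRegister()` is empty, so the buffers allocated by MallocMemForOpdebug are released only inside UnregisterDebugForModel/UnregisterDebugForStream, and every owner must pair a successful register with an unregister (DavinciModel and HybridModelAsyncExecutor do this via an `is_op_debug_reg_` flag). A hedged RAII sketch of that pairing, with `puts` standing in for the runtime calls; the real class leaves the unregister explicit:

```cpp
#include <cstdio>

// Minimal RAII pairing; Register()/Unregister() stand in for
// RegisterDebugFor{Model,Stream} / UnregisterDebugFor{Model,Stream}.
class DebugScope {
 public:
  explicit DebugScope(bool op_debug_open) {
    if (op_debug_open) {
      std::puts("register debug");    // allocate buffers, rtDebugRegister
      registered_ = true;             // mirrors is_op_debug_reg_
    }
  }
  ~DebugScope() {
    if (registered_) {
      std::puts("unregister debug");  // rtDebugUnRegister, free buffers
    }
  }

 private:
  bool registered_ = false;
};

int main() {
  DebugScope scope(true);  // unregisters automatically on scope exit
  return 0;
}
```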

@ -17,6 +17,7 @@ set(SRC_LIST
"../common/dump/dump_properties.cc"
"../common/dump/dump_manager.cc"
"../common/dump/dump_op.cc"
"../common/dump/opdebug_register.cc"
"../common/profiling/ge_profiling.cc"
"../graph/load/graph_loader.cc"
"../graph/execute/graph_execute.cc"

@ -36,21 +36,9 @@
namespace ge {
class DataDumper {
public:
- explicit DataDumper(const RuntimeParam &rsh)
- : model_name_(),
- model_id_(0),
- runtime_param_(rsh),
- dev_mem_load_(nullptr),
- dev_mem_unload_(nullptr),
- op_list_(),
- input_map_(),
- load_flag_(false),
- device_id_(0),
- global_step_(0),
- loop_per_iter_(0),
- loop_cond_(0),
- compute_graph_(nullptr),
- ref_info_() {}
+ DataDumper() : runtime_param_{} {}
+ explicit DataDumper(const RuntimeParam &rsh) : runtime_param_(rsh) {}
~DataDumper();
@ -105,10 +93,10 @@ class DataDumper {
// for inference data dump
std::string om_name_;
- uint32_t model_id_;
+ uint32_t model_id_ = 0;
const RuntimeParam &runtime_param_;
- void *dev_mem_load_;
- void *dev_mem_unload_;
+ void *dev_mem_load_ = nullptr;
+ void *dev_mem_unload_ = nullptr;
struct InnerDumpInfo;
struct InnerInputMapping;
@ -119,16 +107,15 @@ class DataDumper {
uint32_t end_graph_stream_id_ = 0;
bool is_end_graph_ = false;
std::multimap<std::string, InnerInputMapping> input_map_; // release after DavinciModel::Init
- bool load_flag_;
- uint32_t device_id_;
- uintptr_t global_step_;
- uintptr_t loop_per_iter_;
- uintptr_t loop_cond_;
- ComputeGraphPtr compute_graph_; // release after DavinciModel::Init
+ bool load_flag_ = false;
+ uint32_t device_id_ = 0;
+ uintptr_t global_step_ = 0;
+ uintptr_t loop_per_iter_ = 0;
+ uintptr_t loop_cond_ = 0;
+ ComputeGraphPtr compute_graph_ = nullptr; // release after DavinciModel::Init
std::map<OpDescPtr, void *> ref_info_; // release after DavinciModel::Init
void *l1_fusion_addr_ = nullptr;
uint32_t op_debug_task_id_ = 0;
uint32_t op_debug_stream_id_ = 0;
void *op_debug_addr_ = nullptr;
@ -150,14 +137,10 @@ class DataDumper {
void SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr,
aicpu::dump::OpMappingInfo &op_mapping_info);
Status ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info);
- Status GenerateInput(aicpu::dump::Input &input,
- const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
- const uintptr_t &addr,
- size_t index);
- Status GenerateOutput(aicpu::dump::Output &output,
- const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
- const uintptr_t &addr,
- size_t index);
+ Status GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
+ const uintptr_t &addr, size_t index);
+ Status GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor<GeTensorDesc> &tensor_descs,
+ const uintptr_t &addr, size_t index);
void GenerateOpBuffer(const int64_t &size, aicpu::dump::Task &task);
};
struct DataDumper::InnerDumpInfo {
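
The constructor cleanup above moves thirteen init-list entries onto the member declarations, so the new default constructor and the existing RuntimeParam constructor share one set of defaults. One caveat worth flagging: `runtime_param_` is declared as `const RuntimeParam &`, and `DataDumper() : runtime_param_{}` binds that reference to a temporary that dies when the constructor returns, leaving it dangling; the sketch below sidesteps this by using a value member. Names are illustrative, not the real DataDumper:

```cpp
#include <cstdint>

struct RuntimeParam {
  int dummy = 0;
};

class Dumper {
 public:
  Dumper() = default;                                              // NSDMI defaults apply
  explicit Dumper(const RuntimeParam &rp) : runtime_param_(rp) {}  // same defaults otherwise
  bool Loaded() const { return load_flag_; }

 private:
  RuntimeParam runtime_param_;    // a const reference in the real class; a value here
  uint32_t model_id_ = 0;         // defaults live with the declarations,
  void *dev_mem_load_ = nullptr;  // so every constructor picks them up
  void *dev_mem_unload_ = nullptr;
  bool load_flag_ = false;
  uint32_t device_id_ = 0;
  uintptr_t global_step_ = 0;
};

int main() {
  RuntimeParam rp;
  Dumper by_default;    // all members take their declared defaults
  Dumper by_param(rp);  // only runtime_param_ differs
  return by_default.Loaded() == by_param.Loaded() ? 0 : 1;
}
```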

@ -232,6 +232,8 @@ DavinciModel::~DavinciModel() {
FreeP2PMem();
+ OpDebugUnRegister();
if (l1_fusion_addr_ != nullptr) {
GE_CHK_RT(rtFree(l1_fusion_addr_));
}
@ -242,8 +244,6 @@ DavinciModel::~DavinciModel() {
}
}
- OpDebugUnRegister();
ReleaseTask();
CleanTbeHandle();
@ -568,77 +568,21 @@ Status DavinciModel::SetTSDevice() {
}
Status DavinciModel::OpDebugRegister() {
- bool is_op_debug = false;
- (void)ge::AttrUtils::GetBool(ge_model_, ATTR_OP_DEBUG_FLAG, is_op_debug);
- GELOGD("The value of op debug in ge_model is %d.", is_op_debug);
- if (is_op_debug) {
- debug_reg_mutex_.lock();
- rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
- return RT_ERROR_TO_GE_STATUS(rt_ret);
- }
- uint64_t debug_addrs_tmp = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(op_debug_addr_));
- // For data dump, aicpu needs the pointer to pointer that saves the real debug address.
- rt_ret = rtMalloc(&p2p_debug_addr_, kDebugP2pSize, RT_MEMORY_HBM);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGE(RT_FAILED, "rtMalloc error, ret: 0x%X", rt_ret);
- return RT_ERROR_TO_GE_STATUS(rt_ret);
- }
- rt_ret = rtMemcpy(p2p_debug_addr_, sizeof(uint64_t), &debug_addrs_tmp, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGE(RT_FAILED, "rtMemcpy to p2p_addr error: 0x%X", rt_ret);
- return RT_ERROR_TO_GE_STATUS(rt_ret);
- }
- uint32_t op_debug_mode = 0;
- (void)ge::AttrUtils::GetInt(ge_model_, ATTR_OP_DEBUG_MODE, op_debug_mode);
- GELOGD("The value of op_debug_mode in ge_model_ is %u.", op_debug_mode);
- uint32_t debug_task_id = 0;
- uint32_t debug_stream_id = 0;
- rt_ret = rtDebugRegister(rt_model_handle_, op_debug_mode, op_debug_addr_, &debug_stream_id, &debug_task_id);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGE(RT_FAILED, "rtDebugRegister error, ret: 0x%X", rt_ret);
- return RT_ERROR_TO_GE_STATUS(rt_ret);
+ if (GetDumpProperties().IsOpDebugOpen()) {
+ uint32_t op_debug_mode = GetDumpProperties().GetOpDebugMode();
+ auto ret = opdebug_register_.RegisterDebugForModel(rt_model_handle_, op_debug_mode, data_dumper_);
+ if (ret != SUCCESS) {
+ GELOGE(ret, "Register known shape op debug failed, ret: 0x%X", ret);
+ return ret;
}
- GELOGI("debug_task_id:%d, debug_stream_id:%u", debug_task_id, debug_stream_id);
is_op_debug_reg_ = true;
- data_dumper_.SaveOpDebugId(debug_task_id, debug_stream_id, p2p_debug_addr_, is_op_debug);
}
return SUCCESS;
}
void DavinciModel::OpDebugUnRegister() {
if (is_op_debug_reg_) {
- debug_reg_mutex_.unlock();
- rtError_t rt_ret = RT_ERROR_NONE;
- if (rt_model_handle_ != nullptr) {
- GELOGD("start call debug_unregister.");
- rt_ret = rtDebugUnRegister(rt_model_handle_);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGW("rtDebugUnRegister failed, ret: 0x%X", rt_ret);
- }
- }
- if (op_debug_addr_ != nullptr) {
- rt_ret = rtFree(op_debug_addr_);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGW("rtFree failed, ret: 0x%X", rt_ret);
- }
- op_debug_addr_ = nullptr;
- }
- if (p2p_debug_addr_ != nullptr) {
- rt_ret = rtFree(p2p_debug_addr_);
- if (rt_ret != RT_ERROR_NONE) {
- GELOGW("rtFree failed, ret: 0x%X", rt_ret);
- }
- p2p_debug_addr_ = nullptr;
- }
+ opdebug_register_.UnregisterDebugForModel(rt_model_handle_);
is_op_debug_reg_ = false;
}
return;

@ -29,6 +29,7 @@
#include "common/helper/om_file_helper.h"
#include "common/opskernel/ge_task_info.h"
#include "common/properties_manager.h"
#include "common/dump/opdebug_register.h"
#include "common/types.h"
#include "framework/common/util.h"
#include "graph/debug/ge_attr_define.h"
@ -984,6 +985,7 @@ class DavinciModel {
int64_t maxDumpOpNum_;
// for data dump
DataDumper data_dumper_;
OpdebugRegister opdebug_register_;
uint64_t iterator_count_;
bool is_l1_fusion_enable_;
map<OpDescPtr, void *> saved_task_addrs_; // release after DavinciModel::Init
@ -1021,8 +1023,6 @@ class DavinciModel {
// for op debug
mutex debug_reg_mutex_;
bool is_op_debug_reg_ = false;
void *op_debug_addr_ = nullptr;
void *p2p_debug_addr_ = nullptr;
bool is_online_infer_dynamic_ = false;
bool is_getnext_sink_dynamic_ = false;
vector<int32_t> cur_dynamic_dims_;

@ -85,6 +85,10 @@ Status HybridModelAsyncExecutor::Stop() {
ret = future_.get();
}
if (is_op_debug_reg_) {
op_debug_register_.UnregisterDebugForStream(stream_);
}
if (stream_ != nullptr) {
GE_CHK_RT(rtStreamDestroy(stream_));
stream_ = nullptr;
@ -101,6 +105,7 @@ Status HybridModelAsyncExecutor::Init() {
executor_ = std::unique_ptr<HybridModelExecutor>(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_));
GE_CHECK_NOTNULL(executor_);
GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine");
GE_CHK_STATUS_RET(DumpOpDebug(),"Dump op debug failed in hybrid engine");
GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups());
if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) {
@ -508,5 +513,40 @@ Status HybridModelAsyncExecutor::Execute(const vector<GeTensor> &inputs, vector<
return SUCCESS;
}
Status HybridModelAsyncExecutor::DumpOpDebug() {
const DumpProperties &dump_properties = executor_->GetContext()->dump_properties;
if (dump_properties.IsOpDebugOpen()) {
GELOGD("Opdebug is open in hybrid engine");
uint32_t op_debug_mode = dump_properties.GetOpDebugMode();
GE_CHK_STATUS_RET(op_debug_register_.RegisterDebugForStream(stream_, op_debug_mode, data_dumper_), "Register debug for stream failed");
is_op_debug_reg_ = true;
data_dumper_.SetDumpProperties(dump_properties);
data_dumper_.SetModelName(model_->GetModelName());
data_dumper_.SetModelId(model_->GetModelId());
data_dumper_.SetDeviceId(model_->GetDeviceId());
void *global_step = nullptr;
TensorValue *variable_global_step = model_->GetVariable(NODE_NAME_GLOBAL_STEP);
if (variable_global_step != nullptr) {
global_step = const_cast<void *>(variable_global_step->GetData());
}
void *loop_per_iter = nullptr;
TensorValue *variable_loop_per_iter = model_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_PER_ITER);
if (variable_loop_per_iter != nullptr) {
loop_per_iter = const_cast<void *>(variable_loop_per_iter->GetData());
}
void *loop_cond = nullptr;
TensorValue *variable_loop_cond = model_->GetVariable(NODE_NAME_FLOWCTRL_LOOP_COND);
if (variable_loop_cond != nullptr) {
loop_cond = const_cast<void *>(variable_loop_cond->GetData());
}
}
data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond);
GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "LoadDumpInfo failed in hybrid engine");
GELOGD("Dump op debug SUCCESS in hybrid engine");
}
return SUCCESS;
}
} // namespace hybrid
} // namespace ge
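
DumpOpDebug above resolves three optional flow-control variables (NODE_NAME_GLOBAL_STEP, NODE_NAME_FLOWCTRL_LOOP_PER_ITER, NODE_NAME_FLOWCTRL_LOOP_COND) and hands the dumper a null address for any that the model does not contain. A standalone sketch of that null-safe lookup; the `std::map` stands in for `HybridModel::GetVariable` and `TensorValue` is reduced to a data pointer:

```cpp
#include <cstdio>
#include <map>
#include <string>

// TensorValue reduced to its data pointer.
struct TensorValue {
  const void *data = nullptr;
  const void *GetData() const { return data; }
};

// Stand-in for HybridModel::GetVariable: nullptr when the variable is absent.
const TensorValue *GetVariable(const std::map<std::string, TensorValue> &vars,
                               const std::string &name) {
  auto it = vars.find(name);
  return it == vars.end() ? nullptr : &it->second;
}

int main() {
  int step_storage = 0;
  std::map<std::string, TensorValue> vars{{"global_step", {&step_storage}}};

  void *global_step = nullptr;
  if (const TensorValue *v = GetVariable(vars, "global_step")) {
    global_step = const_cast<void *>(v->GetData());
  }
  void *loop_cond = nullptr;  // "loop_cond" is absent, so this stays null
  if (const TensorValue *v = GetVariable(vars, "loop_cond")) {
    loop_cond = const_cast<void *>(v->GetData());
  }
  std::printf("global_step=%p loop_cond=%p\n", global_step, loop_cond);
  return 0;
}
```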

@ -21,7 +21,9 @@
#include <future>
#include "external/ge/ge_api_error_codes.h"
#include "external/ge/ge_api_types.h"
#include "common/dump/opdebug_register.h"
#include "graph/load/model_manager/data_inputer.h"
#include "graph/load/model_manager/data_dumper.h"
#include "hybrid/executor/hybrid_model_executor.h"
#include "hybrid/executor/hybrid_model_pipeline_executor.h"
#include "runtime/stream.h"
@ -77,6 +79,8 @@ class HybridModelAsyncExecutor {
Status PrepareInputs(const InputData &current_data, HybridModelExecutor::ExecuteArgs &args);
Status DumpOpDebug();
std::mutex mu_;
HybridModel *model_;
uint32_t device_id_ = 0U;
@ -94,6 +98,9 @@ class HybridModelAsyncExecutor {
std::vector<bool> is_input_dynamic_;
std::shared_ptr<ModelListener> listener_;
string om_name_;
DataDumper data_dumper_;
bool is_op_debug_reg_ = false;
OpdebugRegister op_debug_register_;
};
} // namespace hybrid
} // namespace ge

@ -266,9 +266,9 @@ Status NodeDoneCallback::OnNodeDone() {
RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[Compute] End");
RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[Callback] Start");
- auto dump_path = context_->GetDumpProperties().GetDumpPath();
- if (!dump_path.empty()) {
- GELOGI("Start to dump dynamic shape,dump_path is %s", dump_path.c_str());
+ const DumpProperties &dump_properties = context_->GetDumpProperties();
+ if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) {
+ GELOGI("Start to dump dynamic shape op");
GELOGI("Start to dump dynamic shape op");
GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node");
}

@ -61,6 +61,10 @@ class HybridModel {
device_id_ = device_id;
}
uint32_t GetDeviceId() {
return device_id_;
}
void SetModelId(uint32_t model_id) {
model_id_ = model_id;
}

@ -17,6 +17,7 @@
#include "aicore_node_executor.h"
#include "framework/common/taskdown_common.h"
#include "hybrid/executor/hybrid_execution_context.h"
#include "external/runtime/rt_error_codes.h"
namespace ge {
namespace hybrid {
@ -189,6 +190,7 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()>
}
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start");
GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream()));
GE_CHK_STATUS_RET_NOLOG(CheckOverflow(context));
// save profiling data
uint32_t task_id = 0;
uint32_t stream_id = 0;
@ -259,6 +261,25 @@ void AiCoreNodeTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) {
workspace_sizes_ = workspace_sizes;
}
Status AiCoreNodeTask::CheckOverflow(TaskContext &context) {
const DumpProperties &dump_properties = context.GetDumpProperties();
if (dump_properties.IsOpDebugOpen()) {
GELOGD("Op %s is doing overflow check in hybrid engine", context.GetNodeName());
auto rt_ret = rtStreamSynchronize(context.GetStream());
if (rt_ret == ACL_ERROR_RT_AICORE_OVER_FLOW) {
context.SetOverFlow(true);
GELOGW("Dynamic shape op %s is over flow", context.GetNodeName());
return SUCCESS;
} else if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtstreamsynchronize failed");
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
return SUCCESS;
}
GELOGD("Opdebug is not open in hybrid engine");
return SUCCESS;
}
TaskCompilerFactory &TaskCompilerFactory::GetInstance() {
static TaskCompilerFactory instance;
return instance;
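
CheckOverflow above is the dynamic-shape half of the feature: after each kernel launch it synchronizes the stream, treats the dedicated AI Core overflow code as flag-and-continue (the actual dump fires later in NodeDoneCallback::OnNodeDone via context.IsOverFlow()), and still propagates every other runtime error. A standalone sketch of that three-way classification; the numeric value below is made up, the real constant is ACL_ERROR_RT_AICORE_OVER_FLOW from rt_error_codes.h:

```cpp
#include <cstdio>

// Illustrative codes only; the real ones come from rt_error_codes.h.
enum RtError { RT_OK = 0, RT_AICORE_OVERFLOW = 507018, RT_OTHER = 1 };

struct Context {
  bool over_flow = false;  // mirrors TaskContext::SetOverFlow/IsOverFlow
};

int CheckOverflow(Context &ctx, RtError sync_result) {
  if (sync_result == RT_AICORE_OVERFLOW) {
    ctx.over_flow = true;  // remember it; the dump fires in OnNodeDone
    std::puts("op overflowed, continuing");
    return 0;              // SUCCESS: overflow is not a launch failure
  }
  if (sync_result != RT_OK) {
    return sync_result;    // real runtime errors still propagate
  }
  return 0;
}

int main() {
  Context ctx;
  CheckOverflow(ctx, RT_AICORE_OVERFLOW);
  std::printf("over_flow=%d\n", static_cast<int>(ctx.over_flow));
  return 0;
}
```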

@ -62,6 +62,7 @@ class AiCoreNodeTask : public NodeTask {
const vector<int64_t> &GetWorkspaceSizes() const;
void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
private:
Status CheckOverflow(TaskContext &context);
std::vector<std::unique_ptr<AiCoreOpTask>> tasks_;
std::vector<int64_t> workspace_sizes_;
};

@ -124,7 +124,7 @@ Status KnownNodeTask::Init(TaskContext &context) {
}
if (!load_flag_) {
auto dump_properties = context.GetDumpProperties();
- if (dump_properties.IsDumpOpen()) {
+ if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) {
davinci_model_->SetDumpProperties(dump_properties);
void *global_step = nullptr;
TensorValue *variable_global_step = context.GetVariable(NODE_NAME_GLOBAL_STEP);

@ -350,6 +350,14 @@ void TaskContext::SetStreamId(uint32_t stream_id) {
stream_id_ = stream_id;
}
void TaskContext::SetOverFlow(bool is_over_flow) {
is_over_flow_ = is_over_flow;
}
bool TaskContext::IsOverFlow() {
return is_over_flow_;
}
Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr) {
GE_CHECK_NOTNULL(buffer);
if (ori_addr == nullptr) {

@ -65,6 +65,7 @@ class TaskContext {
int64_t GetSessionId() const;
uint64_t GetIterationNumber() const;
void NodeDone();
void OnError(Status error);
@ -106,6 +107,9 @@ class TaskContext {
uint32_t GetStreamId() const;
void SetStreamId(uint32_t stream_id);
void SetOverFlow(bool is_over_flow);
bool IsOverFlow();
Status Synchronize();
bool IsForceInferShape() const;
@ -138,6 +142,7 @@ class TaskContext {
uint32_t task_id_ = 0;
uint32_t stream_id_ = 0;
std::vector<TaskDescInfo> task_desc_info;
bool is_over_flow_ = false;
};
} // namespace hybrid
} // namespace ge

@ -491,21 +491,18 @@ Status AiCpuBaseTask::UpdateOutputShape(vector<GeTensorDesc> &output_desc) {
}
GELOGD("Start to update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape.");
- GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(),
- aicpu_ext_handle_->GetExtInfoLen(),
- ext_info_addr_dev_,
- aicpu_ext_handle_->GetExtInfoLen(),
- RT_MEMCPY_DEVICE_TO_HOST));
+ GE_CHK_RT_RET(rtMemcpy(aicpu_ext_handle_->GetExtInfo(), aicpu_ext_handle_->GetExtInfoLen(), ext_info_addr_dev_,
+ aicpu_ext_handle_->GetExtInfoLen(), RT_MEMCPY_DEVICE_TO_HOST));
for (size_t i = 0; i < num_outputs_; ++i) {
GeShape shape;
DataType data_type;
aicpu_ext_handle_->GetOutputShapeAndType(i, shape, data_type);
- GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]),
- "AiCpuCCTask Update [%zu]th output shape failed.", i);
+ GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(shape, output_desc[i]), "AiCpuCCTask Update [%zu]th output shape failed.",
+ i);
if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
- GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]),
- "AiCpuCCTask Update [%zu]th output desc failed.", i);
+ GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuCCTask Update [%zu]th output desc failed.",
+ i);
}
}
GELOGD("Update DEPEND_SHAPE_RANGE AiCpuBaseTask outputshape finished.");
@ -697,10 +694,10 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
const auto &shape_hbm = out_shape_hbm_[i];
uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
- std::unique_ptr<int64_t[]> shape_addr(new(std::nothrow) int64_t[dim_num]());
+ std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
GE_CHECK_NOTNULL(shape_addr);
- GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size,
- shape_hbm, result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
+ GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm,
+ result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
shape_dims.emplace_back(shape_addr[dim_idx]);
@ -711,13 +708,14 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
"AiCpuTask update [%zu]th output shape failed.", i);
if (DumpManager::GetInstance().GetDumpProperties(kInferSessionId).IsSingleOpNeedDump()) {
- GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]),
- "AiCpuTask update [%zu]th output desc failed.", i);
+ GE_CHK_STATUS_RET(op_desc_->UpdateOutputDesc(i, output_desc[i]), "AiCpuTask update [%zu]th output desc failed.",
+ i);
}
}
return SUCCESS;
}
Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
vector<DataBuffer> &outputs,
rtStream_t stream) {

@ -431,3 +431,7 @@ rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId)
{
return RT_ERROR_NONE;
}
rtError_t rtDebugRegisterForStream(rtStream_t stream, uint32_t flag, const void *addr, uint32_t *streamId, uint32_t *taskId) {
return RT_ERROR_NONE;
}

@ -162,6 +162,7 @@ set(COMMON_SRC_FILES
"${GE_CODE_DIR}/ge/common/dump/dump_properties.cc"
"${GE_CODE_DIR}/ge/common/helper/model_helper.cc"
"${GE_CODE_DIR}/ge/common/dump/dump_manager.cc"
"${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc"
"${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc"
"${GE_CODE_DIR}/ge/model/ge_root_model.cc"
"${GE_CODE_DIR}/ge/common/model_parser/model_parser.cc"
@ -734,6 +735,7 @@ set(MULTI_PARTS_TEST_FILES
"graph/transop_util_unittest.cc"
"common/datatype_transfer_unittest.cc"
"common/dump_manager_unittest.cc"
"common/opdebug_register_unittest.cc"
"common/format_transfer_unittest.cc"
"common/format_transfer_transpose_unittest.cc"
"common/format_transfer_nchw_5d_unittest.cc"

@ -0,0 +1,51 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include "common/dump/opdebug_register.h"
#include "common/debug/log.h"
#include "common/ge_inner_error_codes.h"
namespace ge {
class UTEST_opdebug_register : public testing::Test {
protected:
void SetUp() {}
void TearDown() {}
};
TEST_F(UTEST_opdebug_register, register_debug_for_model_success) {
OpdebugRegister opdebug_register;
rtModel_t model_handle = (void*)0x111;
uint32_t op_debug_mode = 1;
DataDumper data_dumper;
auto ret = opdebug_register.RegisterDebugForModel(model_handle, op_debug_mode, data_dumper);
opdebug_register.UnregisterDebugForModel(model_handle);
EXPECT_EQ(ret, ge::SUCCESS);
}
TEST_F(UTEST_opdebug_register, register_debug_for_stream_success) {
OpdebugRegister opdebug_register;
rtStream_t stream = (void*)0x111;
uint32_t op_debug_mode = 1;
DataDumper data_dumper;
auto ret = opdebug_register.RegisterDebugForStream(stream, op_debug_mode, data_dumper);
opdebug_register.UnregisterDebugForStream(stream);
EXPECT_EQ(ret, ge::SUCCESS);
}
} // namespace ge