!561 Dynamic multi batch memory optimization

From: @tangqunzhang Reviewed-by: @xchu42,@wqtshg Signed-off-by: @ji_chen
5 years ago · 5043ea87bb
parent c97137e93f 56b950a09d
commit 5043ea87bb
6 changed files with 363 additions and 204 deletions
--- a/ge/graph/build/memory/binary_block_mem_assigner.cc
+++ b/ge/graph/build/memory/binary_block_mem_assigner.cc
@ -22,7 +22,7 @@ namespace {
 const uint32_t kRangeCeilInterval = 2;
 const uint32_t kLogBase = 2;
 const int64_t kLargeBlockSize = 8 * 1024 * 1024;
-const int64_t kLargeBlockRangeSize = 10;
+const int64_t kLargeBlockRangeSize = 2;
 }  // namespace

 namespace ge {
@ -73,15 +73,17 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
    GELOGE(FAILED, "dividend is 0!");
    return FAILED;
  }
+  // Memory size is 512 aligned, so it is not necessary to take less than 512
+  int64_t min_memory_size = (all_memory_size.back() > MEM_ALIGN_SIZE) ? MEM_ALIGN_SIZE : all_memory_size.front();
  auto range_number = static_cast<size_t>(
-    ceil(log(all_memory_size.back() / static_cast<double>(all_memory_size.front())) / log(kLogBase)));
+    ceil(log(all_memory_size.back() / static_cast<double>(min_memory_size)) / log(kLogBase)));
  range_number = (range_number == 0) ? 1 : range_number;
  GELOGD("Range number: %zu", range_number);

  vector<vector<int64_t>> ranges(range_number);
  GE_CHK_BOOL_EXEC((range_number != 0), return PARAM_INVALID, "range_number can't be 0.");
  size_t range_number_limit = all_memory_size.size() / range_number;
-  int64_t range_ceil = all_memory_size[0];
+  int64_t range_ceil = min_memory_size;
  for (size_t i = 1; i <= range_number; i++) {
    GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(static_cast<uint64_t>(range_ceil), kRangeCeilInterval),
                    GELOGE(FAILED, "Multiply result is out of range.");
@ -114,7 +116,7 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
      range_ceils.push_back(range.back());
    }
  }
-  GELOGD("Range ceils: %s", ToString(range_ceils).c_str());
+  GELOGI("Range ceils: %s", ToString(range_ceils).c_str());

  return SUCCESS;
 }
--- a/ge/graph/build/memory/block_mem_assigner.cc
+++ b/ge/graph/build/memory/block_mem_assigner.cc
--- a/ge/graph/build/memory/block_mem_assigner.h
+++ b/ge/graph/build/memory/block_mem_assigner.h
@ -65,6 +65,7 @@ class MemoryBlock {
        stream_id_(stream_id),
        deleted_block_(false),
        reuse_mem_(reuse_mem),
+        same_stream_(true),
        input_index_(0),
        continuous_block_(false),
        first_continuous_block_(false),
@ -85,10 +86,14 @@ class MemoryBlock {
    symbol_list_.clear();
  }

-  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) {
+  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size,
+            int64_t stream_id) {
    real_size_list_.emplace_back(real_size);
    no_align_size_list_.emplace_back(no_align_size);
    node_type_index_list_.emplace_back(node, type, out_index, false);
+    if (stream_id != stream_id_) {
+        same_stream_ = false;
+    }
  }
  size_t Size() const { return block_size_; }

@ -106,6 +111,12 @@ class MemoryBlock {
    node_type_index_list_.emplace_back(node_type_index);
    real_size_list_.emplace_back(real_size);
    no_align_size_list_.emplace_back(no_align_size);
+    if ((node_type_index.node != nullptr) && (node_type_index.node->GetOpDesc() != nullptr)) {
+      auto stream_id = node_type_index.node->GetOpDesc()->GetStreamId();
+      if (stream_id != stream_id_) {
+        same_stream_ = false;
+      }
+    }
  }

  void AddSymbol(const std::string &symbol) {
@ -122,7 +133,7 @@ class MemoryBlock {

  std::string String();

-  bool IsSameLabel(std::string &first_batch_label);
+  bool IsSameBatchLabel();

  void AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life);

@ -142,6 +153,7 @@ class MemoryBlock {
  int64_t stream_id_;
  bool deleted_block_;
  bool reuse_mem_;
+  bool same_stream_;
  uint32_t input_index_;
  bool continuous_block_;
  bool first_continuous_block_;
@ -149,6 +161,7 @@ class MemoryBlock {
  bool is_zero_copy_;
  std::map<int64_t, size_t> depend_stream_life_;
  int64_t memory_type_;
+  std::string batch_label_;
 private:
  size_t block_size_;
  std::vector<size_t> real_size_list_;
@ -209,7 +222,7 @@ class BlockMemAssigner : public MemAssigner {

  void GetOutAndWorkSpaceMem(std::vector<int64_t> &all_memory_size);

-  void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector<int64_t> &workspace_memory);
+  void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector<int64_t> &workspace_memory, int64_t &total_size);

  ///
  /// @ingroup GE
@ -353,7 +366,7 @@ class BlockMemAssigner : public MemAssigner {
  /// @return void
  /// @author
  ///
-  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory);
+  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, bool same_stream = true);

  ///
  /// @ingroup GE
@ -379,11 +392,11 @@ class BlockMemAssigner : public MemAssigner {

  ///
  /// @ingroup GE
-  /// @brief Merge memory blocks between different batchs
+  /// @brief Resize memory blocks for each batch
  /// @return void
  /// @author
  ///
-  bool MergeDynamicBatchBlocks();
+  void ResizeDynamicBatchBlocks();

  void AssignContinuousBlocks();

@ -436,6 +449,17 @@ class BlockMemAssigner : public MemAssigner {

  int64_t atomic_addr_clean_id_ = 0;

+  size_t theory_min_memory_size_ = 0;
+
+  size_t theory_memory_size_ = 0;
+
+  std::string max_batch_label_;
+
+  ///
+  /// @          [stream1][nodeid]
+  /// @[nodeid]  [stream2][nodeid]
+  /// @          [stream2][nodeid]
+  ///
  DependStreamLife total_node_depend_stream_life_;
 };
 }  // namespace ge
--- a/ge/graph/build/memory/graph_mem_assigner.cc
+++ b/ge/graph/build/memory/graph_mem_assigner.cc
@ -1646,9 +1646,9 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve
    }
    string atomic_mem_size_str = ss.str();

-    GELOGI("[IMAS]SetAtomicCleanAttr : Set graph[%s] atomic_node[%s] output offset [%s] size[%s] streamid[%ld]",
+    GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] output[0] offset to [%s] streamid[%ld] size[%s]",
           node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
-           atomic_mem_start_str.c_str(), atomic_mem_size_str.c_str(), node->GetOpDesc()->GetStreamId());
+           atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), atomic_mem_size_str.c_str());
  }
  return SUCCESS;
 }
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@ -2202,7 +2202,7 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data
    void *mem_addr = data.second.GetBasicAddr();
    void *data_buf_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(data_buf.data));
    uint64_t data_buf_length = data_buf.length;
-    GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]",
+    GELOGI("CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]",
           runtime_param_.graph_id, data.first, mem_addr, data_buf_addr, data_size, data_buf_length);
    GE_CHK_RT_RET(rtMemcpy(mem_addr, data_size, data_buf_addr, data_buf_length, kind));
  }
--- a/ge/graph/load/new_model_manager/model_utils.cc
+++ b/ge/graph/load/new_model_manager/model_utils.cc
@ -61,7 +61,7 @@ vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) {
      GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i);
      continue);

-    GELOGI("[IMAS]GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
+    GELOGI("GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
    v_input_size.push_back(tensor_size);
  }

@ -96,7 +96,7 @@ vector<int64_t> ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) {
      GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i);
      continue);

-    GELOGI("[IMAS]GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
+    GELOGI("GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
    v_output_size.push_back(tensor_size);
  }