profiler memory

5 years ago · 8d147deb07
parent 7de625fbca
commit 8d147deb07
25 changed files with 560 additions and 147 deletions
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
--- a/mindspore/ccsrc/backend/optimizer/somas/somas.cc
+++ b/mindspore/ccsrc/backend/optimizer/somas/somas.cc
@ -35,6 +35,11 @@
 #include "utils/ms_context.h"
 #include "debug/common.h"
 #include "common/thread_pool.h"
+#include "profiler/device/common/memory_profiling.h"
+
+using mindspore::profiler::MemoryProfiling;
+using mindspore::profiler::NodeMemory;
+using mindspore::profiler::TensorMemory;

 namespace mindspore {
 namespace somas {
@ -49,6 +54,11 @@ std::map<TensorType, std::string> tensor_type_name_map = {{kCommon, "Common"},
                                                          {kRefNodeOutput, "RefNodeOutput"},
                                                          {kUnknown, "Unknown"}};

+// Display names for each LifeLongType value; looked up when tensors are exported for memory profiling.
+std::map<LifeLongType, std::string> life_long_name_map = {
+  {kLifeLongNone, "LifeLongNone"},
+  {kLifeLongGraphAll, "LifeLongGraphAll"},
+  {kLifeLongGraphStart, "LifeLongGraphStart"},
+  {kLifeLongGraphEnd, "LifeLongGraphEnd"},
+};
+
 bool Somas::Allocate(const session::KernelGraph *graph) {
  auto ret = InitSomasTensors(graph);
  if (!ret) {
@ -1413,5 +1423,43 @@ uint8_t *Somas::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const
  }
  return ptr;
 }
+
+// Export the SOMAS tensors and nodes of one graph into the memory profiling records.
+// graph_id: id of the graph whose profiling entry receives the data; the entry is created when absent.
+// The body is compiled only when ENABLE_D is defined; otherwise the function does nothing.
+void Somas::ConvertToProfilingNode(uint32_t graph_id) {
+#ifdef ENABLE_D
+  auto &profiler = MemoryProfiling::GetInstance();
+  auto graph_memory = profiler.GetGraphMemoryNode(graph_id);
+  if (graph_memory == nullptr) {
+    graph_memory = profiler.AddGraphMemoryNode(graph_id);
+    MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id;
+  }
+
+  // One TensorMemory record per SOMAS tensor.
+  for (const auto &somas_tensor : tensors_list_) {
+    TensorMemory record;
+    record.SetTensorId(somas_tensor->GetId());
+    record.SetAlignedSize(somas_tensor->GetAlignedSize());
+    record.SetType(tensor_type_name_map[somas_tensor->type_]);
+    record.SetLifeStart(somas_tensor->lifetime_.start_);
+    record.SetLifeEnd(somas_tensor->lifetime_.end_);
+    record.SetLifeLong(life_long_name_map[somas_tensor->lifelong_value_]);
+    graph_memory->AddTensorMemory(record);
+  }
+
+  // One NodeMemory record per SOMAS node, listing the ids of its input, output and workspace tensors.
+  for (const auto &somas_node : nodes_list_) {
+    NodeMemory record;
+    record.SetNodeName(GetSplitName(somas_node->scope_full_name_));
+    record.SetNodeId(somas_node->GetId());
+    for (const auto &input : somas_node->input_tensors_) {
+      record.AddInputTensorId(input->GetId());
+    }
+    for (const auto &output : somas_node->output_tensors_) {
+      record.AddOutputTensorId(output->GetId());
+    }
+    for (const auto &workspace : somas_node->workspace_tensors_) {
+      record.AddWorkSpaceTensorId(workspace->GetId());
+    }
+    graph_memory->AddNodeMemory(record);
+  }
+#endif
+}
 }  // namespace somas
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/optimizer/somas/somas.h
+++ b/mindspore/ccsrc/backend/optimizer/somas/somas.h
@ -54,6 +54,8 @@ class Somas {

  static bool NodeSort(SomasNodePtr, SomasNodePtr);
  std::vector<DynamicBitSet> reuse_matrix_;
+  std::vector<DynamicBitSet> tensor_relation;
+  void ConvertToProfilingNode(uint32_t graph_id);

 private:
  // Maps
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@ -30,6 +30,7 @@
 #include "runtime/device/ascend/kernel_select_ascend.h"
 #include "runtime/device/ascend/kernel_build_ascend.h"
 #include "runtime/device/ascend/ascend_kernel_runtime.h"
+#include "runtime/device/ascend/profiling/profiling_manager.h"
 #include "backend/optimizer/ascend/ascend_backend_optimization.h"
 #include "backend/optimizer/common/common_backend_optimization.h"
 #include "backend/optimizer/ascend/mindir/dropout_unify_mindir.h"
@ -65,6 +66,11 @@
 #include "ps/util.h"
 #include "ps/ps_cache/ps_cache_manager.h"
 #endif
+#include "profiler/device/common/memory_profiling.h"
+
+using mindspore::device::ascend::ProfilingManager;
+using mindspore::profiler::MemoryProfiling;
+
 static constexpr uint32_t kLabelSwitchLabelId = 2;
 namespace mindspore {
 namespace session {
@ -649,6 +655,15 @@ GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
  root_graph->SetInputNodes();
  root_graph->SetOptimizerFlag();
  DumpAllGraphs(all_graphs);
+  // Save memory profiling data to proto file
+  if (ProfilingManager::GetInstance().IsProfiling()) {
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    uint64_t mem_size = runtime_instance->GetAvailableMemMaxSize();
+    auto instance = MemoryProfiling::GetInstance();
+    instance.SetDeviceMemSize(mem_size);
+    instance.SaveMemoryProfiling();
+  }
  // return the root_graph id to backend
  auto graph_id = root_graph->graph_id();
  return graph_id;
--- a/mindspore/ccsrc/profiler/CMakeLists.txt
+++ b/mindspore/ccsrc/profiler/CMakeLists.txt
@ -1,11 +1,14 @@
-if (ENABLE_GPU)
+if(ENABLE_GPU)
    file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/gpu/*.cc")
-    set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER)
+    set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
+      SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER)
    add_library(_mindspore_profiler_obj OBJECT ${PROFILER_SRC_LIST})
-endif ()
+endif()

-if (ENABLE_D)
-    file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/ascend/*.cc")
-    set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER)
+if(ENABLE_D)
+    file(GLOB_RECURSE PROFILER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "device/ascend/*.cc" "device/common/*.cc")
+    set_property(SOURCE ${PROFILER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
+      SUBMODULE_ID=mindspore::SubModuleId::SM_PROFILER)
    add_library(_mindspore_profiler_obj OBJECT ${PROFILER_SRC_LIST})
-endif ()
+    add_dependencies(_mindspore_profiler_obj mindspore::protobuf)
+endif()
--- a/mindspore/ccsrc/profiler/device/common/memory_profiling.cc
+++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.cc
@ -0,0 +1,97 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "profiler/device/common/memory_profiling.h"
+#include <fstream>
+#include <memory>
+#include "utils/log_adapter.h"
+#include "utils/ms_context.h"
+
+namespace mindspore {
+namespace profiler {
+
+// Create a GraphMemory record for graph_id, store it under that id (replacing any record already
+// stored there) and return it.
+std::shared_ptr<GraphMemory> MemoryProfiling::AddGraphMemoryNode(uint32_t graph_id) {
+  auto created = std::make_shared<GraphMemory>(graph_id);
+  graph_memory_[graph_id] = created;
+  return created;
+}
+
+// Look up the GraphMemory record stored for graph_id; returns nullptr when none has been added.
+std::shared_ptr<GraphMemory> MemoryProfiling::GetGraphMemoryNode(uint32_t graph_id) {
+  const auto iter = graph_memory_.find(graph_id);
+  if (iter == graph_memory_.end()) {
+    return nullptr;
+  }
+  return iter->second;
+}
+
+// Serialize the collected records into memory_proto_: the device memory size, then for every graph
+// its id, static memory size, node records and tensor records.
+// Graph entries are appended to memory_proto_; the proto is not cleared first.
+void MemoryProfiling::MemoryToPB() {
+  memory_proto_.set_total_mem(device_mem_size_);
+  for (const auto &graph : graph_memory_) {
+    GraphMemProto *graph_proto = memory_proto_.add_graph_mem();
+    graph_proto->set_graph_id(graph.second->GetGraphId());
+    graph_proto->set_static_mem(graph.second->GetStaticMemSize());
+    // node memory to PB
+    for (const auto &node : graph.second->GetNodeMemory()) {
+      NodeMemProto *node_mem = graph_proto->add_node_mems();
+      node_mem->set_node_name(node.GetNodeName());
+      node_mem->set_node_id(node.GetNodeId());
+      for (const auto &id : node.GetInputTensorId()) {
+        node_mem->add_input_tensor_id(id);
+      }
+      for (const auto &id : node.GetOutputTensorId()) {
+        node_mem->add_output_tensor_id(id);
+      }
+      // Workspace ids must come from the workspace list; the output list was used here before,
+      // which dropped the real workspace ids and duplicated the output ids.
+      for (const auto &id : node.GetWorkspaceTensorId()) {
+        node_mem->add_workspace_tensor_id(id);
+      }
+    }
+    // tensor memory to PB
+    for (const auto &tensor : graph.second->GetTensorMemory()) {
+      TensorMemProto *tensor_mem = graph_proto->add_tensor_mems();
+      tensor_mem->set_tensor_id(tensor.GetTensorId());
+      tensor_mem->set_size(tensor.GetAlignedSize());
+      tensor_mem->set_type(tensor.GetType());
+      tensor_mem->set_life_start(tensor.GetLifeStart());
+      tensor_mem->set_life_end(tensor.GetLifeEnd());
+      tensor_mem->set_life_long(tensor.GetLifeLong());
+    }
+  }
+  MS_LOG(INFO) << "Memory profiling data to PB end";
+}
+
+// Convert the collected records to protobuf and write them to
+// "<profiling dir>/memory_usage_<device id>.pb", truncating any existing file.
+// The directory and device id are read from MsContext. Failures to open or write the file are
+// logged as errors; no exception is raised for them.
+void MemoryProfiling::SaveMemoryProfiling() {
+  auto context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context);
+  std::string dir_path = context->get_param<std::string>(MS_CTX_PROFILING_DIR_PATH);
+  auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  std::string file = dir_path + std::string("/memory_usage_") + std::to_string(device_id) + std::string(".pb");
+
+  MemoryToPB();
+
+  std::fstream handle(file, std::ios::out | std::ios::trunc | std::ios::binary);
+  // Report an unopenable path explicitly instead of letting serialization fail on a dead stream.
+  if (!handle.is_open()) {
+    MS_LOG(ERROR) << "Open memory profiling file " << file << " failed";
+    return;
+  }
+  if (!memory_proto_.SerializeToOstream(&handle)) {
+    MS_LOG(ERROR) << "Save memory profiling data to file failed";
+    handle.close();
+    return;
+  }
+  handle.close();
+  // Only reached on success, so the log no longer claims completion after a failure.
+  MS_LOG(INFO) << "Save memory profiling data to " << file << " end";
+}
+}  // namespace profiler
+}  // namespace mindspore
--- a/mindspore/ccsrc/profiler/device/common/memory_profiling.h
+++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.h
@ -0,0 +1,124 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_PROFILER_DEVICE_COMMON_PROFILING_MEMORY_H
+#define MINDSPORE_PROFILER_DEVICE_COMMON_PROFILING_MEMORY_H
+
+#include "proto/memory_profiling.pb.h"
+#include <string>
+#include <map>
+#include <vector>
+#include <memory>
+#include "utils/ms_context.h"
+
+namespace mindspore {
+namespace profiler {
+
+// Memory profiling record for one node: its name, its id, and the ids of its input, output and
+// workspace tensors.
+class NodeMemory {
+ public:
+  NodeMemory() = default;
+  ~NodeMemory() = default;
+
+  // Setters / appenders.
+  void SetNodeName(const std::string &name) { node_name_ = name; }
+  void SetNodeId(uint64_t node_id) { node_id_ = node_id; }
+  void AddInputTensorId(uint64_t tensor_id) { input_tensor_id_.push_back(tensor_id); }
+  void AddOutputTensorId(uint64_t tensor_id) { output_tensor_id_.push_back(tensor_id); }
+  void AddWorkSpaceTensorId(uint64_t tensor_id) { workspace_tensor_id_.push_back(tensor_id); }
+
+  // Getters; each returns a copy of the stored value.
+  std::string GetNodeName() const { return node_name_; }
+  uint64_t GetNodeId() const { return node_id_; }
+  std::vector<uint64_t> GetInputTensorId() const { return input_tensor_id_; }
+  std::vector<uint64_t> GetOutputTensorId() const { return output_tensor_id_; }
+  std::vector<uint64_t> GetWorkspaceTensorId() const { return workspace_tensor_id_; }
+
+ private:
+  std::string node_name_{};
+  uint64_t node_id_{0};
+  std::vector<uint64_t> input_tensor_id_;
+  std::vector<uint64_t> output_tensor_id_;
+  std::vector<uint64_t> workspace_tensor_id_;
+};
+
+// Memory profiling record for one tensor: id, aligned size, type name, life span and life-long name.
+class TensorMemory {
+ public:
+  TensorMemory() = default;
+  ~TensorMemory() = default;
+
+  // Setters.
+  void SetTensorId(uint64_t tensor_id) { tensor_id_ = tensor_id; }
+  void SetAlignedSize(uint64_t size) { size_ = size; }
+  void SetType(const std::string &type) { type_ = type; }
+  void SetLifeStart(uint64_t start) { life_start_ = start; }
+  void SetLifeEnd(uint64_t end) { life_end_ = end; }
+  void SetLifeLong(const std::string &life_long) { life_long_ = life_long; }
+
+  // Getters.
+  uint64_t GetTensorId() const { return tensor_id_; }
+  uint64_t GetAlignedSize() const { return size_; }
+  std::string GetType() const { return type_; }
+  uint64_t GetLifeStart() const { return life_start_; }
+  uint64_t GetLifeEnd() const { return life_end_; }
+  std::string GetLifeLong() const { return life_long_; }
+
+ private:
+  uint64_t tensor_id_{0};
+  uint64_t size_{0};         // aligned tensor size
+  std::string type_{};       // see TensorType in somas_tensor.h
+  uint64_t life_start_{0};   // the exe node id at which tensor memory allocated
+  uint64_t life_end_{0};     // the exe node id at which tensor memory deallocated
+  std::string life_long_{};  // see LifeLongType in somas_tensor.h
+};
+
+// Memory profiling record for one graph: its id, the accumulated static memory size, and the node
+// and tensor records added for it.
+class GraphMemory {
+ public:
+  explicit GraphMemory(uint32_t graph_id) : graph_id_(graph_id), static_mem_size_(0) {}
+  ~GraphMemory() = default;
+  // Add `size` to the static memory total. The total is 64-bit: callers pass size_t values, and a
+  // 32-bit accumulator would wrap once the total reaches 4 GiB.
+  void AddStaticMemorySize(uint64_t size) { static_mem_size_ += size; }
+  void AddNodeMemory(const NodeMemory &node) { node_memory_.emplace_back(node); }
+  void AddTensorMemory(const TensorMemory &node) { tensor_memory_.emplace_back(node); }
+  uint32_t GetGraphId() const { return graph_id_; }
+  uint64_t GetStaticMemSize() const { return static_mem_size_; }
+  // Returned by const reference so iterating the records does not copy the whole vector.
+  const std::vector<NodeMemory> &GetNodeMemory() const { return node_memory_; }
+  const std::vector<TensorMemory> &GetTensorMemory() const { return tensor_memory_; }
+
+ private:
+  uint32_t graph_id_;
+  uint64_t static_mem_size_;
+  std::vector<NodeMemory> node_memory_;
+  std::vector<TensorMemory> tensor_memory_;
+};
+
+// Collects per-graph memory records and writes them out as a MemoryProto file.
+// GetInstance() returns a function-local static instance shared by all callers.
+class MemoryProfiling {
+ public:
+  MemoryProfiling() = default;
+  ~MemoryProfiling() = default;
+
+  static MemoryProfiling &GetInstance() {
+    static MemoryProfiling instance;
+    return instance;
+  }
+
+  MemoryProto &GetMemProto() { return memory_proto_; }
+  // Create (or replace) the record for graph_id and return it.
+  std::shared_ptr<GraphMemory> AddGraphMemoryNode(uint32_t graph_id);
+  // Return the record for graph_id, or nullptr when none exists.
+  std::shared_ptr<GraphMemory> GetGraphMemoryNode(uint32_t graph_id);
+  void SetDeviceMemSize(uint64_t size) { device_mem_size_ = size; }
+  // Fill memory_proto_ from the collected records.
+  void MemoryToPB();
+  // Serialize the records to the memory usage file in the profiling directory.
+  void SaveMemoryProfiling();
+
+ private:
+  MemoryProto memory_proto_;
+  std::map<uint32_t, std::shared_ptr<GraphMemory>> graph_memory_;
+  // Explicitly zeroed: the constructor is public and defaulted, so an instance created outside
+  // GetInstance() would otherwise hold an indeterminate value until SetDeviceMemSize is called.
+  uint64_t device_mem_size_{0};
+};
+}  // namespace profiler
+}  // namespace mindspore
+#endif
--- a/mindspore/ccsrc/profiler/device/common/memory_profiling.proto
+++ b/mindspore/ccsrc/profiler/device/common/memory_profiling.proto
@ -0,0 +1,50 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+
+package mindspore.profiler;
+
+message MemoryProto {
+  repeated GraphMemProto graph_mem = 1; // memory usage of the profiled graphs, one entry per graph
+  int64 total_mem = 2; // device memory size recorded via MemoryProfiling::SetDeviceMemSize
+}
+
+message GraphMemProto {
+  int64 graph_id = 1;  // id of the graph this entry describes
+  int64 static_mem = 2; // accumulated size of the static memory allocated for this graph
+  repeated NodeMemProto node_mems = 3;  // execution nodes of this graph
+  repeated TensorMemProto tensor_mems = 4;  // all tensors of this graph
+  string fp_start = 5; // node name of fp start (presumably forward propagation; confirm with the writer of this field)
+  string bp_end = 6; // node name of bp end (presumably backward propagation; confirm with the writer of this field)
+}
+
+message NodeMemProto {
+  string node_name = 1;  // node name
+  int64 node_id = 2;  // node id with respect to the execution order
+  repeated int64 input_tensor_id = 3;  // ids of the node's input tensors
+  repeated int64 output_tensor_id = 4;  // ids of the node's output tensors
+  repeated int64 workspace_tensor_id = 5;  // ids of the node's workspace tensors
+}
+
+message TensorMemProto {
+  int64 tensor_id = 1;  // tensor id, as referenced by the *_tensor_id fields of NodeMemProto
+  int64 size = 2;  // aligned tensor size
+  string type = 3;  // tensor type name, e.g. Common, RefNodeOutput, Unknown
+  int64 life_start = 4;  // the exe node id at which tensor memory allocated
+  int64 life_end = 5;  // the exe node id at which tensor memory deallocated
+  string life_long = 6; // LifeLongType name, e.g. LifeLongNone, LifeLongGraphAll
+}
--- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
+++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc
@ -94,8 +94,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
                           .value("save_graphs_path", MsCtxParam::MS_CTX_SAVE_GRAPHS_PATH)
                           .value("variable_memory_max_size", MsCtxParam::MS_CTX_VARIABLE_MEMORY_MAX_SIZE)
                           .value("device_id", MsCtxParam::MS_CTX_DEVICE_ID)
-                           .value("max_call_depth", MsCtxParam::MS_CTX_MAX_CALL_DEPTH);
-
+                           .value("max_call_depth", MsCtxParam::MS_CTX_MAX_CALL_DEPTH)
+                           .value("profiling_dir_path", MsCtxParam::MS_CTX_PROFILING_DIR_PATH);
                         (void)py::class_<mindspore::MsContext, std::shared_ptr<mindspore::MsContext>>(*m, "MSContext")
                           .def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.")
                           .def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified paramter.")
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@ -895,4 +895,9 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name)
    MS_LOG(EXCEPTION) << "Too many profiling data";
  }
 }
+
+// Return the device memory size reported by the Ascend memory manager.
+// Fails via MS_EXCEPTION_IF_NULL when mem_manager_ is unset or is not an AscendMemoryManager.
+uint64_t AscendKernelRuntime::GetAvailableMemMaxSize() const {
+  auto ascend_mem_manager = dynamic_pointer_cast<AscendMemoryManager>(mem_manager_);
+  // The cast yields nullptr for a null or differently-typed manager; check before dereferencing.
+  MS_EXCEPTION_IF_NULL(ascend_mem_manager);
+  return ascend_mem_manager->GetDeviceMemSize();
+}
 }  // namespace mindspore::device::ascend
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@ -55,6 +55,7 @@ class AscendKernelRuntime : public KernelRuntime {
  void CreateContext() override;
  void *context() const override { return rt_context_; }
  void PreInit() override;
+  // Returns the device memory size reported by the Ascend memory manager.
+  // Overrides KernelRuntime::GetAvailableMemMaxSize; marked `override` like the sibling overrides.
+  uint64_t GetAvailableMemMaxSize() const override;

 protected:
  DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc
@ -18,6 +18,12 @@
 #include "runtime/device/ascend/ascend_memory_pool.h"
 #include "utils/ms_context.h"
 #include "runtime/mem.h"
+#include "runtime/device/ascend/profiling/profiling_manager.h"
+#include "profiler/device/common/memory_profiling.h"
+
+using mindspore::device::ascend::ProfilingManager;
+using mindspore::profiler::MemoryProfiling;
+
 namespace mindspore {
 namespace device {
 namespace ascend {
@ -44,6 +50,11 @@ void AscendMemoryManager::MallocDeviceMemory() {
  AscendMemoryPool::GetInstance().Init(device_mem_base_, device_mem_size_, dynamic_mem_offset_);
 }

+// Return the device memory size taken from the context, or kAscendDeviceMemSize when the context
+// value is 0.
+uint64_t AscendMemoryManager::GetDeviceMemSize() {
+  const auto context_mem_size = GetDeviceMemSizeFromContext();
+  if (context_mem_size == 0) {
+    return kAscendDeviceMemSize;
+  }
+  return context_mem_size;
+}
+
 uint64_t AscendMemoryManager::GetDeviceMemSizeFromContext() {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
@ -88,7 +99,7 @@ void *AscendMemoryManager::MallocMemFromMemPool(size_t size) {
  return AscendMemoryPool::GetInstance().AllocTensorMem(align_size);
 }

-uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_mem) {
+uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) {
  size_t align_size = 0;
  if (communication_mem) {
    align_size = GetCommunicationAlignSize(size);
@ -96,6 +107,16 @@ uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_me
    align_size = GetCommonAlignSize(size);
  }

+  if (ProfilingManager::GetInstance().IsProfiling() && graph_id != kInvalidGraphId) {
+    auto node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id);
+    if (node == nullptr) {
+      node = MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id);
+      MS_LOG(INFO) << "Add graph memory node for static memory profiling, graph id is " << graph_id;
+    }
+
+    node->AddStaticMemorySize(align_size);
+  }
+
  auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset();
  MS_LOG(INFO) << "Malloc Memory: Static, total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_
               << "] memory pool[" << device_mem_size_ - device_mem_pool_offset << "])"
@ -139,6 +160,13 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m
    return device_mem_base_ + offset;
  }
 }
+
+// Run the base-class SOMAS dynamic memory allocation for `graph`, then, when profiling is enabled,
+// export the SOMAS result into the memory profiling records of that graph.
+void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph *graph) {
+  // graph is dereferenced below for its id, so reject a null pointer up front.
+  MS_EXCEPTION_IF_NULL(graph);
+  MemoryManager::MallocSomasDynamicMem(graph);
+  if (ProfilingManager::GetInstance().IsProfiling()) {
+    MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
+    somas_reuse_util_ptr_->ConvertToProfilingNode(graph->graph_id());
+  }
+}
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.h
@ -31,9 +31,11 @@ class AscendMemoryManager : public MemoryManager {
  void ResetDynamicMemory() override;
  void ClearGlobalIdleMem() override;
  void *MallocMemFromMemPool(size_t size) override;
+  uint64_t GetDeviceMemSize();
+  // Runs the base SOMAS allocation and, when profiling is enabled, exports the result for memory
+  // profiling. Overrides the virtual MemoryManager::MallocSomasDynamicMem.
+  void MallocSomasDynamicMem(const session::KernelGraph *graph) override;

 protected:
-  uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;
+  uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId) override;
  uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override;

 private:
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.cc
@ -22,7 +22,7 @@ namespace mindspore {
 namespace device {
 namespace cpu {

-uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool) {
+uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) {
  void *ptr = malloc(size);
  if (ptr != nullptr) {
    memset_s(ptr, size, 0, size);
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_memory_manager.h
@ -44,7 +44,7 @@ class CPUMemoryManager : public MemoryManager {
  void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);

 protected:
-  uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;
+  uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId) override;
  uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override;

 private:
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc
@ -101,7 +101,7 @@ void GPUMemoryManager::FreeDeviceMemory() {
  GPUMemoryAllocator::GetInstance().ReleaseDeviceRes();
 }

-uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) {
+uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  if (context_ptr->get_param<bool>(MS_CTX_ENABLE_DYNAMIC_MEM_POOL)) {
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h
@ -36,7 +36,7 @@ class GPUMemoryManager : public MemoryManager {
                                      std::vector<size_t> size_list) override;

 protected:
-  uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;
+  uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId) override;
 };
 }  // namespace gpu
 }  // namespace device
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@ -360,7 +360,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
      auto tensor_size = CountNodeDeviceMemorySize(item, index);
      device_address = CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
      MS_LOG(DEBUG) << "Malloc static memory for " << item->fullname_with_scope();
-      if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address) == nullptr) {
+      if (mem_manager_->MallocMem(kStaticMem, tensor_size, device_address, graph->graph_id()) == nullptr) {
        MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
      }
      MS_LOG(INFO) << "Malloc Input for graph " << graph->graph_id() << ", node: " << item->fullname_with_scope()
@ -629,6 +629,10 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
  MS_EXCEPTION_IF_NULL(ms_context);
  std::vector<tensor::TensorPtr> tensors;
  TensorValueToTensor(node_value, &tensors);
+  // Graph id should be passed to record static memory if profiling is enabled.
+  auto kernel_info = static_cast<device::KernelInfo *>(value_node->kernel_info());
+  MS_EXCEPTION_IF_NULL(kernel_info);
+  uint32_t graph_id = kernel_info->graph_id();
  for (const auto &tensor : tensors) {
    if (tensor == nullptr) {
      MS_LOG(WARNING) << "Tensor is null";
@ -651,7 +655,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
    if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
        !mem_manager_->MallocMemFromMemPool(address, node_size)) {
      MS_LOG(EXCEPTION) << "Cannot alloc address from memory pool when tensor size is: " << node_size;
-    } else if (mem_manager_->MallocMem(kStaticMem, node_size, address) == nullptr) {
+    } else if (mem_manager_->MallocMem(kStaticMem, node_size, address, graph_id) == nullptr) {
      MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
    }
    AnfAlgo::SetOutputAddr(address, output_idx, value_node.get());
@ -662,6 +666,8 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
                                   << "node dtype is " << AnfAlgo::GetOutputInferDataType(value_node, output_idx);
    }
  }
+
+  return;
 }

 void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
@ -690,7 +696,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
      if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER) &&
          !mem_manager_->MallocMemFromMemPool(address, tensor_size)) {
        MS_LOG(EXCEPTION) << "Cannot alloc address from memory pool when tensor size is: " << tensor_size;
-      } else if (mem_manager_->MallocMem(kStaticMem, tensor_size, address) == nullptr) {
+      } else if (mem_manager_->MallocMem(kStaticMem, tensor_size, address, graph->graph_id()) == nullptr) {
        MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << tensor_size;
      }
      AnfAlgo::SetOutputAddr(address, 0, value_node.get());
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@ -100,6 +100,7 @@ class KernelRuntime {
  }

  virtual void PreInit() {}
+  virtual uint64_t GetAvailableMemMaxSize() const { return 0; }

 protected:
  virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
--- a/mindspore/ccsrc/runtime/device/memory_manager.cc
+++ b/mindspore/ccsrc/runtime/device/memory_manager.cc
@ -18,8 +18,10 @@
 #include <string>
 #include "backend/session/anf_runtime_algorithm.h"
 #include "utils/ms_context.h"
+
 using mindspore::memreuse::BestFitMemReuse;
 using mindspore::memreuse::MemReuseUtilPtr;
+
 namespace mindspore {
 namespace device {
 size_t MemoryManager::GetCommonAlignSize(size_t input_size) const {
@ -139,11 +141,11 @@ uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index,
  return MallocDynamicMem(size, false);
 }

-uint8_t *MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddressPtr &address) {
+uint8_t *MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddressPtr &address, uint32_t graph_id) {
  MS_EXCEPTION_IF_NULL(address);
  uint8_t *ptr = nullptr;
  if (type == kStaticMem) {
-    ptr = MallocStaticMem(size, false);
+    ptr = MallocStaticMem(size, false, graph_id);
    address->from_mem_pool_ = true;
  } else if (type == kDynamicMem) {
    ptr = MallocDynamicMem(size, false);
@ -152,7 +154,7 @@ uint8_t *MemoryManager::MallocMem(MemType type, size_t size, const DeviceAddress
  return ptr;
 }

-uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) {
+uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) {
  size_t align_size = 0;
  if (communication_mem) {
    align_size = GetCommunicationAlignSize(size);
--- a/mindspore/ccsrc/runtime/device/memory_manager.h
+++ b/mindspore/ccsrc/runtime/device/memory_manager.h
@ -44,11 +44,12 @@ class MemoryManager {
  virtual void ClearGlobalIdleMem() {}

  void MallocReusedDynamicMem(const session::KernelGraph *graph);
-  void MallocSomasDynamicMem(const session::KernelGraph *graph);
+  virtual void MallocSomasDynamicMem(const session::KernelGraph *graph);
  uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, MemType type, size_t size,
                           const DeviceAddressPtr &address, bool comm_mem);
  uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size);
-  virtual uint8_t *MallocMem(MemType type, size_t size, const DeviceAddressPtr &address);
+  virtual uint8_t *MallocMem(MemType type, size_t size, const DeviceAddressPtr &address,
+                             uint32_t graph_id = kInvalidGraphId);

  virtual bool MallocMemFromMemPool(const DeviceAddressPtr address, size_t size);
  virtual void *MallocMemFromMemPool(size_t size);
@ -62,7 +63,7 @@ class MemoryManager {
  size_t GetCommunicationAlignSize(size_t input_size) const;

 protected:
-  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem);
+  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id = kInvalidGraphId);
  virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
  uint8_t *device_mem_base_{nullptr};
  uint64_t device_mem_size_{0};
--- a/mindspore/core/utils/ms_context.cc
+++ b/mindspore/core/utils/ms_context.cc
@ -73,6 +73,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
  set_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL, false);
  set_param<bool>(MS_CTX_ENABLE_SPARSE, false);
  set_param<bool>(MS_CTX_ENABLE_PARALLEL_SPLIT, false);
+  set_param<std::string>(MS_CTX_PROFILING_DIR_PATH, "");

  backend_policy_ = policy_map_[policy];
 }
--- a/mindspore/core/utils/ms_context.h
+++ b/mindspore/core/utils/ms_context.h
@ -104,6 +104,7 @@ enum MsCtxParam : unsigned {
  MS_CTX_SAVE_GRAPHS_PATH,
  MS_CTX_VARIABLE_MEMORY_MAX_SIZE,
  MS_CTX_PYTHON_EXE_PATH,
+  MS_CTX_PROFILING_DIR_PATH,
  MS_CTX_TYPE_STRING_END,

  // parameter numbers of each type
--- a/mindspore/profiler/profiling.py
+++ b/mindspore/profiler/profiling.py
@ -140,7 +140,8 @@ class Profiler:
                logger.error(msg)
                raise ValueError(msg)
            # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
-            context.set_context(enable_profiling=True, profiling_options=profiling_options)
+            context.set_context(enable_profiling=True, profiling_options=profiling_options,
+                                profiling_dir_path=self._output_path)
            base_profiling_container_path = os.path.join(self._output_path, "container")
            container_path = os.path.join(base_profiling_container_path, self._dev_id)
            data_path = os.path.join(container_path, "data")
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@ -4,12 +4,12 @@ message("build ut testcases...")
 project(ut)

 set(PROJECT_DIR "${PROJECT_SOURCE_DIR}/../../..")
-if (ENABLE_DUMP_IR)
+if(ENABLE_DUMP_IR)
    add_compile_definitions(ENABLE_DUMP_IR)
-endif (ENABLE_DUMP_IR)
-if (ENABLE_D)
+endif()
+if(ENABLE_D)
    add_compile_definitions(ENABLE_D)
-endif ()
+endif()

 #add python lib and include for all ut executables;
 message("PYTHON_INCLUDE_DIRS = ${PYTHON_INCLUDE_DIRS}")
@ -25,13 +25,13 @@ MESSAGE("check  ut_test ${CMAKE_BINARY_DIR}")

 link_directories(${MS_CCSRC_BUILD_PATH})

-if (ENABLE_MINDDATA)
+if(ENABLE_MINDDATA)
    add_definitions(-D ENABLE_MINDDATA)
    link_directories(${MS_CCSRC_BUILD_PATH}/minddata/dataset)
    link_directories(${MS_CCSRC_BUILD_PATH}/minddata/mindrecord)
-endif ()
+endif()
 # fetch ut test files
-if (ENABLE_MINDDATA)
+if(ENABLE_MINDDATA)
    include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/minddata/dataset/kernels/image)
    file(GLOB_RECURSE UT_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
            ./stub/*.cc
@ -61,7 +61,7 @@ if (ENABLE_MINDDATA)
            ./cxx_api/*.cc
            )

-    if (NOT ENABLE_PYTHON)
+    if(NOT ENABLE_PYTHON)
        set(PYTHON_RELATED_SRCS
                dataset/filter_op_test.cc
                dataset/voc_op_test.cc
@ -69,15 +69,15 @@ if (ENABLE_MINDDATA)
                dataset/sentence_piece_vocab_op_test.cc
                )
        list(REMOVE_ITEM UT_SRCS ${PYTHON_RELATED_SRCS})
-    endif ()
-else ()
+    endif()
+else()
    file(GLOB_RECURSE TEMP_UT_SRCS ./*.cc)
-    foreach (OBJ ${TEMP_UT_SRCS})
-        if (NOT ${OBJ} MATCHES "./dataset/" AND NOT ${OBJ} MATCHES "./mindrecord/")
+    foreach(OBJ ${TEMP_UT_SRCS})
+        if(NOT ${OBJ} MATCHES "./dataset/" AND NOT ${OBJ} MATCHES "./mindrecord/")
            list(APPEND UT_SRCS ${OBJ})
-        endif ()
-    endforeach ()
-endif ()
+        endif()
+    endforeach()
+endif()

 file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "../../../mindspore/ccsrc/pybind_api/*.cc"
@ -133,9 +133,11 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "../../../mindspore/ccsrc/transform/graph_ir/*.cc"
        "../../../mindspore/ccsrc/transform/graph_ir/op_declare/*.cc"
        "../../../mindspore/ccsrc/ps/*.cc"
+        "../../../mindspore/ccsrc/profiler/device/common/*.cc"
        )

-list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/frontend/parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST
+  "../../../mindspore/ccsrc/frontend/parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ps/util.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ps/scheduler.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/ps/optimizer_info.cc")
@ -154,31 +156,32 @@ add_dependencies(_ut_ut_obj engine-cache-server)
 add_executable(ut_tests $<TARGET_OBJECTS:_ut_ut_obj>
        $<TARGET_OBJECTS:_ut_mindspore_obj>)

-if (ENABLE_GE)
-    if (ENABLE_TRAIN)
+if(ENABLE_GE)
+    if(ENABLE_TRAIN)
        target_link_libraries(ut_tests PRIVATE graph ge_runner)
-    else ()
+    else()
        target_link_libraries(ut_tests PRIVATE graph ge_client)
-    endif ()
+    endif()

    target_link_libraries(mindspore PRIVATE tsdclient)
-endif ()
+endif()

-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    target_link_libraries(ut_tests PRIVATE mindspore::gtest mindspore::event mindspore::event_pthreads mindspore_gvar ${PYTHON_LIBRARIES} pthread util dl)
-    if (ENABLE_MINDDATA)
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(ut_tests PRIVATE mindspore::gtest mindspore::event mindspore::event_pthreads
+                          mindspore_gvar ${PYTHON_LIBRARIES} pthread util dl)
+    if(ENABLE_MINDDATA)

        # AUX_SOURCE_DIRECTORY(LITE_CV_FILES)
        # message(STATUS "xxxxxxxxxxxxxxxxx"${LITE_CV_FILES} )
        # add_library(_live_cv OBJECT ${LITE_CV_FILES})

        target_link_libraries(ut_tests PRIVATE _c_dataengine _c_mindrecord)
-    endif ()
-else ()
+    endif()
+else()
    target_link_libraries(ut_tests PRIVATE mindspore::gtest mindspore_gvar ${PYTHON_LIBRARIES})
-endif ()
-if (USE_GLOG)
+endif()
+if(USE_GLOG)
    target_link_libraries(ut_tests PRIVATE mindspore::glog)
-endif ()
+endif()

 target_link_libraries(ut_tests PRIVATE mindspore mindspore_shared_lib securec graph)