diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
index 787d334a1a..d1a50a0dfe 100644
--- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
+++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
@@ -13,13 +13,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "backend/optimizer/mem_reuse/mem_reuse_allocator.h"
 #include "backend/optimizer/mem_reuse/mem_reuse.h"
 #include "backend/optimizer/mem_reuse/mem_reuse_checker.h"
 #ifdef ENABLE_D
 #include "runtime/device/ascend/ascend_stream_assign.h"
 #endif
+#ifdef ENABLE_DEBUGGER
+#include "debug/debugger/debugger.h"
+#include "debug/debug_services.h"
+#endif
 
 namespace mindspore {
 namespace memreuse {
@@ -75,6 +78,15 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
   MS_EXCEPTION_IF_NULL(mem_buf);
   auto kernel_prev = mem_buf->used_kernel_;
   MS_EXCEPTION_IF_NULL(kernel_prev);
+#ifdef ENABLE_DEBUGGER
+  auto debugger_ = mindspore::Debugger::GetInstance();
+  DebugServices *debug_services = debugger_->debug_services();
+  auto watchpoint_table = debug_services->GetWatchpointTable();
+  std::string current_kernel_name = kernel_curr->scope_full_name();
+  if (debug_services->IsWatchPoint(current_kernel_name, watchpoint_table)) {
+    return false;
+  }
+#endif
   auto curr_stream_id = kernel_curr->stream_id();
   auto prev_stream_id = kernel_prev->stream_id();
   if (curr_stream_id == prev_stream_id) {
diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index 9995518c00..3987b9f183 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -331,6 +331,11 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
   device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
   // build kernel
   BuildKernel(root_graph);
+#ifdef ENABLE_DEBUGGER
+  if (debugger_) {
+    debugger_->PreExecute(root_graph);
+  }
+#endif
   // alloc mem
   MemoryAlloc(root_graph.get());
   // task generate
@@ -407,6 +412,11 @@ void AscendSession::BuildGraph(GraphId graph_id) {
   BuildKernel(graph);
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
+#ifdef ENABLE_DEBUGGER
+  if (debugger_) {
+    debugger_->PreExecute(graph);
+  }
+#endif
   if (ms_context->precompile_only()) {
     MS_LOG(INFO) << "Precompile only, stop in build kernel step";
   } else {
@@ -475,12 +485,6 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
-#ifdef ENABLE_DEBUGGER
-  if (debugger_) {
-    debugger_->PreExecute(kernel_graph);
-  }
-#endif
   {
     py::gil_scoped_release release;
     // run task on device
@@ -791,7 +795,8 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   DebugServices *debug_services = debugger_->debug_services();
-  TensorLoader *tensor_loader = debug_services->get_tensor_loader();
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  // TensorData will be freed up here
   tensor_loader->EmptyTensor();
   uint32_t iter_num = tensor_loader->GetIterNum();
   tensor_loader->set_iter_num(++iter_num);
diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc
index cb883eef51..cc6c5c53ad 100644
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@@ -37,8 +37,8 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
 
 DebugServices::~DebugServices() { delete tensor_loader_; }
 
-void DebugServices::add_watchpoint(unsigned int id, unsigned int watch_condition,
-                                   const std::vector<std::tuple<std::string, bool>> &check_node_list) {
+void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
+                                  const std::vector<std::tuple<std::string, bool>> &check_node_list) {
   std::lock_guard<std::mutex> lg(lock_);
 
   watchpoint_t watchpoint_item;
@@ -57,14 +57,14 @@ void DebugServices::add_watchpoint(unsigned int id, unsigned int watch_condition
   watchpoint_table[id] = watchpoint_item;
 }
 
-void DebugServices::remove_watchpoint(unsigned int id) {
+void DebugServices::RemoveWatchpoint(unsigned int id) {
   std::lock_guard<std::mutex> lg(lock_);
   watchpoint_table.erase(id);
 }
 
-void DebugServices::check_watchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
-                                      std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
-                                      std::vector<int> *condition, std::vector<unsigned int> *wacthpoint_id) {
+void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
+                                     std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
+                                     std::vector<int> *condition, std::vector<unsigned int> *wacthpoint_id) {
   std::lock_guard<std::mutex> lg(lock_);
 
   std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
@@ -171,9 +171,9 @@ void DebugServices::check_watchpoints(std::vector<std::string> *name, std::vecto
   }
 }
 
-void DebugServices::read_nodes_tensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
-                                       std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
-                                       std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
+void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
+                                     std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
+                                     std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
   tensor_loader_->SearchTensors(name, &result_list);
@@ -189,6 +189,28 @@ void DebugServices::read_nodes_tensors(std::vector<std::string> name, std::vecto
   }
 }
 
-TensorLoader *DebugServices::get_tensor_loader() const { return tensor_loader_; }
+bool DebugServices::IsWatchPoint(std::string kernel_name,
+                                 std::unordered_map<unsigned int, watchpoint_t> watchpoint_table) {
+  bool ret = false;
+  for (auto w_table_item : watchpoint_table) {
+    auto check_node_list = std::get<1>(w_table_item).check_node_list;
+    for (auto check_node : check_node_list) {
+      std::string w_name = std::get<0>(check_node);
+      bool w_type = std::get<1>(check_node);
+      if ((w_type == true &&
+           ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
+          (w_type == false && kernel_name == w_name)) {
+        ret = true;
+        return ret;
+      }
+    }
+  }
+  return ret;
+}
+
+TensorLoader *DebugServices::tensor_loader() const { return tensor_loader_; }
+std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
+  return watchpoint_table;
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h
index b2fd41cd68..41400af1d5 100644
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@@ -37,22 +37,6 @@ class DebugServices {
 
   ~DebugServices();
 
-  void add_watchpoint(unsigned int id, unsigned int watch_condition,
-                      const std::vector<std::tuple<std::string, bool>> &check_node_list);
-
-  void remove_watchpoint(unsigned int id);
-
-  void check_watchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr,
-                         std::vector<unsigned int> *data_size, std::vector<int> *condition,
-                         std::vector<unsigned int> *wacthpoint_id);
-
-  void read_nodes_tensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
-                          std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
-                          std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);
-
-  TensorLoader *get_tensor_loader() const;
-
- private:
  typedef struct condition_no_param {
    bool enabled = false;
  } condition_no_param_t;
@@ -84,6 +68,26 @@ class DebugServices {
     std::vector<std::tuple<std::string, bool>> check_node_list;
   } watchpoint_t;
 
+  void AddWatchpoint(unsigned int id, unsigned int watch_condition,
+                     const std::vector<std::tuple<std::string, bool>> &check_node_list);
+
+  void RemoveWatchpoint(unsigned int id);
+
+  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr,
+                        std::vector<unsigned int> *data_size, std::vector<int> *condition,
+                        std::vector<unsigned int> *wacthpoint_id);
+
+  void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
+                        std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
+                        std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);
+
+  bool IsWatchPoint(std::string kernel_name, std::unordered_map<unsigned int, watchpoint_t> watchpoint_table);
+
+  TensorLoader *tensor_loader() const;
+
+  std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
+
+ private:
   std::mutex lock_;
 
   std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index 369f33d79c..dd89e17e2d 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -43,7 +43,8 @@ Debugger::Debugger()
       device_id_(0),
       num_step_(0),
       debugger_enabled_(false),
-      is_dataset_graph_(false) {}
+      is_dataset_graph_(false),
+      partial_memory_(false) {}
 
 void Debugger::Init(const uint32_t device_id) {
   // access lock for public method
@@ -57,6 +58,7 @@ void Debugger::EnableDebugger() {
   // reset some of the class members
   num_step_ = 0;
   debugger_enabled_ = false;
+  partial_memory_ = false;
  grpc_client_ = nullptr;
   debug_services_ = nullptr;
 
@@ -72,7 +74,8 @@ void Debugger::EnableDebugger() {
     MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
     return;
   }
-  // configure host
+
+  // configure grpc host
   const char *env_host_str = std::getenv("MS_DEBUGGER_HOST");
   std::string host;
   if (env_host_str != nullptr) {
@@ -82,7 +85,7 @@ void Debugger::EnableDebugger() {
     MS_LOG(WARNING) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
     host = "localhost";
   }
-  // configure port
+  // configure grpc port
   const char *env_port_str = std::getenv("MS_DEBUGGER_PORT");
   std::string port;
   if (env_port_str != nullptr) {
@@ -93,6 +96,27 @@
     port = "50051";
   }
 
+  // configure partial memory reuse
+  const char *env_partial_mem_str = std::getenv("MS_DEBUGGER_PARTIAL_MEM");
+  if (env_partial_mem_str != nullptr) {
+    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
+    if (std::strcmp(env_partial_mem_str, "1") == 0) {
+      partial_memory_ = true;
+    }
+  }
+  // switch memory reuse on or off
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  context_ptr->set_enable_mem_reuse(partial_memory_);
+  // print some message about memory reuse to user
+  if (partial_memory_) {
+    MS_LOG(WARNING) << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
+                       "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
+  } else {
+    MS_LOG(WARNING) << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
+                       "usage for large models.";
+  }
+
   // initialize grpc client
   grpc_client_ = std::make_unique<GrpcClient>(host, port);
   debug_services_ = std::make_unique<DebugServices>();
@@ -106,6 +130,7 @@ void Debugger::Reset() {
   num_step_ = 0;
   debugger_enabled_ = false;
   is_dataset_graph_ = false;
+  partial_memory_ = false;
   graph_ptr_ = nullptr;
   grpc_client_ = nullptr;
   debug_services_ = nullptr;
@@ -317,11 +342,10 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon
                  [](WatchNode node) -> std::tuple<std::string, bool> {
                    return make_tuple(node.node_name(), node.node_type() == "scope");
                  });
-
-  debug_services_->add_watchpoint(id, condition.condition(), check_node_list);
+  debug_services_->AddWatchpoint(id, condition.condition(), check_node_list);
 }
 
-void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->remove_watchpoint(id); }
+void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }
 
 std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
   std::vector<std::string> name;
@@ -335,7 +359,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
 
   // ret_name will contain tensor names that are found in TensorLoader
   // items in ret_name will be in the same order with tensors if found
-  debug_services_->read_nodes_tensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);
+  debug_services_->ReadNodesTensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);
 
   std::list<TensorProto> tensor_list;
   unsigned int result_index = 0;
@@ -384,8 +408,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() const {
   std::vector<int> condition;
   std::vector<unsigned int> watchpoint_id;
 
-  debug_services_->check_watchpoints(&name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
-
+  debug_services_->CheckWatchpoints(&name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
   std::list<WatchpointHit> hits;
   for (unsigned int i = 0; i < name.size(); i++) {
     WatchpointHit hit;
@@ -494,4 +517,6 @@ std::string GetTensorFullName(const TensorProto &tensor) {
   return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
"" : ":" + tensor.iter()); } +bool Debugger::partial_memory() { return partial_memory_; } + } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index da1f325291..5a3965d7cc 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -76,6 +76,8 @@ class Debugger : public std::enable_shared_from_this { bool debugger_enabled() const; + bool partial_memory(); + private: // private constructor for singleton Debugger(); @@ -129,6 +131,7 @@ class Debugger : public std::enable_shared_from_this { int32_t num_step_; bool debugger_enabled_; bool is_dataset_graph_; + bool partial_memory_; std::mutex access_lock_; // singleton diff --git a/mindspore/ccsrc/debug/tensor_data.h b/mindspore/ccsrc/debug/tensor_data.h index 9704d69089..00af203208 100644 --- a/mindspore/ccsrc/debug/tensor_data.h +++ b/mindspore/ccsrc/debug/tensor_data.h @@ -51,25 +51,13 @@ class TensorData { int GetExecutionOrder() { return this->execution_order; } - int SetExecutionOrder(int execution_order) { - this->execution_order = execution_order; - return true; - } + void SetExecutionOrder(int execution_order) { this->execution_order = execution_order; } - int SetName(const std::string &name) { - this->name = name; - return true; - } + void SetName(const std::string &name) { this->name = name; } - bool SetTensor(mindspore::tensor::TensorPtr out_tensor) { - this->tensor_ptr = out_tensor; - return true; - } + void SetTensor(mindspore::tensor::TensorPtr out_tensor) { this->tensor_ptr = out_tensor; } - bool SetSlot(size_t slot) { - this->slot = slot; - return true; - } + void SetSlot(size_t slot) { this->slot = slot; } }; } // namespace mindspore #endif // MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_ diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index e3ae5c94eb..ae0e89aae2 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -28,9 +29,10 @@ class TensorLoader { public: TensorLoader() : iter_num(-1) {} - ~TensorLoader() {} + ~TensorLoader() { EmptyTensor(); } bool LoadNewTensor(std::shared_ptr tensor, bool keep_prev) { + std::lock_guard lg(lock_); if (keep_prev) { // add prev step tensor into current step map with ":prev" suffix auto handle = prev_tensor_list_map.extract(tensor->GetName()); @@ -61,11 +63,11 @@ class TensorLoader { } } - bool EmptyTensor() { + void EmptyTensor() { + std::lock_guard lg(lock_); prev_tensor_list_map.clear(); tensor_list_map.swap(prev_tensor_list_map); tensor_list.clear(); - return true; } void EmptyPrevTensor() { prev_tensor_list_map.clear(); } @@ -77,6 +79,7 @@ class TensorLoader { std::map> tensor_list_map; std::map> prev_tensor_list_map; uint32_t iter_num; + std::mutex lock_; }; } // namespace mindspore #endif // MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_ diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc index 32238a0603..1a87f3e6af 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc @@ -372,10 +372,13 @@ bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tens const std::string &host_fmt, const std::vector &host_shape, TypeId host_type, size_t slot, Debugger *debugger, bool keep_prev) const { bool ret = false; - DebugServices 
-  TensorLoader *tensor_loader = debug_services->get_tensor_loader();
-
+  TensorLoader *tensor_loader = debug_services->tensor_loader();
+  // TensorData is freed up in AscendSession class
+  auto tensor_data = std::make_shared<TensorData>();
+  tensor_data->SetName(tensor_name);
+  tensor_data->SetExecutionOrder(execution_order);
+  tensor_data->SetSlot(slot);
   if (trans_flag) {
     MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
     mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
@@ -385,28 +388,18 @@ bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tens
       MS_LOG(ERROR) << "Copy device mem to host failed";
       return ret;
     }
-    auto tensor_data = std::make_shared<TensorData>();
-    tensor_data->SetName(tensor_name);
-    tensor_data->SetExecutionOrder(execution_order);
     tensor_data->SetTensor(out_tensor);
-    tensor_data->SetSlot(slot);
-    ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
   } else {
     mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
     size_t host_size = out_tensor->data().nbytes();
     auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
-
-    auto tensor_data = std::make_shared<TensorData>();
-    tensor_data->SetName(tensor_name);
-    tensor_data->SetExecutionOrder(execution_order);
-    tensor_data->SetTensor(out_tensor);
-    tensor_data->SetSlot(slot);
-    ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
     if (ret_rt_memcpy != RT_ERROR_NONE) {
       MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
     }
     MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
+    tensor_data->SetTensor(out_tensor);
   }
+  ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
   return ret;
 }
 #endif
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 07669a9b3c..3ab3a52d42 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -311,15 +311,24 @@ bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
 namespace {
 void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
+  // trans_flag: "true" means tensor values will be transferred to host format, otherwise not.
   bool trans_flag = false;
   const auto &apply_kernels = graph->execution_order();
   // for kernels, execution order starts from 1
   int exec_order = 1;
+  auto debugger_ = mindspore::Debugger::GetInstance();
+  DebugServices *debug_services = debugger_->debug_services();
+  auto watchpoint_table = debug_services->GetWatchpointTable();
   for (const auto &node : apply_kernels) {
     MS_EXCEPTION_IF_NULL(node);
     auto node_name = AnfAlgo::GetCNodeName(node);
     std::string kernel_name = node->fullname_with_scope();
     auto output_size = AnfAlgo::GetOutputTensorNum(node);
+    if (debugger_->partial_memory()) {
+      if (!debug_services->IsWatchPoint(kernel_name, watchpoint_table)) {
+        continue;
+      }
+    }
     for (size_t j = 0; j < output_size; ++j) {
       auto addr = AnfAlgo::GetOutputAddr(node, j);
       auto type = AnfAlgo::GetOutputInferDataType(node, j);
@@ -347,6 +356,7 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
 
 void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
   MS_EXCEPTION_IF_NULL(graph);
+  // trans_flag: "true" means tensor values will be transferred to host format, otherwise not.
   bool trans_flag = false;
   const auto &parameters = graph->inputs();
   // for parameters, set its execution order to be 0;
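
Reviewer note (not part of the patch): both the memory-reuse guard in mem_reuse_allocator.cc and the partial-memory path in LoadOutput rely on the name matching done by DebugServices::IsWatchPoint — scope-type watch nodes match any kernel whose full name starts with the node name (or "*"), leaf nodes must match exactly. The following is a minimal, self-contained sketch of that rule using only the standard library; the function MatchesWatchNode and the node names are illustrative, not part of the MindSpore API.

#include <iostream>
#include <string>
#include <tuple>
#include <vector>

// Mirrors the rule in DebugServices::IsWatchPoint: a scope node watches every
// kernel whose full name starts with it (or "*" for all kernels), while a
// leaf node must match the kernel name exactly.
bool MatchesWatchNode(const std::string &kernel_name, const std::string &w_name, bool is_scope) {
  if (is_scope) {
    return w_name == "*" || kernel_name.rfind(w_name, 0) == 0;  // prefix match
  }
  return kernel_name == w_name;  // exact match for leaf nodes
}

int main() {
  // Hypothetical check_node_list entries: (node name, node_type == "scope").
  std::vector<std::tuple<std::string, bool>> check_node_list;
  check_node_list.emplace_back("Default/network/conv1", true);            // scope node
  check_node_list.emplace_back("Default/network/fc1/MatMul-op1", false);  // leaf node

  std::string kernel_name = "Default/network/conv1/Conv2D-op3";
  for (const auto &node : check_node_list) {
    if (MatchesWatchNode(kernel_name, std::get<0>(node), std::get<1>(node))) {
      std::cout << kernel_name << " is watched via " << std::get<0>(node) << std::endl;
    }
  }
  return 0;
}

Under this rule, IsUsable refuses to reuse the buffer of any watched kernel and LoadOutput skips unwatched kernels, which is what keeps watched tensor values available while MS_DEBUGGER_PARTIAL_MEM=1 allows the rest of the graph to reuse memory.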