From cb4c74c7c034761e8c1772f416fdaeedfa248e4f Mon Sep 17 00:00:00 2001
From: Shida He
Date: Thu, 25 Jun 2020 12:44:23 -0400
Subject: [PATCH] Keep parameters of the previous step in TensorLoader

Add name truncation to support MindInsight loading parameters.

Refactor and address review comments.
---
 .../ccsrc/debug/debugger/debug_graph.proto    |   6 +
 mindspore/ccsrc/debug/debugger/debugger.cc    | 205 +++++++++---------
 mindspore/ccsrc/debug/debugger/debugger.h     |  33 +--
 mindspore/ccsrc/debug/tensor_load.h           |  17 +-
 .../device/ascend/ascend_device_address.cc    |   8 +-
 .../device/ascend/ascend_device_address.h     |   3 +-
 .../device/ascend/ascend_kernel_runtime.cc    |   6 +-
 mindspore/ccsrc/session/ascend_session.cc     |   3 +-
 8 files changed, 158 insertions(+), 123 deletions(-)

diff --git a/mindspore/ccsrc/debug/debugger/debug_graph.proto b/mindspore/ccsrc/debug/debugger/debug_graph.proto
index 042360fac3..0930791ac0 100644
--- a/mindspore/ccsrc/debug/debugger/debug_graph.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_graph.proto
@@ -313,4 +313,10 @@ message TensorProto {

   // If the tensor content transferring is finished.
   optional bool finished = 6;
+
+  // The iteration of the tensor. Supported: "prev" or leave empty.
+  optional string iter = 7;
+
+  // If the tensor name should be truncated.
+  optional bool truncate = 8;
 }
\ No newline at end of file
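For orientation, this is roughly how a debugger client would fill in the two new fields when requesting a tensor (a hypothetical snippet using the protobuf-generated C++ setters; the node name is made up for illustration):

  debugger::TensorProto request;
  request.set_node_name("Default/network/conv1.weight");  // hypothetical node name
  request.set_slot("0");
  request.set_iter("prev");    // ask for the copy kept from the previous step
  request.set_truncate(true);  // match by the scope-less name "conv1.weight"
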
diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc
index ea147a929f..c061fba6e7 100644
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@@ -178,7 +178,7 @@ void Debugger::CheckDatasetGraph() {
   is_dataset_graph_ = false;
 }

-GraphProto Debugger::GetGraphProto() {
+GraphProto Debugger::GetGraphProto() const {
   // convert kernel graph to debugger modelproto
   ModelProto model = GetDebuggerFuncGraphProto(graph_ptr_);
   return model.graph();
@@ -261,12 +261,9 @@ void Debugger::CommandLoop() {
         MS_LOG(INFO) << "node name: " << node.node_name();
         MS_LOG(INFO) << "node type: " << node.node_type();
       }
-      WatchCondition recieved_condition = GetWatchcondition(reply);
-      MS_LOG(INFO) << "condition: " << recieved_condition.condition();
-      int32_t id = GetWatchpointID(reply);
-      MS_LOG(INFO) << "id: " << id;
-      bool delete_ = GetWatchpointDelete(reply);
-      MS_LOG(INFO) << "delete: " << delete_;
+      MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition();
+      MS_LOG(INFO) << "id: " << GetWatchpointID(reply);
+      MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply);
     }
     MS_LOG(INFO) << "Setting watchpoint";
     if (GetWatchpointDelete(reply)) {
@@ -284,15 +281,20 @@ void Debugger::CommandLoop() {
         MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
         MS_LOG(INFO) << "tensor slot: " << tensor.slot();
         MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
+        MS_LOG(INFO) << "tensor iter: " << tensor.iter();
+        MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
       }
     }
     MS_LOG(INFO) << "Sending tensors";
     std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
     {
+      // print view cmd reply
       for (auto tensor : tensors) {
         MS_LOG(INFO) << "tensor node name: " << tensor.node_name();
         MS_LOG(INFO) << "tensor slot: " << tensor.slot();
         MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor.finished() << std::noboolalpha;
+        MS_LOG(INFO) << "tensor iter: " << tensor.iter();
+        MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor.truncate() << std::noboolalpha;
         MS_LOG(INFO) << "tensor dims: ";
         for (auto dim : tensor.dims()) {
           MS_LOG(INFO) << dim << ",";
@@ -309,68 +311,6 @@ void Debugger::CommandLoop() {
   }
 }

-DebuggerCommand Debugger::GetCommand(const EventReply &reply) {
-  DebuggerCommand cmd = DebuggerCommand::kUnknownCMD;
-  switch (reply.cmd_case()) {
-    case debugger::EventReply::CmdCase::kExit:
-      cmd = DebuggerCommand::kExitCMD;
-      break;
-    case debugger::EventReply::CmdCase::kRunCmd:
-      cmd = DebuggerCommand::kRunCMD;
-      break;
-    case debugger::EventReply::CmdCase::kSetCmd:
-      cmd = DebuggerCommand::kSetCMD;
-      break;
-    case debugger::EventReply::CmdCase::kViewCmd:
-      cmd = DebuggerCommand::kViewCMD;
-      break;
-    default:
-      MS_LOG(ERROR) << "Error: UnknownCMD";
-      break;
-  }
-  return cmd;
-}
-
-ProtoVector<WatchNode> Debugger::GetWatchnodes(const EventReply &reply) {
-  if (!reply.has_set_cmd()) {
-    MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
-    return ProtoVector<WatchNode>();
-  }
-  return reply.set_cmd().watch_nodes();
-}
-
-WatchCondition Debugger::GetWatchcondition(const EventReply &reply) {
-  if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
-    MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
-    return WatchCondition();
-  }
-  return reply.set_cmd().watch_condition();
-}
-
-int32_t Debugger::GetWatchpointID(const EventReply &reply) {
-  if (!reply.has_set_cmd()) {
-    MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint ID. Returning default value: 0.";
-    return 0;
-  }
-  return reply.set_cmd().id();
-}
-
-bool Debugger::GetWatchpointDelete(const EventReply &reply) {
-  if (!reply.has_set_cmd()) {
-    MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint delete flag. Returning default value: false.";
-    return false;
-  }
-  return reply.set_cmd().delete_();
-}
-
-ProtoVector<TensorProto> Debugger::GetTensors(const EventReply &reply) {
-  if (!reply.has_view_cmd()) {
-    MS_LOG(ERROR) << "Error: Not ViewCMD, can not get Tensors. Returning default value: ProtoVector<TensorProto>().";
-    return ProtoVector<TensorProto>();
-  }
-  return reply.view_cmd().tensors();
-}
-
 void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id) {
   std::vector<std::tuple<std::string, bool>> check_node_list;
   std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list),
@@ -383,7 +323,7 @@ void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCon

 void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->remove_watchpoint(id); }

-std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) {
+std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
   std::vector<std::string> name;
   std::vector<std::string> ret_name;
   std::vector<char *> data_ptr;
@@ -391,38 +331,42 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
   std::vector<TypePtr> dtype;
   std::vector<std::vector<int>> shape;

-  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name),
-                 [](TensorProto tensor) -> std::string { return tensor.node_name() + ":" + tensor.slot(); });
+  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);

+  // ret_name will contain tensor names that are found in TensorLoader
+  // items in ret_name will be in the same order with tensors if found
   debug_services_->read_nodes_tensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);

   std::list<TensorProto> tensor_list;
   unsigned int result_index = 0;
-  TensorProto tensor_item;
-
   for (auto tensor : tensors) {
+    TensorProto tensor_item;
     tensor_item.set_node_name(tensor.node_name());
     tensor_item.set_slot(tensor.slot());
+    tensor_item.set_iter(tensor.iter());
+    tensor_item.set_truncate(tensor.truncate());
+    tensor_item.clear_tensor_content();
+    tensor_item.clear_data_type();
+    tensor_item.clear_dims();
+    // always set finished to true before big tensor splitting is supported
     tensor_item.set_finished(true);
     // return empty tensor if didn't find the requested tensor
-    if (result_index >= ret_name.size() || ret_name[result_index] != tensor.node_name() + ":" + tensor.slot()) {
+    if (result_index >= ret_name.size() || ret_name[result_index] != GetTensorFullName(tensor)) {
       tensor_list.push_back(tensor_item);
       continue;
     }
     tensor_item.set_tensor_content(data_ptr[result_index], data_size[result_index]);
     tensor_item.set_data_type(GetDebuggerNumberDataType(dtype[result_index]));
-    tensor_item.clear_dims();
     for (auto &elem : shape[result_index]) {
       tensor_item.add_dims(elem);
     }
+    // add tensor to result list and increment result_index to check next item in ret_name
     tensor_list.push_back(tensor_item);
-
     result_index++;
   }
-
   return tensor_list;
 }
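The cursor-based matching above leans on the stated contract of DebugServices::read_nodes_tensors: names that are found come back in the same relative order as the request. A standalone sketch of that alignment logic (illustrative only, not the DebugServices implementation):

  #include <cstddef>
  #include <string>
  #include <vector>

  // Mirrors the result_index cursor in LoadTensors: found names preserve the
  // request order, so one forward pass aligns results with requests.
  std::vector<bool> MatchResults(const std::vector<std::string> &requested,
                                 const std::vector<std::string> &ret_name) {
    std::vector<bool> found;
    std::size_t result_index = 0;
    for (const auto &name : requested) {
      if (result_index < ret_name.size() && ret_name[result_index] == name) {
        found.push_back(true);   // consume this result, advance the cursor
        ++result_index;
      } else {
        found.push_back(false);  // missing tensor: reply with an empty TensorProto
      }
    }
    return found;
  }
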
@@ -432,7 +376,7 @@ void Debugger::Exit() {
   std::exit(EXIT_FAILURE);
 }

-std::list<WatchpointHit> Debugger::CheckWatchpoints() {
+std::list<WatchpointHit> Debugger::CheckWatchpoints() const {
   std::vector<std::string> name;
   std::vector<std::string> slot;
   std::vector<char *> data_ptr;
@@ -442,31 +386,23 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() {
   std::vector<unsigned int> data_size;
   std::vector<int> condition;
   std::vector<unsigned int> watchpoint_id;

   debug_services_->check_watchpoints(&name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);

-  std::list<WatchpointHit> points;
-
+  std::list<WatchpointHit> hits;
   for (unsigned int i = 0; i < name.size(); i++) {
-    TensorProto *tensor_item;
-    tensor_item = new TensorProto();
+    WatchpointHit hit;
+    hit.set_id(watchpoint_id[i]);
+
+    // here TensorProto acts as a tensor indicator, not sending tensor content
+    TensorProto *tensor_item = hit.mutable_tensor();
     tensor_item->set_node_name(name[i]);
     tensor_item->set_slot(slot[i]);
-    tensor_item->set_tensor_content(data_ptr[i], data_size[i]);
-
-    // finished in TensorProto will always be true before we implement big tensor splitting
     tensor_item->set_finished(true);

-    WatchCondition *condition_item;
-    condition_item = new WatchCondition();
+    WatchCondition *condition_item = hit.mutable_watch_condition();
     condition_item->set_condition(debugger::WatchCondition_Condition(condition[i]));

-    WatchpointHit point;
-    point.set_allocated_tensor(tensor_item);
-    point.set_allocated_watch_condition(condition_item);
-    point.set_id(watchpoint_id[i]);
-
-    points.push_back(point);
+    hits.push_back(hit);
   }
-
-  return points;
+  return hits;
 }

 void Debugger::SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points) {
@@ -481,8 +417,81 @@ void Debugger::SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points)
   CommandLoop();
 }

-DebugServices *Debugger::get_debug_services() { return debug_services_.get(); }
+DebugServices *Debugger::debug_services() const { return debug_services_.get(); }
+
+bool Debugger::debugger_enabled() const { return debugger_enabled_; }
+
+DebuggerCommand GetCommand(const EventReply &reply) {
+  DebuggerCommand cmd = DebuggerCommand::kUnknownCMD;
+  switch (reply.cmd_case()) {
+    case debugger::EventReply::CmdCase::kExit:
+      cmd = DebuggerCommand::kExitCMD;
+      break;
+    case debugger::EventReply::CmdCase::kRunCmd:
+      cmd = DebuggerCommand::kRunCMD;
+      break;
+    case debugger::EventReply::CmdCase::kSetCmd:
+      cmd = DebuggerCommand::kSetCMD;
+      break;
+    case debugger::EventReply::CmdCase::kViewCmd:
+      cmd = DebuggerCommand::kViewCMD;
+      break;
+    default:
+      MS_LOG(ERROR) << "Error: UnknownCMD";
+      break;
+  }
+  return cmd;
+}
+
+ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) {
+  if (!reply.has_set_cmd()) {
+    MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
+    return ProtoVector<WatchNode>();
+  }
+  return reply.set_cmd().watch_nodes();
+}
+
+WatchCondition GetWatchcondition(const EventReply &reply) {
+  if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
+    MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
+    return WatchCondition();
+  }
+  return reply.set_cmd().watch_condition();
+}
+
+int32_t GetWatchpointID(const EventReply &reply) {
+  if (!reply.has_set_cmd()) {
+    MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint ID. Returning default value: 0.";
+    return 0;
+  }
+  return reply.set_cmd().id();
+}

-bool Debugger::debugger_enabled() { return debugger_enabled_; }
+bool GetWatchpointDelete(const EventReply &reply) {
+  if (!reply.has_set_cmd()) {
+    MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint delete flag. Returning default value: false.";
+    return false;
+  }
+  return reply.set_cmd().delete_();
+}
+
+ProtoVector<TensorProto> GetTensors(const EventReply &reply) {
+  if (!reply.has_view_cmd()) {
+    MS_LOG(ERROR) << "Error: Not ViewCMD, can not get Tensors. Returning default value: ProtoVector<TensorProto>().";
+    return ProtoVector<TensorProto>();
+  }
+  return reply.view_cmd().tensors();
+}
+
+std::string GetTensorFullName(const TensorProto &tensor) {
+  string node_name = tensor.node_name();
+  if (tensor.truncate()) {
+    // scopes in node name are separated by '/'
+    // use the name without scope if truncate is true
+    std::size_t found = node_name.find_last_of("/");
+    node_name = node_name.substr(found + 1);
+  }
+  return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
+}
 }  // namespace mindspore
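To see what GetTensorFullName produces, here is a self-contained sketch of the same name composition with a few worked cases (node names are hypothetical):

  #include <cassert>
  #include <string>

  // Standalone mirror of GetTensorFullName's logic.
  std::string FullName(std::string node_name, const std::string &slot,
                       const std::string &iter, bool truncate) {
    if (truncate) {
      // keep only the part after the last scope separator '/'
      // (find_last_of returns npos when there is no '/', and npos + 1 == 0)
      node_name = node_name.substr(node_name.find_last_of('/') + 1);
    }
    return node_name + ":" + slot + (iter.empty() ? "" : ":" + iter);
  }

  int main() {
    assert(FullName("Default/network/conv1.weight", "0", "", false) ==
           "Default/network/conv1.weight:0");
    assert(FullName("Default/network/conv1.weight", "0", "", true) == "conv1.weight:0");
    assert(FullName("conv1.weight", "0", "prev", true) == "conv1.weight:0:prev");
    return 0;
  }

Note that ":prev" lands at the end of the key, which is exactly the suffix LoadNewTensor uses when it re-files the previous step's copy (see tensor_load.h below).
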
"" : ":" + tensor.iter()); +} } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debugger.h b/mindspore/ccsrc/debug/debugger/debugger.h index 6ce7d03625..9b03d6b0b7 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.h +++ b/mindspore/ccsrc/debug/debugger/debugger.h @@ -72,9 +72,9 @@ class Debugger : public std::enable_shared_from_this { // suspend the execution after a debug_op void PostDebugOp(); - DebugServices *get_debug_services(); + DebugServices *debug_services() const; - bool debugger_enabled(); + bool debugger_enabled() const; private: // private constructor for singleton @@ -92,7 +92,7 @@ class Debugger : public std::enable_shared_from_this { void CheckDatasetGraph(); // serialize graph and get proto - GraphProto GetGraphProto(); + GraphProto GetGraphProto() const; // send graph and enter command wait loop void SendGraphAndSuspend(const GraphProto &graph_proto); @@ -102,16 +102,6 @@ class Debugger : public std::enable_shared_from_this { // break if RunCMD void CommandLoop(); - // process reply and command type - DebuggerCommand GetCommand(const EventReply &reply); - - // parse other data out of EventReply - ProtoVector GetWatchnodes(const EventReply &reply); - WatchCondition GetWatchcondition(const EventReply &reply); - int32_t GetWatchpointID(const EventReply &reply); - bool GetWatchpointDelete(const EventReply &reply); - ProtoVector GetTensors(const EventReply &reply); - // set what nodes and conditions to watch void SetWatchpoint(const ProtoVector &nodes, const WatchCondition &condition, const int32_t id); @@ -119,14 +109,14 @@ class Debugger : public std::enable_shared_from_this { void RemoveWatchpoint(const int32_t id); // load tensor for view command - std::list LoadTensors(const ProtoVector &tensors); + std::list LoadTensors(const ProtoVector &tensors) const; // terminate training process void Exit(); // analyze tensors and check watchpoint conditions // return names of tensors and what condition they hit - std::list CheckWatchpoints(); + std::list CheckWatchpoints() const; // send watchpoints that hit and enter command wait loop void SendWatchpointsAndSuspend(const std::list &points); @@ -155,5 +145,18 @@ ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph); // for getting proto DataType from Type of Tensor DataType GetDebuggerNumberDataType(const TypePtr &type); +// process reply and command type +DebuggerCommand GetCommand(const EventReply &reply); + +// parse other data out of EventReply +ProtoVector GetWatchnodes(const EventReply &reply); +WatchCondition GetWatchcondition(const EventReply &reply); +int32_t GetWatchpointID(const EventReply &reply); +bool GetWatchpointDelete(const EventReply &reply); +ProtoVector GetTensors(const EventReply &reply); + +// get the full name of a tensor, which is the name used in TensorLoader +std::string GetTensorFullName(const TensorProto &tensor); + } // namespace mindspore #endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_ diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index 6c3ea67a78..e3ae5c94eb 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "debug/tensor_data.h" namespace mindspore { class TensorLoader { @@ -29,7 +30,15 @@ class TensorLoader { ~TensorLoader() {} - bool LoadNewTensor(std::shared_ptr tensor) { + bool LoadNewTensor(std::shared_ptr tensor, bool keep_prev) { + if (keep_prev) { + // add prev step tensor into current step map with ":prev" 
diff --git a/mindspore/ccsrc/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/device/ascend/ascend_device_address.cc
index 89f2263abb..1b5645ab30 100644
--- a/mindspore/ccsrc/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/device/ascend/ascend_device_address.cc
@@ -370,10 +370,10 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
 #ifdef ENABLE_DEBUGGER
 bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
                                         const std::string &host_fmt, const std::vector<int> &host_shape,
-                                        TypeId host_type, size_t slot, Debugger *debugger) const {
+                                        TypeId host_type, size_t slot, Debugger *debugger, bool keep_prev) const {
   bool ret = false;

-  DebugServices *debug_services = debugger->get_debug_services();
+  DebugServices *debug_services = debugger->debug_services();
   TensorLoader *tensor_loader = debug_services->get_tensor_loader();

   if (trans_flag) {
@@ -390,7 +390,7 @@ bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tens
     tensor_data->SetExecutionOrder(execution_order);
     tensor_data->SetTensor(out_tensor);
     tensor_data->SetSlot(slot);
-    ret = tensor_loader->LoadNewTensor(tensor_data);
+    ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
   } else {
     mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
     size_t host_size = out_tensor->data().nbytes();
@@ -401,7 +401,7 @@ bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tens
     tensor_data->SetExecutionOrder(execution_order);
     tensor_data->SetTensor(out_tensor);
     tensor_data->SetSlot(slot);
-    ret = tensor_loader->LoadNewTensor(tensor_data);
+    ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
     if (ret_rt_memcpy != RT_ERROR_NONE) {
       MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
     }
diff --git a/mindspore/ccsrc/device/ascend/ascend_device_address.h b/mindspore/ccsrc/device/ascend/ascend_device_address.h
index 4e560e30f4..27bcea814c 100644
--- a/mindspore/ccsrc/device/ascend/ascend_device_address.h
+++ b/mindspore/ccsrc/device/ascend/ascend_device_address.h
@@ -46,7 +46,8 @@ class AscendDeviceAddress : public DeviceAddress {
 #endif
 #ifdef ENABLE_DEBUGGER
   bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
-                     const std::vector<int> &host_shape, TypeId host_type, size_t slot, Debugger *debugger) const;
+                     const std::vector<int> &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
+                     bool keep_prev) const;
 #endif

  private:
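To make the new flag concrete, this is roughly the hand-off from LoadMemToHost into the loader (a sketch built from the context lines above; the SetName call is assumed to exist alongside the setters shown, and the tensor name is illustrative):

  // Sketch only: hand a host-side copy to the TensorLoader, asking it to keep
  // the previous step's value reachable under "<name>:prev".
  auto tensor_data = std::make_shared<mindspore::TensorData>();
  tensor_data->SetName(tensor_name);                // assumed setter, e.g. "conv1.weight:0"
  tensor_data->SetExecutionOrder(execution_order);  // setters shown in the diff above
  tensor_data->SetTensor(out_tensor);
  tensor_data->SetSlot(slot);
  bool ok = tensor_loader->LoadNewTensor(tensor_data, /*keep_prev=*/true);  // true only for parameters
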
diff --git a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
index efdcb98755..8b176af5fc 100644
--- a/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
@@ -322,7 +322,8 @@ void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
         (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                              [](size_t inner_item) { return SizeToInt(inner_item); });
       }
-      auto ret = ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger);
+      auto ret =
+        ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger, false);
       if (!ret) {
         MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
                       << ", host_format:" << format << ".!";
@@ -356,7 +357,8 @@ void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger)
       (void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
                            [](size_t inner_item) { return SizeToInt(inner_item); });
     }
-    auto ret = ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger);
+    auto ret =
+      ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger, true);
     if (!ret) {
       MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
                     << ", host_format:" << format << ".!";
diff --git a/mindspore/ccsrc/session/ascend_session.cc b/mindspore/ccsrc/session/ascend_session.cc
index 397ed8f94a..c703127f74 100644
--- a/mindspore/ccsrc/session/ascend_session.cc
+++ b/mindspore/ccsrc/session/ascend_session.cc
@@ -799,12 +799,13 @@ void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph)
 #ifdef ENABLE_DEBUGGER
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  DebugServices *debug_services = debugger_->get_debug_services();
+  DebugServices *debug_services = debugger_->debug_services();
   TensorLoader *tensor_loader = debug_services->get_tensor_loader();
   tensor_loader->EmptyTensor();
   uint32_t iter_num = tensor_loader->GetIterNum();
   tensor_loader->set_iter_num(++iter_num);
   (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
+  tensor_loader->EmptyPrevTensor();
 #endif
   MS_LOG(INFO) << "Finish!";
 }
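Taken together, the per-step sequence in AscendSession::LoadTensor now works like this (a sketch using the identifiers from the diffs above; the flow is the point, not the exact call sites):

  tensor_loader->EmptyTensor();      // rotate: current-step map becomes prev_tensor_list_map
  uint32_t iter_num = tensor_loader->GetIterNum();
  tensor_loader->set_iter_num(++iter_num);
  (void)runtime_instance->LoadData(kernel_graph.get(), debugger_.get());
  // during LoadData, parameters are loaded with keep_prev = true, so each
  // parameter's previous value is re-keyed to "<name>:prev" and stays queryable
  tensor_loader->EmptyPrevTensor();  // drop previous-step entries nothing reclaimed

Outputs pass keep_prev = false since they are recomputed every step; only parameters carry state worth comparing across steps, which is what a MindInsight view request with iter = "prev" reads back.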