!3768 GPU debugger grpc implementation and smart kernel read

Merge pull request !3768 from lichen_101010/master_ms1_grpc
pull/3768/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit 7280d3170a

@ -171,6 +171,61 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
}
}
void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot,
char **data_ptr, unsigned int *data_size, int *condition,
unsigned int *wacthpoint_id) {
std::lock_guard<std::mutex> lg(lock_);
std::string current_watchtensor_name;
current_watchtensor_name = watchtensor->GetName();
mindspore::tensor::TensorPtr tensor_ptr = watchtensor->GetTensor();
int tensor_data_type = tensor_ptr->data_type_c();
watchpoint_t watchpoint_to_check;
for (auto w_table_item : watchpoint_table) {
auto check_node_list = std::get<1>(w_table_item).check_node_list;
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node);
// get current the full info including condition, id..., for current watchtensor
std::string current_node_name = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
if ((w_type == true && (current_watchtensor_name.find(w_name) != string::npos || w_name == "*")) ||
(w_type == false && current_node_name == w_name)) {
watchpoint_to_check = w_table_item.second;
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
return;
}
break;
}
}
}
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
for (unsigned int index = 0; index < num_elements; index++) {
float x = start_addr[index];
if (((watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) && isinf(x)) ||
(watchpoint_to_check.conditions.nan.enabled && isnan(x))) {
std::string name_no_slot = current_watchtensor_name.substr(0, current_watchtensor_name.find_first_of(":"));
*name = name_no_slot;
*slot = std::to_string(watchtensor->GetSlot());
*data_ptr = reinterpret_cast<char *>(tensor_ptr->data_c());
*data_size = tensor_ptr->data().nbytes();
int condition_item = -1;
if (watchpoint_to_check.conditions.nan.enabled) {
condition_item = 0;
} else if (watchpoint_to_check.conditions.inf.enabled || watchpoint_to_check.conditions.neg_inf.enabled) {
condition_item = 1;
}
*condition = condition_item;
*wacthpoint_id = watchpoint_to_check.id;
}
}
}
void DebugServices::ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {

@ -78,6 +78,9 @@ class DebugServices {
std::vector<unsigned int> *data_size, std::vector<int> *condition,
std::vector<unsigned int> *wacthpoint_id);
void CheckSingleWatchpoint(std::shared_ptr<TensorData> watchnode, std::string *name, std::string *slot,
char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id);
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);

@ -31,6 +31,10 @@ service EventListener {
// Training context sent from the backend to the debugger client.
message Metadata {
// executing device, built as "<device_id>:<graph_id>"
string device_name = 1;
// current training step number
int32 cur_step = 2;
// backend type: "GPU" or "Ascend"
string backend = 3;
// the full name of current node
string cur_node = 4;
}
message EventReply {
@ -44,12 +48,22 @@ message EventReply {
oneof cmd {
bool exit = 2;
int32 run_cmd = 3;
RunCMD run_cmd = 3;
SetCMD set_cmd = 4;
ViewCMD view_cmd = 5;
}
}
// Resume-execution command: run a number of steps, or run until a node.
message RunCMD {
// step level or node level. "step" or "node"
string run_level = 1;
oneof cmd {
// number of steps to execute before suspending again
// (presumably used when run_level is "step" — confirm against client)
int32 run_steps = 2;
// the next node full name
string node_name = 3;
}
}
message SetCMD {
repeated WatchNode watch_nodes = 1;
WatchCondition watch_condition = 2;

@ -45,6 +45,9 @@ Debugger::Debugger()
device_target_(""),
num_step_(0),
debugger_enabled_(false),
run_level_(""),
node_name_(""),
cur_name_(""),
is_dataset_graph_(false),
partial_memory_(false) {}
@ -164,10 +167,46 @@ void Debugger::PostExecute() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// analyze tensor data and send the watchpoints been hit
if (run_level_ == "node") {
MS_LOG(INFO) << "Debugger is in node level mode ";
return;
}
if (debugger_enabled_ && !is_dataset_graph_) {
num_step_++;
MS_LOG(INFO) << "Debugger suspend at end of step; number of steps executed: " << num_step_;
SendWatchpointsAndSuspend(CheckWatchpoints());
CommandLoop();
}
}
bool Debugger::ReadNodeDataRequired() {
  // Decide whether the current kernel's tensor data must be read back to host.
  if (!debugger_enabled_ || is_dataset_graph_) {
    return false;
  }
  auto watchpoint_table = debug_services_->GetWatchpointTable();
  bool is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
  // read when the node carries a watchpoint on it...
  if (is_watchpoint) {
    return true;
  }
  // ...or when stepping at node level and this node is the next_to/continue_to
  // target (an empty node_name_ means "any node")
  bool target_node = node_name_.empty() || node_name_ == cur_name_;
  return run_level_ == "node" && target_node;
}
void Debugger::PostExecuteNode() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (!debugger_enabled_ || is_dataset_graph_) {
    return;
  }
  auto watchpoint_table = debug_services_->GetWatchpointTable();
  bool is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
  // if the kernel carries a watchpoint and it was hit, suspend
  if (is_watchpoint) {
    auto hits = CheckSingleWatchpoint(cur_name_);
    if (!hits.empty()) {
      SendWatchpointsAndSuspend(hits);
    }
  }
  // if stepping at node level and this is the next_to/continue_to node
  // (empty node_name_ means "any node"), suspend
  bool target_node = node_name_.empty() || node_name_ == cur_name_;
  if (run_level_ == "node" && target_node) {
    CommandLoop();
  }
}
@ -232,6 +271,8 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
Metadata metadata;
metadata.set_device_name(device_name);
metadata.set_cur_step(num_step_);
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
if (reply_metadata.status() != reply_metadata.OK) {
MS_LOG(ERROR) << "Error: SendMetadata failed";
@ -249,8 +290,11 @@ void Debugger::CommandLoop() {
// prepare metadata
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
metadata.set_device_name(device_name);
metadata.set_cur_step(num_step_);
metadata.set_backend(device_target_);
metadata.set_cur_node(cur_name_);
// loop exit flag
bool run = false;
@ -291,6 +335,16 @@ void Debugger::CommandLoop() {
break;
case DebuggerCommand::kRunCMD:
MS_LOG(INFO) << "RunCMD";
{
// print run cmd content
// get run_level and node_name
run_level_ = GetRunLevel(reply);
node_name_ = GetNodeName(reply);
MS_LOG(INFO) << "run_level: " << run_level_;
MS_LOG(INFO) << "node_name_: " << node_name_;
}
// exit loop
run = true;
break;
@ -445,6 +499,35 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints() const {
return hits;
}
std::list<WatchpointHit> Debugger::CheckSingleWatchpoint(std::string watchnode) const {
auto tensor_loader = debug_services_->tensor_loader();
auto tensors = tensor_loader->GetNodeTensorMap(watchnode);
std::list<WatchpointHit> hits;
for (std::vector<std::shared_ptr<TensorData>>::iterator it = tensors.begin(); it != tensors.end(); ++it) {
auto cur_tensor = *it;
std::string name = "";
std::string slot = "";
char *data_ptr = nullptr;
unsigned int data_size = 0;
int condition = -1;
unsigned int watchpoint_id = -1;
WatchpointHit hit;
debug_services_->CheckSingleWatchpoint(cur_tensor, &name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
if (name != "") {
hit.set_id(watchpoint_id);
// here TensorProto act as a tensor indicator, not sending tensor content
TensorProto *tensor_item = hit.mutable_tensor();
tensor_item->set_node_name(name);
tensor_item->set_slot(slot);
tensor_item->set_finished(true);
WatchCondition *condition_item = hit.mutable_watch_condition();
condition_item->set_condition(debugger::WatchCondition_Condition(condition));
hits.push_back(hit);
}
}
return hits;
}
void Debugger::SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points) {
// send info about watchpoint
if (!points.empty()) {
@ -491,6 +574,24 @@ ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply) {
return reply.set_cmd().watch_nodes();
}
// Extract run_level from a RunCMD reply; empty string when the reply does not
// carry a RunCMD.
std::string GetRunLevel(const EventReply &reply) {
  if (reply.has_run_cmd()) {
    return reply.run_cmd().run_level();
  }
  MS_LOG(ERROR) << "Error: Not RunCMD, can not get RunLevel. Returning default value: "
                   "";
  return "";
}
// Extract node_name from a RunCMD reply; empty string when the reply does not
// carry a RunCMD.
std::string GetNodeName(const EventReply &reply) {
  if (reply.has_run_cmd()) {
    return reply.run_cmd().node_name();
  }
  MS_LOG(ERROR) << "Error: Not RunCMD, can not get NodeName. Returning default value: "
                   "";
  return "";
}
WatchCondition GetWatchcondition(const EventReply &reply) {
if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
@ -536,4 +637,20 @@ std::string GetTensorFullName(const TensorProto &tensor) {
bool Debugger::partial_memory() { return partial_memory_; }
// Record the full name of the node currently executing; called by the kernel
// runtime before each kernel launch (see LoadKernelData).
void Debugger::SetCurNode(std::string cur_name) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
cur_name_ = cur_name;
}
// Run level ("step" or "node") requested by the last RunCMD.
std::string Debugger::run_level() const { return run_level_; }
// Overwrite the step counter; used by UpdateStepNum to advance it per launch.
void Debugger::SetStepNum(int32_t cur_num_step) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
num_step_ = cur_num_step;
}
int32_t Debugger::step_num() const { return num_step_; }
} // namespace mindspore

@ -69,6 +69,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute();
bool ReadNodeDataRequired();
void PostExecuteNode();
// suspend the execution after a debug_op
void PostDebugOp();
@ -78,6 +82,14 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
bool partial_memory();
void SetCurNode(std::string cur_name);
std::string run_level() const;
void SetStepNum(int32_t cur_num_step);
int32_t step_num() const;
private:
// private constructor for singleton
Debugger();
@ -119,6 +131,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list<WatchpointHit> CheckWatchpoints() const;
std::list<WatchpointHit> CheckSingleWatchpoint(std::string watchnode) const;
// send watchpoints that hit and enter command wait loop
void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);
@ -131,6 +144,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string device_target_;
int32_t num_step_;
bool debugger_enabled_;
std::string run_level_;
std::string node_name_;
std::string cur_name_;
bool is_dataset_graph_;
bool partial_memory_;
std::mutex access_lock_;
@ -154,6 +170,8 @@ DebuggerCommand GetCommand(const EventReply &reply);
// parse other data out of EventReply
ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply);
std::string GetNodeName(const EventReply &reply);
std::string GetRunLevel(const EventReply &reply);
WatchCondition GetWatchcondition(const EventReply &reply);
int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);

@ -47,6 +47,9 @@ class TensorLoader {
}
tensor_list.push_back(tensor);
tensor_list_map.insert({tensor->GetName(), tensor});
auto node_name = tensor->GetName();
node_name = node_name.substr(0, node_name.find_first_of(":"));
node_tensor_map.insert({node_name, tensor});
return true;
}
std::vector<std::shared_ptr<TensorData>> GetTensor() { return tensor_list; }
@ -54,6 +57,17 @@ class TensorLoader {
uint32_t GetIterNum() { return iter_num; }
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
// Return every tensor registered under `node_name` (one entry per slot).
// Uses multimap::equal_range for an O(log n + k) lookup instead of the
// original O(n) full scan over node_tensor_map.
std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
  std::vector<std::shared_ptr<TensorData>> tensors;
  auto range = node_tensor_map.equal_range(node_name);
  for (auto itr = range.first; itr != range.second; ++itr) {
    tensors.push_back(itr->second);
  }
  return tensors;
}
void SearchTensors(const std::vector<std::string> &search_list,
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
for (auto i : search_list) {
@ -70,6 +84,7 @@ class TensorLoader {
// Reset per-iteration tensor storage. The current tensor map is swapped into
// prev_tensor_list_map so the just-finished iteration's tensors stay readable,
// while the list, node map, and older previous map are discarded.
void EmptyTensor() {
std::lock_guard<std::mutex> lg(lock_);
// drop tensors kept from two iterations ago
prev_tensor_list_map.clear();
node_tensor_map.clear();
// keep the current iteration's tensors as "previous"
tensor_list_map.swap(prev_tensor_list_map);
tensor_list.clear();
}
@ -127,6 +142,7 @@ class TensorLoader {
private:
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
std::multimap<std::string, std::shared_ptr<TensorData>> node_tensor_map;
std::map<std::string, std::shared_ptr<TensorData>> prev_tensor_list_map;
uint32_t iter_num;
std::mutex lock_;

@ -90,9 +90,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
tensor_data->SetTensor(out_tensor);
tensor_data->SetSlot(slot);
ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
return ret;
}
#endif

@ -31,6 +31,9 @@
#include "runtime/device/gpu/gpu_memory_copy_manager.h"
#include "common/trans.h"
#include "ir/dtype.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#endif
namespace mindspore {
namespace device {
@ -221,10 +224,46 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
const std::vector<mindspore::kernel::AddressPtr> &kernel_workspaces,
const std::vector<mindspore::kernel::AddressPtr> &kernel_outputs, int exec_order, void *stream_ptr,
bool dump_enabled) {
if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
// check if we should read the kernel data
bool read_data = false;
std::string kernel_name = kernel->fullname_with_scope();
if (debugger) {
debugger->SetCurNode(kernel_name);
if (dump_enabled) {
read_data = true;
} else if (debugger->debugger_enabled()) {
read_data = debugger->ReadNodeDataRequired();
}
}
if (!read_data) {
return;
}
std::string kernel_name = kernel->fullname_with_scope();
// get inputs
if (!dump_enabled) {
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
for (size_t j = 0; j < input_size; ++j) {
auto input_kernel = kernel->input(j + 1);
std::string input_kernel_name = input_kernel->fullname_with_scope();
auto addr = kernel_inputs[j];
auto type = AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
string input_tensor_name = input_kernel_name + ':' + "0";
std::vector<int> int_shapes;
auto shape = AnfAlgo::GetOutputDeviceShape(input_kernel, PARAMETER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, debugger, false);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
}
}
}
// get outputs
auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
for (size_t j = 0; j < output_size; ++j) {
auto addr = kernel_outputs[j];
@ -242,11 +281,21 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
}
}
debugger->PostExecuteNode();
}
// Advance the debugger's step counter by one when either the debugger or
// dump mode is active; no-op otherwise.
void UpdateStepNum(Debugger *debugger, bool dump_enabled) {
  if (debugger == nullptr) {
    return;
  }
  if (!debugger->debugger_enabled() && !dump_enabled) {
    return;
  }
  debugger->SetStepNum(debugger->step_num() + 1);
}
void LoadParameters(const session::KernelGraph *graph, Debugger *debugger, bool dump_enabled) {
MS_EXCEPTION_IF_NULL(graph);
if (!(debugger && (debugger->debugger_enabled() || dump_enabled))) {
if (!(debugger && dump_enabled)) {
return;
}
const auto &parameters = graph->inputs();
@ -616,9 +665,13 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
#ifdef ENABLE_DEBUGGER
bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
if (!mock) {
UpdateStepNum(debugger, dump_enabled);
}
#endif
auto &kernels = graph->execution_order();
int exec_order = 1;
for (const auto &kernel : kernels) {
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
@ -662,7 +715,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, De
}
if (!mock) {
#ifdef ENABLE_DEBUGGER
// collect weights and bias
// collect weights and bias for dump mode
LoadParameters(graph, debugger, dump_enabled);
#endif
CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed.");

Loading…
Cancel
Save