Op Overflow Watchpoint support for D-chip debugger

Other Authors: Harshvardhan Gupta, Li Chen
5 years ago · 4834a3378b
parent 2953720169
commit 4834a3378b
8 changed files with 239 additions and 40 deletions
--- a/mindspore/ccsrc/debug/data_dump_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump_parser.cc
@ -209,4 +209,24 @@ void DataDumpParser::CheckOpDebugMode(uint32_t op_debug_mode) const {
    MS_LOG(EXCEPTION) << "[DataDump] op_debug_mode in config json file should be [0-3]";
  }
 }
+
+// Builds the directory path where op overflow dump files are looked up for the given graph and device:
+//   "/var/log/npu/ide_daemon/dump" + $DATA_DUMP_PATH + "_" + device_id + "/" + net_name_ + "_" + graph_id + "/" +
+//   dump_mode_ + "/" + dump_step_ + "/"
+// The returned path always ends with '/'. If DATA_DUMP_PATH is not set, that component is left out and a
+// warning is logged.
+std::string DataDumpParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const {
+  std::string bin_path = "/var/log/npu/ide_daemon/dump";
+
+  // std::getenv returns nullptr for an unset variable; appending a null pointer to std::string is undefined
+  const char *dump_data_path = std::getenv("DATA_DUMP_PATH");
+  if (dump_data_path == nullptr) {
+    MS_LOG(WARNING) << "[DataDump] DATA_DUMP_PATH is not set, op overflow bin path is built without it";
+  } else {
+    bin_path.append(dump_data_path);
+  }
+  bin_path.append("_");
+  bin_path.append(std::to_string(device_id));
+  bin_path.append("/");
+  bin_path.append(net_name_);
+  bin_path.append("_");
+  bin_path.append(std::to_string(graph_id));
+  bin_path.append("/");
+  bin_path.append(std::to_string(dump_mode_));
+  bin_path.append("/");
+  bin_path.append(std::to_string(dump_step_));
+  bin_path.append("/");
+
+  return bin_path;
+}
 }  // namespace mindspore
--- a/mindspore/ccsrc/debug/data_dump_parser.h
+++ b/mindspore/ccsrc/debug/data_dump_parser.h
@ -42,6 +42,7 @@ class DataDumpParser {
  uint32_t dump_step() const { return dump_step_; }
  void MatchKernel(const std::string &kernel_name);
  void PrintUnusedKernel();
+  std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;

 private:
  DataDumpParser() = default;
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
@ -50,6 +50,8 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
  } else if (watch_condition == 1) {
    watchpoint_item.conditions.inf.enabled = true;
    watchpoint_item.conditions.neg_inf.enabled = true;
+  } else if (watch_condition == 2) {
+    watchpoint_item.conditions.overflow.enabled = true;
  }

  watchpoint_item.check_node_list = check_node_list;
@ -63,8 +65,8 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
 }

 void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
-                                     std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
-                                     std::vector<int> *condition, std::vector<unsigned int> *wacthpoint_id) {
+                                     std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
+                                     const std::vector<std::string> &op_overflows) {
  std::lock_guard<std::mutex> lg(lock_);

  std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
@ -74,6 +76,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector

  for (std::size_t i = 0; i < tensor_list.size(); i++) {
    current_tensor_name = tensor_list[i]->GetName();
+    std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot());
    mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
    int tensor_data_type = tensor_ptr->data_type_c();

@ -106,10 +109,23 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
        }
      }
    }
+    std::vector<unsigned int> hit_encountered;

-    // check if no watchpoints are valid for the current tensor
-    if (watchpoints_to_check_table.empty()) {
-      continue;
+    // handle watchpoint conditions that do not require per element checks
+    for (auto it_w_table_check = watchpoints_to_check_table.begin();
+         it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) {
+      if (it_w_table_check->second.conditions.overflow.enabled) {
+        std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
+        if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) {
+          hit_encountered.push_back(it_w_table_check->second.id);
+        }
+      }
+    }
+
+    if (hit_encountered.size()) {
+      HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
+                           &watchpoints_to_check_table, tensor_slot);
+      hit_encountered.clear();
    }

    // need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
@ -117,11 +133,14 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
      continue;
    }

+    // check if no watchpoints are remaining
+    if (watchpoints_to_check_table.empty()) {
+      continue;
+    }
+
    float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
    unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
-
    std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
-    std::vector<unsigned int> hit_encountered;

    for (unsigned int index = 0; index < num_elements; index++) {
      float x = start_addr[index];
@ -134,33 +153,12 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
        } else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
          hit_encountered.push_back(it_w_table_check->second.id);
        }
-
        ++it_w_table_check;
      }

      if (hit_encountered.size()) {
-        for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
-          std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
-          name->push_back(name_no_slot);
-
-          slot->push_back(std::to_string(tensor_list[i]->GetSlot()));
-          data_ptr->push_back(reinterpret_cast<char *>(tensor_ptr->data_c()));
-          data_size->push_back(tensor_ptr->data().nbytes());
-
-          int condition_item = -1;
-          if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
-            condition_item = 0;
-          } else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
-                     watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
-            condition_item = 1;
-          }
-          condition->push_back(condition_item);
-
-          wacthpoint_id->push_back(*it_hit_id);
-
-          watchpoints_to_check_table.erase(*it_hit_id);
-        }
-
+        HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
+                             &watchpoints_to_check_table, tensor_slot);
        hit_encountered.clear();
      }

@ -171,6 +169,34 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
  }
 }

+// Records each watchpoint id in hit_encountered as a hit on the current tensor.
+// For every id that is still present in watchpoint_table, one entry is appended to each output vector:
+//   name          - current_tensor_name truncated at the first ':'
+//   slot          - tensor_slot
+//   condition     - 0 if nan is enabled, else 1 if inf or neg_inf is enabled, else 2 if overflow is enabled, else -1
+//   watchpoint_id - the id itself
+// Every id, whether found in watchpoint_table or not, is erased from *watchpoints_to_check_table.
+void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
+                                         std::vector<std::string> *name, std::vector<std::string> *slot,
+                                         std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
+                                         std::string current_tensor_name,
+                                         std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
+                                         std::string tensor_slot) {
+  for (const unsigned int hit_id : hit_encountered) {
+    auto wp_entry = watchpoint_table.find(hit_id);
+    if (wp_entry != watchpoint_table.end()) {
+      const auto &wp_conditions = wp_entry->second.conditions;
+
+      name->push_back(current_tensor_name.substr(0, current_tensor_name.find_first_of(":")));
+      slot->push_back(tensor_slot);
+
+      // nan takes precedence over inf/neg_inf, which take precedence over overflow
+      int condition_item = -1;
+      if (wp_conditions.nan.enabled) {
+        condition_item = 0;
+      } else if (wp_conditions.inf.enabled || wp_conditions.neg_inf.enabled) {
+        condition_item = 1;
+      } else if (wp_conditions.overflow.enabled) {
+        condition_item = 2;
+      }
+      condition->push_back(condition_item);
+      watchpoint_id->push_back(hit_id);
+    }
+    // a watchpoint that has fired is dropped from the to-check table even when it is unknown to watchpoint_table
+    watchpoints_to_check_table->erase(hit_id);
+  }
+}
+
 void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot,
                                          char **data_ptr, unsigned int *data_size, int *condition,
                                          unsigned int *wacthpoint_id) {
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@ -51,6 +51,7 @@ class DebugServices {
    condition_no_param_t inf;
    condition_no_param_t neg_inf;
    condition_no_param_t nan;
+    condition_no_param_t overflow;
    condition_with_param_t max_below;
    condition_with_param_t max_above;
    condition_with_param_t min_below;
@ -74,9 +75,8 @@ class DebugServices {

  void RemoveWatchpoint(unsigned int id);

-  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr,
-                        std::vector<unsigned int> *data_size, std::vector<int> *condition,
-                        std::vector<unsigned int> *wacthpoint_id);
+  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
+                        std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows);

  void CheckSingleWatchpoint(std::shared_ptr<TensorData> watchnode, std::string *name, std::string *slot,
                             char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id);
@ -97,6 +97,12 @@ class DebugServices {
  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;

  TensorLoader *tensor_loader_;
+
+  void HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, std::vector<std::string> *name,
+                            std::vector<std::string> *slot, std::vector<int> *condition,
+                            std::vector<unsigned int> *watchpoint_id, std::string current_tensor_name,
+                            std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
+                            std::string tensor_slot);
 };
 }  // namespace mindspore

--- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
@ -79,8 +79,16 @@ message WatchCondition {
  enum Condition {
    nan = 0;
    inf = 1;
+    overflow = 2;
+    ge = 3;  // greater than and equal to
+    gt = 4;  // greater than
+    le = 5;  // less than and equal to
+    lt = 6;  // less than
+    between = 7;  // between
  }
  Condition condition = 1;
+  repeated float value = 2;  // for between condition, there will be two values
+  repeated bool include = 3;  // for between condition, define the value is included or not
 }

 message WatchNode {
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@ -14,11 +14,18 @@
 * limitations under the License.
 */

+#include <dirent.h>
+#include <stdio.h>
 #include <fstream>
 #include <tuple>
 #include <vector>
 #include <algorithm>
+#include <iostream>
+#include <cstring>
+#include <utility>
+#include <map>
 #include "debug/debugger/debugger.h"
+#include "debug/data_dump_parser.h"
 #include "pipeline/jit/pipeline.h"
 #include "backend/session/anf_runtime_algorithm.h"
 #include "runtime/device/kernel_runtime_manager.h"
@ -49,7 +56,9 @@ Debugger::Debugger()
      node_name_(""),
      cur_name_(""),
      is_dataset_graph_(false),
-      partial_memory_(false) {}
+      partial_memory_(false),
+      last_overflow_bin_(0),
+      overflow_bin_path_("") {}

 void Debugger::Init(const uint32_t device_id, const std::string device_target) {
  // access lock for public method
@ -133,6 +142,35 @@ void Debugger::EnableDebugger() {
                       "usage for large models.";
  }

+  if (device_target_ == kAscendDevice) {
+    // set operation overflow info
+    overflow_bin_path_ = DataDumpParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
+    // new overflow dump files will have a timestamp greater than last_overflow_bin_
+    last_overflow_bin_ = 0;
+    DIR *d;
+    d = opendir(overflow_bin_path_.c_str());
+    if (d) {
+      struct dirent *dir;
+      while ((dir = readdir(d)) != NULL) {
+        if (dir->d_type == DT_REG) {
+          std::string file_path = overflow_bin_path_;
+          file_path.append(dir->d_name);
+          std::size_t found = file_path.find_last_of(".");
+          if (found == std::string::npos) {
+            continue;
+          }
+          std::string overflow_time = file_path.substr(found + 1);
+          if (stod(overflow_time) <= last_overflow_bin_) {
+            MS_LOG(INFO) << "Old op overflow bin folder" << file_path;
+            continue;
+          }
+          last_overflow_bin_ = stod(overflow_time);
+        }
+      }
+      MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_;
+    }
+  }
+
  // initialize grpc client
  if (debugger_enabled_) {
    grpc_client_ = std::make_unique<GrpcClient>(host, port);
@ -154,6 +192,9 @@ void Debugger::Reset() {
  graph_ptr_ = nullptr;
  grpc_client_ = nullptr;
  debug_services_ = nullptr;
+  last_overflow_bin_ = 0;
+  overflow_bin_path_ = "";
+  stream_task_to_opname_.clear();
 }

 void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
@ -200,6 +241,7 @@ void Debugger::PostExecuteNode() {
  if (debugger_enabled_ && !is_dataset_graph_) {
    auto watchpoint_table = debug_services_->GetWatchpointTable();
    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
+
    // if kernel is watchpoint,and get hit. suspend.
    if (is_watchpoint) {
      auto hits = CheckSingleWatchpoint(cur_name_);
@ -225,6 +267,10 @@ void Debugger::PostDebugOp() {
  }
 }

+// Returns a mutable reference to stream_task_to_opname_, so callers can insert into or read the map directly.
+// CheckOpOverflow looks entries up with a (stream_id, task_id) key and uses the mapped string as the op name.
+std::map<std::pair<uint32_t, uint32_t>, std::string> &Debugger::GetStreamTaskToOpnameMap() {
+  return stream_task_to_opname_;
+}
+
 void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
  if (graph_ptr_ != graph_ptr) {
    MS_LOG(INFO) << "Debugger got new graph: " << graph_ptr->graph_id();
@ -476,15 +522,15 @@ void Debugger::Exit() {
  std::exit(EXIT_FAILURE);
 }

-std::list<WatchpointHit> Debugger::CheckWatchpoints() const {
+std::list<WatchpointHit> Debugger::CheckWatchpoints() {
  std::vector<std::string> name;
  std::vector<std::string> slot;
-  std::vector<char *> data_ptr;
-  std::vector<unsigned int> data_size;
  std::vector<int> condition;
  std::vector<unsigned int> watchpoint_id;
+  std::vector<std::string> overflow_ops;

-  debug_services_->CheckWatchpoints(&name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
+  overflow_ops = CheckOpOverflow();
+  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops);
  std::list<WatchpointHit> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
    WatchpointHit hit;
@ -658,4 +704,70 @@ void Debugger::SetStepNum(int32_t cur_num_step) {

 int32_t Debugger::step_num() const { return num_step_; }

+// Decodes the first 8 bytes of buffer as a little-endian unsigned 64-bit integer
+// (buffer[0] is the least significant byte, buffer[7] the most significant).
+// Returns 0 when buffer holds fewer than 8 bytes.
+uint64_t BytestoInt64(const std::vector<char> &buffer) {
+  constexpr std::size_t kNumBytes = 8;
+  if (buffer.size() < kNumBytes) {
+    return 0;
+  }
+
+  uint64_t ret = 0;
+  for (std::size_t i = 0; i < kNumBytes; i++) {
+    // convert through unsigned char so bytes >= 0x80 are not sign-extended, and widen before shifting
+    ret |= static_cast<uint64_t>(static_cast<unsigned char>(buffer[i])) << (8 * i);
+  }
+
+  return ret;
+}
+
+#define BUF_SIZ 256
+// Scans overflow_bin_path_ for op overflow dump files that have not been processed yet and returns the names of
+// the ops they refer to.
+//
+// A regular file is considered when the text after the last '.' in its name parses as a number greater than
+// last_overflow_bin_. For each such file, BUF_SIZ bytes are read starting at byte offset 313; the stream id is
+// decoded from bytes [8, 16) and the task id from bytes [16, 24) of that buffer, and the pair is looked up in
+// stream_task_to_opname_. On return, last_overflow_bin_ is raised to the largest suffix seen, so the same files
+// are not reported again.
+// NOTE(review): offset 313 and the id positions presumably come from the dump file layout - confirm against the
+// dump format specification.
+//
+// Files with a non-numeric suffix, files that cannot be opened and files too short to hold both ids are skipped.
+std::vector<std::string> Debugger::CheckOpOverflow() {
+  constexpr std::size_t kStreamIdOffset = 8;
+  constexpr std::size_t kTaskIdOffset = 16;
+  constexpr std::size_t kIdBytes = 8;
+
+  std::vector<double> bin_list;
+  std::vector<std::string> op_names;
+  DIR *d = opendir(overflow_bin_path_.c_str());
+  if (d != nullptr) {
+    struct dirent *dir = nullptr;
+    while ((dir = readdir(d)) != nullptr) {
+      if (dir->d_type != DT_REG) {
+        continue;
+      }
+      std::string file_name = dir->d_name;
+      std::string file_path = overflow_bin_path_;
+      file_path.append(file_name);
+      std::size_t found = file_name.find_last_of(".");
+      if (found == std::string::npos) {
+        continue;
+      }
+      std::string overflow_time = file_name.substr(found + 1);
+      double bin_time = 0;
+      try {
+        // std::stod throws when the suffix is not a number; such files are not overflow dumps
+        bin_time = std::stod(overflow_time);
+      } catch (const std::exception &) {
+        MS_LOG(INFO) << "Skip file without a numeric suffix " << file_name;
+        continue;
+      }
+      if (bin_time <= last_overflow_bin_) {
+        MS_LOG(INFO) << "File already processed " << file_name;
+        continue;
+      }
+      bin_list.push_back(bin_time);
+
+      std::fstream infile;
+      infile.open(file_path.c_str(), std::ios::binary | std::ios::in);
+      if (!infile.is_open()) {
+        MS_LOG(WARNING) << "Failed to open overflow bin file " << file_path;
+        continue;
+      }
+      infile.seekg(313, std::ios::beg);
+      std::vector<char> buffer(BUF_SIZ);
+      infile.read(buffer.data(), BUF_SIZ);
+      // a short read is fine as long as both ids were read; otherwise the zero-filled buffer would decode as
+      // stream 0 / task 0 and could match an unrelated op
+      if (infile.gcount() < static_cast<std::streamsize>(kTaskIdOffset + kIdBytes)) {
+        MS_LOG(WARNING) << "Overflow bin file is too short " << file_path;
+        continue;
+      }
+      uint64_t stream_id = BytestoInt64(
+        std::vector<char>(buffer.begin() + kStreamIdOffset, buffer.begin() + kStreamIdOffset + kIdBytes));
+      uint64_t task_id =
+        BytestoInt64(std::vector<char>(buffer.begin() + kTaskIdOffset, buffer.begin() + kTaskIdOffset + kIdBytes));
+      MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << ".";
+      auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id));
+      if (op != debugger_->stream_task_to_opname_.end()) {
+        MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl;
+        op_names.push_back(op->second);
+      } else {
+        MS_LOG(INFO) << "No overflow is detected " << std::endl;
+      }
+      infile.close();
+    }
+    // only close a handle that was actually opened
+    closedir(d);
+  } else {
+    MS_LOG(INFO) << "OverFlow bin directory does not exist!";
+  }
+  MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
+
+  for (auto &i : bin_list) {
+    if (i > last_overflow_bin_) {
+      last_overflow_bin_ = i;
+    }
+  }
+
+  return op_names;
+}
+
 }  // namespace mindspore
--- a/mindspore/ccsrc/debug/debugger/debugger.h
+++ b/mindspore/ccsrc/debug/debugger/debugger.h
@ -19,6 +19,9 @@
 #include <list>
 #include <memory>
 #include <string>
+#include <utility>
+#include <vector>
+#include <map>
 #include "backend/session/kernel_graph.h"
 #include "debug/debugger/grpc_client.h"
 #include "debug/debug_services.h"
@ -90,6 +93,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

  int32_t step_num() const;

+  std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();
+
 private:
  // private constructor for singleton
  Debugger();
@ -130,12 +135,15 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

  // analyze tensors and check watchpoint conditions
  // return names of tensors and what condition they hit
-  std::list<WatchpointHit> CheckWatchpoints() const;
+  std::list<WatchpointHit> CheckWatchpoints();
  std::list<WatchpointHit> CheckSingleWatchpoint(std::string watchnode) const;

  // send watchpoints that hit and enter command wait loop
  void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);

+  // Find if any operation overflow happened and return their names
+  std::vector<std::string> CheckOpOverflow();
+
  // class members
  std::unique_ptr<GrpcClient> grpc_client_;
  std::unique_ptr<DebugServices> debug_services_;
@ -150,7 +158,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
  bool is_dataset_graph_;
  bool partial_memory_;
  std::mutex access_lock_;
-
+  std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_;
+  double last_overflow_bin_;
+  std::string overflow_bin_path_;
  // singleton
  static std::mutex instance_lock_;
  static std::shared_ptr<Debugger> debugger_;
@ -180,5 +190,6 @@ ProtoVector<TensorProto> GetTensors(const EventReply &reply);
 // get the full name of a tensor, which is the name used in TensorLoader
 std::string GetTensorFullName(const TensorProto &tensor);

+uint64_t BytestoInt64(const std::vector<char> &buffer);
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
--- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
@ -27,6 +27,9 @@
 #include "proto/op_mapping_info.pb.h"
 #include "utils/ms_context.h"
 #include "debug/data_dump_parser.h"
+#ifdef ENABLE_DEBUGGER
+#include "debug/debugger/debugger.h"
+#endif

 static constexpr uint32_t kAicpuLoadFlag = 1;
 static constexpr uint32_t kAicpuUnloadFlag = 0;
@ -90,6 +93,18 @@ void DataDumper::LoadDumpInfo() {
  load_flag_ = true;
  // graph id may changed in Unload
  graph_id_ = kernel_graph_->graph_id();
+#ifdef ENABLE_DEBUGGER
+  auto debugger = mindspore::Debugger::GetInstance();
+  MS_EXCEPTION_IF_NULL(debugger);
+  std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
+  // extract stream id, task id and opname from runtime_info_map for overflow detection
+  std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
+                 std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
+                 [](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
+                   -> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
+                   return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
+                 });
+#endif
  MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
 }