Op Overflow Watchpoint support for D-chip debugger

Other Authors: Harshvardhan Gupta, Li Chen
pull/4428/head
Adel Shafiei 5 years ago committed by Harshvardhan Gupta
parent 2953720169
commit 4834a3378b

@ -209,4 +209,24 @@ void DataDumpParser::CheckOpDebugMode(uint32_t op_debug_mode) const {
MS_LOG(EXCEPTION) << "[DataDump] op_debug_mode in config json file should be [0-3]";
}
}
// Build the directory path under which op-overflow dump binaries for one graph on one device are looked up.
//
// The result is assembled as:
//   "/var/log/npu/ide_daemon/dump" + $DATA_DUMP_PATH + "_" + device_id + "/" + net_name_ + "_" + graph_id + "/" +
//   dump_mode_ + "/" + dump_step_ + "/"
//
// graph_id:  id of the graph, appended after the network name.
// device_id: id of the device, appended after the DATA_DUMP_PATH component.
// Returns the assembled path; it always ends with "/".
std::string DataDumpParser::GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const {
  std::string bin_path = "/var/log/npu/ide_daemon/dump";
  const char *dump_data_path = std::getenv("DATA_DUMP_PATH");
  // std::getenv returns a null pointer when the variable is not set; appending a null char* to a std::string is
  // undefined behaviour, so an unset variable simply contributes nothing to the path.
  if (dump_data_path != nullptr) {
    bin_path.append(dump_data_path);
  }
  bin_path.append("_");
  bin_path.append(std::to_string(device_id));
  bin_path.append("/");
  bin_path.append(net_name_);
  bin_path.append("_");
  bin_path.append(std::to_string(graph_id));
  bin_path.append("/");
  bin_path.append(std::to_string(dump_mode_));
  bin_path.append("/");
  bin_path.append(std::to_string(dump_step_));
  bin_path.append("/");
  return bin_path;
}
} // namespace mindspore

@ -42,6 +42,7 @@ class DataDumpParser {
uint32_t dump_step() const { return dump_step_; }
void MatchKernel(const std::string &kernel_name);
void PrintUnusedKernel();
std::string GetOpOverflowBinPath(uint32_t graph_id, uint32_t device_id) const;
private:
DataDumpParser() = default;

@ -50,6 +50,8 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition,
} else if (watch_condition == 1) {
watchpoint_item.conditions.inf.enabled = true;
watchpoint_item.conditions.neg_inf.enabled = true;
} else if (watch_condition == 2) {
watchpoint_item.conditions.overflow.enabled = true;
}
watchpoint_item.check_node_list = check_node_list;
@ -63,8 +65,8 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
}
void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<int> *condition, std::vector<unsigned int> *wacthpoint_id) {
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
const std::vector<std::string> &op_overflows) {
std::lock_guard<std::mutex> lg(lock_);
std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
@ -74,6 +76,7 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
for (std::size_t i = 0; i < tensor_list.size(); i++) {
current_tensor_name = tensor_list[i]->GetName();
std::string tensor_slot = std::to_string(tensor_list[i]->GetSlot());
mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
int tensor_data_type = tensor_ptr->data_type_c();
@ -106,10 +109,23 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
}
}
}
std::vector<unsigned int> hit_encountered;
// check if no watchpoints are valid for the current tensor
if (watchpoints_to_check_table.empty()) {
continue;
// handle watchpoint conditions that do not require per element checks
for (auto it_w_table_check = watchpoints_to_check_table.begin();
it_w_table_check != watchpoints_to_check_table.end(); ++it_w_table_check) {
if (it_w_table_check->second.conditions.overflow.enabled) {
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
if (std::find(op_overflows.begin(), op_overflows.end(), name_no_slot) != op_overflows.end()) {
hit_encountered.push_back(it_w_table_check->second.id);
}
}
}
if (hit_encountered.size()) {
HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
&watchpoints_to_check_table, tensor_slot);
hit_encountered.clear();
}
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
@ -117,11 +133,14 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
continue;
}
// check if no watchpoints are remaining
if (watchpoints_to_check_table.empty()) {
continue;
}
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c());
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
std::vector<unsigned int> hit_encountered;
for (unsigned int index = 0; index < num_elements; index++) {
float x = start_addr[index];
@ -134,33 +153,12 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
} else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
hit_encountered.push_back(it_w_table_check->second.id);
}
++it_w_table_check;
}
if (hit_encountered.size()) {
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
name->push_back(name_no_slot);
slot->push_back(std::to_string(tensor_list[i]->GetSlot()));
data_ptr->push_back(reinterpret_cast<char *>(tensor_ptr->data_c()));
data_size->push_back(tensor_ptr->data().nbytes());
int condition_item = -1;
if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
condition_item = 0;
} else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
condition_item = 1;
}
condition->push_back(condition_item);
wacthpoint_id->push_back(*it_hit_id);
watchpoints_to_check_table.erase(*it_hit_id);
}
HandleWatchpointHits(hit_encountered, name, slot, condition, watchpoint_id, current_tensor_name,
&watchpoints_to_check_table, tensor_slot);
hit_encountered.clear();
}
@ -171,6 +169,34 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *name, std::vector
}
}
// Record every watchpoint hit found for one tensor and retire those watchpoints for that tensor.
//
// hit_encountered:            ids of the watchpoints that were hit.
// name, slot, condition,
// watchpoint_id:              parallel output vectors; one entry is appended to each for every hit whose id is
//                             still present in watchpoint_table.
// current_tensor_name:        tensor name; everything from the first ':' onwards is dropped before it is reported.
// watchpoints_to_check_table: every id in hit_encountered is erased from this table, whether or not it was
//                             reported.
// tensor_slot:                slot string appended to `slot` for every reported hit.
//
// Reported condition codes: 0 = nan, 1 = inf / neg_inf, 2 = overflow, -1 = none of these enabled.
void DebugServices::HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered,
                                         std::vector<std::string> *name, std::vector<std::string> *slot,
                                         std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id,
                                         std::string current_tensor_name,
                                         std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
                                         std::string tensor_slot) {
  // the reported name does not depend on the hit, so compute it once
  const std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find(':'));
  for (const unsigned int hit_id : hit_encountered) {
    // single lookup; operator[] is avoided so a missing id can never be default-inserted
    const auto registered = watchpoint_table.find(hit_id);
    if (registered != watchpoint_table.end()) {
      const auto &enabled = registered->second.conditions;
      int condition_item = -1;
      if (enabled.nan.enabled) {
        condition_item = 0;
      } else if (enabled.inf.enabled || enabled.neg_inf.enabled) {
        condition_item = 1;
      } else if (enabled.overflow.enabled) {
        condition_item = 2;
      }
      name->push_back(name_no_slot);
      slot->push_back(tensor_slot);
      condition->push_back(condition_item);
      watchpoint_id->push_back(hit_id);
    }
    // a watchpoint that already fired is not evaluated again for this tensor
    watchpoints_to_check_table->erase(hit_id);
  }
}
void DebugServices::CheckSingleWatchpoint(std::shared_ptr<TensorData> watchtensor, std::string *name, std::string *slot,
char **data_ptr, unsigned int *data_size, int *condition,
unsigned int *wacthpoint_id) {

@ -51,6 +51,7 @@ class DebugServices {
condition_no_param_t inf;
condition_no_param_t neg_inf;
condition_no_param_t nan;
condition_no_param_t overflow;
condition_with_param_t max_below;
condition_with_param_t max_above;
condition_with_param_t min_below;
@ -74,9 +75,8 @@ class DebugServices {
void RemoveWatchpoint(unsigned int id);
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr,
std::vector<unsigned int> *data_size, std::vector<int> *condition,
std::vector<unsigned int> *wacthpoint_id);
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, const std::vector<std::string> &op_overflows);
void CheckSingleWatchpoint(std::shared_ptr<TensorData> watchnode, std::string *name, std::string *slot,
char **data_ptr, unsigned int *data_size, int *condition, unsigned int *wacthpoint_id);
@ -97,6 +97,12 @@ class DebugServices {
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
TensorLoader *tensor_loader_;
void HandleWatchpointHits(const std::vector<unsigned int> &hit_encountered, std::vector<std::string> *name,
std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::string current_tensor_name,
std::unordered_map<unsigned int, watchpoint_t> *watchpoints_to_check_table,
std::string tensor_slot);
};
} // namespace mindspore

@ -79,8 +79,16 @@ message WatchCondition {
// Kind of check a watchpoint performs.
enum Condition {
nan = 0; // value is NaN
inf = 1; // value is infinite
overflow = 2; // operator overflow
ge = 3; // greater than or equal to
gt = 4; // greater than
le = 5; // less than or equal to
lt = 6; // less than
between = 7; // between two values
}
Condition condition = 1;
repeated float value = 2; // for between condition, there will be two values
repeated bool include = 3; // for between condition, define the value is included or not
}
message WatchNode {

@ -14,11 +14,18 @@
* limitations under the License.
*/
#include <dirent.h>
#include <stdio.h>
#include <fstream>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <cstring>
#include <utility>
#include <map>
#include "debug/debugger/debugger.h"
#include "debug/data_dump_parser.h"
#include "pipeline/jit/pipeline.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/kernel_runtime_manager.h"
@ -49,7 +56,9 @@ Debugger::Debugger()
node_name_(""),
cur_name_(""),
is_dataset_graph_(false),
partial_memory_(false) {}
partial_memory_(false),
last_overflow_bin_(0),
overflow_bin_path_("") {}
void Debugger::Init(const uint32_t device_id, const std::string device_target) {
// access lock for public method
@ -133,6 +142,35 @@ void Debugger::EnableDebugger() {
"usage for large models.";
}
if (device_target_ == kAscendDevice) {
// set operation overflow info
overflow_bin_path_ = DataDumpParser::GetInstance().GetOpOverflowBinPath(graph_ptr_->graph_id(), device_id_);
// new overflow dump files will have a timestamp greater than last_overflow_bin_
last_overflow_bin_ = 0;
DIR *d;
d = opendir(overflow_bin_path_.c_str());
if (d) {
struct dirent *dir;
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_REG) {
std::string file_path = overflow_bin_path_;
file_path.append(dir->d_name);
std::size_t found = file_path.find_last_of(".");
if (found == std::string::npos) {
continue;
}
std::string overflow_time = file_path.substr(found + 1);
if (stod(overflow_time) <= last_overflow_bin_) {
MS_LOG(INFO) << "Old op overflow bin folder" << file_path;
continue;
}
last_overflow_bin_ = stod(overflow_time);
}
}
MS_LOG(INFO) << "last op overflow bin folder" << last_overflow_bin_;
}
}
// initialize grpc client
if (debugger_enabled_) {
grpc_client_ = std::make_unique<GrpcClient>(host, port);
@ -154,6 +192,9 @@ void Debugger::Reset() {
graph_ptr_ = nullptr;
grpc_client_ = nullptr;
debug_services_ = nullptr;
last_overflow_bin_ = 0;
overflow_bin_path_ = "";
stream_task_to_opname_.clear();
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
@ -200,6 +241,7 @@ void Debugger::PostExecuteNode() {
if (debugger_enabled_ && !is_dataset_graph_) {
auto watchpoint_table = debug_services_->GetWatchpointTable();
auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, watchpoint_table);
// if the kernel is a watchpoint and it gets hit, suspend.
if (is_watchpoint) {
auto hits = CheckSingleWatchpoint(cur_name_);
@ -225,6 +267,10 @@ void Debugger::PostDebugOp() {
}
}
// Return a mutable reference to stream_task_to_opname_, the map from a pair of ids to an operator name.
// The reference is non-const, so callers can insert into or otherwise modify the debugger's own map.
std::map<std::pair<uint32_t, uint32_t>, std::string> &Debugger::GetStreamTaskToOpnameMap() {
return stream_task_to_opname_;
}
void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
if (graph_ptr_ != graph_ptr) {
MS_LOG(INFO) << "Debugger got new graph: " << graph_ptr->graph_id();
@ -476,15 +522,15 @@ void Debugger::Exit() {
std::exit(EXIT_FAILURE);
}
std::list<WatchpointHit> Debugger::CheckWatchpoints() const {
std::list<WatchpointHit> Debugger::CheckWatchpoints() {
std::vector<std::string> name;
std::vector<std::string> slot;
std::vector<char *> data_ptr;
std::vector<unsigned int> data_size;
std::vector<int> condition;
std::vector<unsigned int> watchpoint_id;
std::vector<std::string> overflow_ops;
debug_services_->CheckWatchpoints(&name, &slot, &data_ptr, &data_size, &condition, &watchpoint_id);
overflow_ops = CheckOpOverflow();
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, overflow_ops);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;
@ -658,4 +704,70 @@ void Debugger::SetStepNum(int32_t cur_num_step) {
// Return the step count currently stored in num_step_.
int32_t Debugger::step_num() const { return num_step_; }
// Decode an unsigned 64-bit integer from the first eight bytes of `buffer`, least-significant byte first
// (buffer[0] supplies bits 0-7, buffer[7] supplies bits 56-63).
//
// Every byte is converted to unsigned char before it is widened and shifted. Without that, a byte >= 0x80 stored
// in a signed char is sign-extended and sets all higher bits of the result.
//
// buffer: raw bytes; only the first eight are used. If fewer than eight are supplied, the missing high-order
//         bytes are treated as zero instead of being read out of bounds.
// Returns the decoded value.
uint64_t BytestoInt64(const std::vector<char> &buffer) {
  const std::size_t max_bytes = sizeof(uint64_t);
  const std::size_t num_bytes = buffer.size() < max_bytes ? buffer.size() : max_bytes;
  uint64_t ret = 0;
  for (std::size_t i = 0; i < num_bytes; ++i) {
    ret |= static_cast<uint64_t>(static_cast<unsigned char>(buffer[i])) << (8 * i);
  }
  return ret;
}
#define BUF_SIZ 256
std::vector<std::string> Debugger::CheckOpOverflow() {
std::vector<double> bin_list;
std::vector<std::string> op_names;
DIR *d;
struct dirent *dir;
d = opendir(overflow_bin_path_.c_str());
if (d) {
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_REG) {
std::string file_path = overflow_bin_path_;
file_path.append(dir->d_name);
std::string file_name = dir->d_name;
std::size_t found = file_name.find_last_of(".");
if (found == std::string::npos) {
continue;
}
std::string overflow_time = file_name.substr(found + 1);
if (stod(overflow_time) <= last_overflow_bin_) {
MS_LOG(INFO) << "File already processed " << file_name;
continue;
}
bin_list.push_back(stod(overflow_time));
std::fstream infile;
infile.open(file_path.c_str(), std::ios::binary | std::ios::in);
infile.seekg(313, std::ios::beg);
std::vector<char> buffer;
buffer.resize(BUF_SIZ);
infile.read(buffer.data(), BUF_SIZ);
uint64_t stream_id = BytestoInt64(std::vector<char>(buffer.begin() + 8, buffer.end()));
uint64_t task_id = BytestoInt64(std::vector<char>(buffer.begin() + 16, buffer.end()));
MS_LOG(INFO) << "Overflow stream_id " << stream_id << ", task_id " << task_id << ".";
auto op = debugger_->stream_task_to_opname_.find(std::make_pair(stream_id, task_id));
if (op != debugger_->stream_task_to_opname_.end()) {
MS_LOG(ERROR) << "Overflow detected on node " << op->second << std::endl;
op_names.push_back(op->second);
} else {
MS_LOG(INFO) << "No overflow is detected " << std::endl;
}
infile.close();
}
}
} else {
MS_LOG(INFO) << "OverFlow bin directory does not exist!";
}
closedir(d);
MS_LOG(ERROR) << "These operation overflows are detected " << op_names;
for (auto &i : bin_list) {
if (i > last_overflow_bin_) {
last_overflow_bin_ = i;
}
}
return op_names;
}
} // namespace mindspore

@ -19,6 +19,9 @@
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <map>
#include "backend/session/kernel_graph.h"
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
@ -90,6 +93,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
int32_t step_num() const;
std::map<std::pair<uint32_t, uint32_t>, std::string> &GetStreamTaskToOpnameMap();
private:
// private constructor for singleton
Debugger();
@ -130,12 +135,15 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list<WatchpointHit> CheckWatchpoints() const;
std::list<WatchpointHit> CheckWatchpoints();
std::list<WatchpointHit> CheckSingleWatchpoint(std::string watchnode) const;
// send watchpoints that hit and enter command wait loop
void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);
// Find if any operation overflow happened and return their names
std::vector<std::string> CheckOpOverflow();
// class members
std::unique_ptr<GrpcClient> grpc_client_;
std::unique_ptr<DebugServices> debug_services_;
@ -150,7 +158,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
bool is_dataset_graph_;
bool partial_memory_;
std::mutex access_lock_;
std::map<std::pair<uint32_t, uint32_t>, std::string> stream_task_to_opname_;
double last_overflow_bin_;
std::string overflow_bin_path_;
// singleton
static std::mutex instance_lock_;
static std::shared_ptr<Debugger> debugger_;
@ -180,5 +190,6 @@ ProtoVector<TensorProto> GetTensors(const EventReply &reply);
// get the full name of a tensor, which is the name used in TensorLoader
std::string GetTensorFullName(const TensorProto &tensor);
uint64_t BytestoInt64(const std::vector<char> &buffer);
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_

@ -27,6 +27,9 @@
#include "proto/op_mapping_info.pb.h"
#include "utils/ms_context.h"
#include "debug/data_dump_parser.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
static constexpr uint32_t kAicpuLoadFlag = 1;
static constexpr uint32_t kAicpuUnloadFlag = 0;
@ -90,6 +93,18 @@ void DataDumper::LoadDumpInfo() {
load_flag_ = true;
// graph id may changed in Unload
graph_id_ = kernel_graph_->graph_id();
#ifdef ENABLE_DEBUGGER
auto debugger = mindspore::Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
std::map<std::pair<uint32_t, uint32_t>, std::string> &stream_task_to_opname = debugger->GetStreamTaskToOpnameMap();
// extract stream id, task id and opname from runtime_info_map for overflow detection
std::transform(runtime_info_map_.begin(), runtime_info_map_.end(),
std::inserter(stream_task_to_opname, stream_task_to_opname.end()),
[](const std::pair<std::string, std::shared_ptr<RuntimeInfo>> &p)
-> std::pair<std::pair<uint32_t, uint32_t>, std::string> {
return {{std::get<1>(*p.second), std::get<0>(*p.second)}, p.first};
});
#endif
MS_LOG(INFO) << "[DataDump] LoadDumpInfo end";
}

Loading…
Cancel
Save