diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt index b237f64d7e..8b9cafc12a 100644 --- a/mindspore/ccsrc/debug/CMakeLists.txt +++ b/mindspore/ccsrc/debug/CMakeLists.txt @@ -12,6 +12,7 @@ if (ENABLE_DEBUGGER) "${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc" ) endif (ENABLE_DEBUGGER) diff --git a/mindspore/ccsrc/debug/debug_services.cc b/mindspore/ccsrc/debug/debug_services.cc index 758ca61f19..0fb6f95daa 100644 --- a/mindspore/ccsrc/debug/debug_services.cc +++ b/mindspore/ccsrc/debug/debug_services.cc @@ -17,6 +17,8 @@ #include #include "backend/session/anf_runtime_algorithm.h" #include "debug/debug_services.h" +#include "debug/debugger/tensor_summary.h" + namespace mindspore { DebugServices::DebugServices() { @@ -49,9 +51,6 @@ void DebugServices::AddWatchpoint(unsigned int id, unsigned int watch_condition, watchpoint_item.id = id; watchpoint_item.condition.type = static_cast(watch_condition); watchpoint_item.condition.parameter = parameter; - if (watch_condition > 2 && watch_condition < 13) - // odd indices are greater than conditions and even indices are less than - watchpoint_item.condition.comparison = (watch_condition & 1) == 0 ? 
"LT" : "GT"; watchpoint_item.check_node_list = check_node_list; watchpoint_item.parameter_list = parameter_list; watchpoint_table[id] = watchpoint_item; @@ -62,77 +61,14 @@ void DebugServices::RemoveWatchpoint(unsigned int id) { watchpoint_table.erase(id); } -template -DebugServices::tensor_stats DebugServices::SummarizeTensor(const T *start, const T *start_prev, unsigned int n, - bool need_min_max, bool need_mean_sd, - bool need_zero_percentage, - bool need_tensor_update_ratio_mean, bool need_allclose, - bool need_abs_mean) { - tensor_stats stats; - double zero_count = 0.0; - double rtol = 1.0e-5; - double atol = 1.0e-8; - double update_ratio_sum = 0.0; - double epsilon = 1.0e-9; - for (unsigned int i = 0; i < n; ++i) { - auto val = static_cast(start[i]); - double val_prev = 0.0; - if (start_prev) { - val_prev = static_cast(start_prev[i]); - } - stats.has_nan = stats.has_nan || std::isnan(val); - stats.has_inf = stats.has_inf || std::isinf(val); - if (stats.has_inf && stats.has_nan) { - // other statistics don't make sense in this case - break; - } - - if (need_min_max) { - stats.min = std::min(stats.min, val); - stats.max = std::max(stats.max, val); - } - - if (need_mean_sd) { - double delta = val - stats.mean; - stats.mean += delta / (i + 1); - stats.m2 += delta * (val - stats.mean); - } - - if (need_abs_mean) { - double delta = std::abs(val) - stats.abs_mean; - stats.abs_mean += delta / (i + 1); - } - - if (need_zero_percentage) { - if (val == 0) zero_count++; - } - - if (need_tensor_update_ratio_mean && start_prev) { - update_ratio_sum += (std::abs(val - val_prev) / (epsilon + std::abs(val_prev))); - } - - if (need_allclose && start_prev) { - stats.allclose &= (std::abs(val - val_prev) <= (atol + rtol * std::abs(val_prev))); - } - } - if (need_tensor_update_ratio_mean && start_prev) { - stats.tensor_update_ratio_mean = (update_ratio_sum / n); - } - stats.zero_percentage = (zero_count / n) * 100; - stats.n = n; - return stats; -} - void 
DebugServices::CheckWatchpoints(std::vector *name, std::vector *slot, std::vector *condition, std::vector *watchpoint_id, std::vector> *parameters, - const std::vector &op_overflows, + std::vector *error_codes, const std::vector &op_overflows, const std::vector> &tensor_list, const bool init_dbg_suspend) { std::lock_guard lg(lock_); - if (watchpoint_table.empty()) { - return; - } + if (watchpoint_table.empty()) return; for (const auto &tensor : tensor_list) { const auto tensor_name = tensor->GetName(); @@ -140,268 +76,113 @@ void DebugServices::CheckWatchpoints(std::vector *name, std::vector const auto tensor_slot = std::to_string(tensor->GetSlot()); mindspore::tensor::TensorPtr tensor_ptr = tensor->GetTensor(); int tensor_dtype = tensor_ptr->data_type_c(); - std::vector hit_encountered; - std::vector> hit_parms; - std::unordered_map watchpoints_to_check_table; - bool min_max_enabled = false; - bool mean_sd_enabled = false; - bool inf_nan_enabled = false; - bool zero_percentage_enabled = false; - bool tensor_update_ratio_mean_enabled = false; - bool allclose_enabled = false; - bool abs_mean_enabled = false; + std::vector watchpoints_to_check; + std::string qualified_tensor_name; for (auto w_table_item : watchpoint_table) { auto wp = std::get<1>(w_table_item); if (wp.condition.type == INIT && !init_dbg_suspend) continue; if (wp.condition.type != IS_OVERFLOW && tensor_dtype == kNumberTypeBool) continue; - if (wp.IsNodeIncluded(tensor_name_no_slot)) { - min_max_enabled |= wp.min_max_enabled(); - mean_sd_enabled |= wp.mean_sd_enabled(); - inf_nan_enabled |= wp.inf_nan_enabled(); - zero_percentage_enabled |= wp.zero_percentage_enabled(); - tensor_update_ratio_mean_enabled |= wp.tensor_update_ratio_mean_enabled(); - allclose_enabled |= wp.allclose_enabled(); - abs_mean_enabled |= wp.abs_mean_enabled(); - watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second; + std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot); + if (!found.empty()) { 
+ qualified_tensor_name = found; + watchpoints_to_check.push_back(w_table_item.second); } } - tensor_stats stats; - uint num_elements = tensor_ptr->DataSize(); - if (min_max_enabled || mean_sd_enabled || inf_nan_enabled || zero_percentage_enabled || - tensor_update_ratio_mean_enabled || allclose_enabled || abs_mean_enabled) { - bool need_prev = (tensor_update_ratio_mean_enabled || allclose_enabled); - bool have_prev = tensor_loader_->GetPrevTensor(tensor_name) != NULL; + // no wp set on current tensor + if (watchpoints_to_check.empty()) continue; + + uint32_t num_elements = tensor_ptr->DataSize(); + void *previous_tensor_ptr = tensor_loader_->GetPrevTensor(tensor_name) + ? tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c() + : nullptr; + std::unique_ptr base_summary_ptr; + if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) { switch (tensor_dtype) { case kNumberTypeUInt8: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeInt8: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? 
reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeUInt16: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeInt16: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeUInt32: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? 
reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeInt32: case kNumberTypeInt: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeUInt64: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeInt64: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? 
reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeFloat16: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeFloat32: case kNumberTypeFloat: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } case kNumberTypeFloat64: { - auto start_addr = reinterpret_cast(tensor_ptr->data_c()); - auto start_addr_prev = - (need_prev && have_prev - ? 
reinterpret_cast(tensor_loader_->GetPrevTensor(tensor_name)->GetTensor()->data_c()) - : NULL); - stats = SummarizeTensor(start_addr, start_addr_prev, num_elements, min_max_enabled, mean_sd_enabled, - zero_percentage_enabled, tensor_update_ratio_mean_enabled, allclose_enabled, - abs_mean_enabled); + base_summary_ptr = + std::make_unique>(tensor_ptr->data_c(), previous_tensor_ptr, num_elements); break; } default: MS_LOG(INFO) << "Unsupported tensor type"; break; } + base_summary_ptr->SummarizeTensor(watchpoints_to_check); } - for (auto &it : watchpoints_to_check_table) { - auto wp_id = it.second.id; - std::vector hit_p; - CONDITION_TYPE enabled_condition = it.second.condition.type; - bool hit = (enabled_condition == HAS_NAN && stats.has_nan) || (enabled_condition == HAS_INF && stats.has_inf) || - (enabled_condition == GENERAL_OVERFLOW && (stats.has_nan || stats.has_inf)) || - (enabled_condition == IS_OVERFLOW && - std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end()); - - if (enabled_condition > 2 && enabled_condition != GENERAL_OVERFLOW) { - if (stats.has_inf || stats.has_nan) { - MS_LOG(WARNING) << "NaN or/and INF present in tensor: " << tensor_name << ". Cannot check " - << condition_label[enabled_condition] << " watchpoint."; - } else if (enabled_condition < 13) { - bool gt = stats.statLookup(enabled_condition) > it.second.condition.parameter; - bool lt = stats.statLookup(enabled_condition) < it.second.condition.parameter; - hit |= it.second.condition.comparison == "GT" ? 
gt : lt; - } else { - std::vector parameter_list_item = it.second.parameter_list; - for (auto &p : parameter_list_item) { - if (p.disabled == false) { - bool p_hit = false; - if (p.name == "zero_percentage_ge") { - p_hit = stats.parmLookup(STAT_ZERO_PERCENTAGE) >= p.value; - } else if (p.name == "max_gt") { - p_hit = stats.parmLookup(STAT_MAX) > p.value; - } else if (p.name == "max_lt") { - p_hit = stats.parmLookup(STAT_MAX) < p.value; - } else if (p.name == "min_gt") { - p_hit = stats.parmLookup(STAT_MIN) > p.value; - } else if (p.name == "min_lt") { - p_hit = stats.parmLookup(STAT_MIN) < p.value; - } else if (p.name == "mean_gt") { - p_hit = stats.parmLookup(STAT_MEAN) > p.value; - } else if (p.name == "mean_lt") { - p_hit = stats.parmLookup(STAT_MEAN) < p.value; - } else if (p.name == "abs_mean_gt") { - p_hit = stats.parmLookup(STAT_ABS_MEAN) > p.value; - } else if (p.name == "abs_mean_lt") { - p_hit = stats.parmLookup(STAT_ABS_MEAN) < p.value; - } else if (p.name == "abs_update_ratio_mean_gt") { - p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) > p.value; - } else if (p.name == "abs_update_ratio_mean_lt") { - p_hit = stats.parmLookup(STAT_TENSOR_UPDATE_RATIO_MEAN) < p.value; - } - hit |= p_hit; - hit_p.push_back(p_hit); - } else { - hit_p.push_back(false); - } - } - - hit |= (enabled_condition == NOT_CHANGED && stats.parmLookup(STAT_ALLCLOSE)); - - if (hit) hit_parms.push_back(hit_p); - } + for (auto &wp : watchpoints_to_check) { + bool is_hit = false; + int error_code = 0; + std::vector parameter_list = {}; + if (wp.condition.type == IS_OVERFLOW) { + is_hit = (std::find(op_overflows.begin(), op_overflows.end(), tensor_name_no_slot) != op_overflows.end()); + } else { + auto item = base_summary_ptr->IsWatchpointHit(wp); + is_hit = std::get<0>(item); + error_code = std::get<1>(item); + parameter_list = std::get<2>(item); } - if (hit) hit_encountered.push_back(wp_id); - } - - unsigned int index_parm_list = 0; - for (auto it_hit_id = 
hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) { - if (watchpoint_table.find(*it_hit_id) != watchpoint_table.end()) { - // return fully qualified name for weights and bias to MI - auto found_dot = tensor_name_no_slot.find_last_of('.'); - if (found_dot != std::string::npos && (tensor_name_no_slot.substr(found_dot + 1) == "weight" || - tensor_name_no_slot.substr(found_dot + 1) == "bias")) { - auto check_node_list = watchpoint_table.find(*it_hit_id)->second.check_node_list; - bool found_match = false; - for (auto check_node : check_node_list) { - std::string w_name = std::get<0>(check_node); - auto found_slash = w_name.find_last_of('/'); - if (found_slash != std::string::npos && w_name.substr(found_slash + 1) == tensor_name_no_slot) { - name->push_back(w_name); - found_match = true; - break; - } - } - if (!found_match) { - name->push_back(tensor_name_no_slot); - } - } else { - name->push_back(tensor_name_no_slot); - } + if (is_hit || error_code) { + name->push_back(qualified_tensor_name); slot->push_back(tensor_slot); - int condition_item = watchpoint_table.find(*it_hit_id)->second.condition.type; - condition->push_back(condition_item); - watchpoint_id->push_back(*it_hit_id); - std::vector parameter_list_item = watchpoint_table.find(*it_hit_id)->second.parameter_list; - if (condition_item >= 13) { - unsigned int index_hit_parm = 0; - for (auto &p : parameter_list_item) { - p.hit = hit_parms[index_parm_list][index_hit_parm]; - index_hit_parm++; - } - index_parm_list++; - } - parameters->push_back(parameter_list_item); + condition->push_back(wp.condition.type); + watchpoint_id->push_back(wp.id); + parameters->push_back(parameter_list); + error_codes->push_back(error_code); } - watchpoints_to_check_table.erase(*it_hit_id); } } } diff --git a/mindspore/ccsrc/debug/debug_services.h b/mindspore/ccsrc/debug/debug_services.h index e694de9482..14e1612dcf 100644 --- a/mindspore/ccsrc/debug/debug_services.h +++ b/mindspore/ccsrc/debug/debug_services.h 
@@ -23,6 +23,7 @@ #include #include #include +#include #include #include "debug/tensor_load.h" #include "debug/tensor_data.h" @@ -60,23 +61,13 @@ class DebugServices { ALL_ZERO, CHANGE_TOO_LARGE, CHANGE_TOO_SMALL, - NOT_CHANGED - }; - - enum STAT_TYPE { - STAT_MIN, - STAT_MAX, - STAT_MEAN, - STAT_ZERO_PERCENTAGE, - STAT_TENSOR_UPDATE_RATIO_MEAN, - STAT_ALLCLOSE, - STAT_ABS_MEAN + NOT_CHANGED, + RANGE }; typedef struct condition { CONDITION_TYPE type; float parameter = 0; - std::string comparison; } condition_t; typedef struct parameter { @@ -84,6 +75,25 @@ class DebugServices { bool disabled; double_t value; bool hit; + double_t actual_value; + void Evaluate(double_t actualValue, std::string inequality_type) { + if (std::isnan(actualValue)) return; + + actual_value = actualValue; + if (inequality_type.empty()) { + auto pos = name.find_last_of('_'); + if (pos != std::string::npos) { + inequality_type = name.substr(pos + 1); + } + } + + std::map condition_check{{"gt", actual_value > value}, + {"lt", actual_value < value}, + {"ge", actual_value >= value}, + {"le", actual_value <= value}}; + + hit = condition_check[inequality_type]; + } } parameter_t; typedef struct watchpoint { @@ -93,18 +103,28 @@ class DebugServices { std::vector parameter_list; size_t location = 0; - bool IsNodeIncluded(const std::string &tensor_name) { + std::string FindQualifiedTensorName(const std::string &tensor_name) { std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':')); for (auto check_node : check_node_list) { std::string w_name = std::get<0>(check_node); bool w_type = std::get<1>(check_node); auto found = w_name.find_last_of('/'); - if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true; + if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return w_name; if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) { - return true; + return w_name; } } - return 
false; + return {}; + } + + bool is_gt_wp() { + return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT || + condition.type == SD_GT || condition.type == MAX_MIN_GT; + } + + bool is_lt_wp() { + return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT || + condition.type == SD_LT || condition.type == MAX_MIN_LT; } bool min_max_enabled() { @@ -119,67 +139,26 @@ class DebugServices { return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW; } // mean or sd related condition set - bool mean_sd_enabled() { + bool mean_sd_enabled() const { return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT || condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) || (condition.type == TOO_SMALL && !parameter_list[3].disabled); } - bool abs_mean_enabled() { + bool abs_mean_enabled() const { return (condition.type == TOO_LARGE && !parameter_list[0].disabled) || (condition.type == TOO_SMALL && !parameter_list[0].disabled); } bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; } - bool tensor_update_ratio_mean_enabled() { - return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL; - } - bool allclose_enabled() { return condition.type == NOT_CHANGED; } - } watchpoint_t; - struct tensor_stats { - double min = std::numeric_limits::max(); - double max = std::numeric_limits::lowest(); - bool has_inf = false; - bool has_nan = false; - unsigned int n = 0; - double mean = 0.0; - double m2 = 0.0; - double zero_percentage = 0.0; - double tensor_update_ratio_mean = -1; - bool allclose = false; - double abs_mean = 0.0; - - double statLookup(CONDITION_TYPE type) const { - if (type == MAX_GT || type == MAX_LT) return max; - if (type == MIN_GT || type == MIN_LT) return min; - if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min); - if (type == MEAN_GT || 
type == MEAN_LT) return mean; - if (type == SD_GT || type == SD_LT) return getStandardDeviation(); - return std::numeric_limits::quiet_NaN(); + bool tensor_update_ratio_mean_enabled() const { + return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL; } + bool allclose_enabled() const { return condition.type == NOT_CHANGED; } - double parmLookup(STAT_TYPE type) const { - if (type == STAT_MAX) return max; - if (type == STAT_MIN) return min; - if (type == STAT_MEAN) return mean; - if (type == STAT_ZERO_PERCENTAGE) return zero_percentage; - if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean; - if (type == STAT_ALLCLOSE) return allclose; - if (type == STAT_ABS_MEAN) return abs_mean; - return std::numeric_limits::quiet_NaN(); + bool range_enabled() const { + return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled); } - - double getMean() const { return mean; } - - double getVariance() const { - if (n > 1) { - return m2 / (n - 1); - } else { - return 0.0; - } - } - - double getStandardDeviation() const { return sqrt(getVariance()); } - }; + } watchpoint_t; void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, const std::vector> &check_node_list, @@ -189,7 +168,7 @@ class DebugServices { void CheckWatchpoints(std::vector *name, std::vector *slot, std::vector *condition, std::vector *watchpoint_id, std::vector> *parameters, - const std::vector &op_overflows, + std::vector *error_code, const std::vector &op_overflows, const std::vector> &tensor_list, bool init_dbg_suspend); void ReadNodesTensors(std::vector name, std::vector *ret_name, @@ -210,19 +189,8 @@ class DebugServices { std::mutex lock_; std::unordered_map watchpoint_table; - std::vector condition_label = { - "HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT", - "MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT", - "MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT", - "TOO_LARGE", 
"TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL", - "NOT_CHANGED"}; TensorLoader *tensor_loader_; - - template - static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max, - bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean, - bool need_allclose, bool need_abs_mean_sd); }; } // namespace mindspore diff --git a/mindspore/ccsrc/debug/debugger/debug_grpc.proto b/mindspore/ccsrc/debug/debugger/debug_grpc.proto index bddc725d81..00eb81bd4d 100644 --- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto +++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto @@ -37,14 +37,14 @@ message Metadata { // the full name of current node string cur_node = 4; // check if training is done. - bool training_done = 5; + bool training_done = 5; // the number of total graphs int32 graph_num = 6; } message Chunk { - bytes buffer = 1; - bool finished = 2; + bytes buffer = 1; + bool finished = 2; } message EventReply { @@ -108,6 +108,7 @@ message WatchCondition { tensor_change_too_large = 18; tensor_change_too_small = 19; tensor_not_changed = 20; + tensor_range = 21; } Condition condition = 1; float value = 2; @@ -116,6 +117,7 @@ message WatchCondition { bool disabled = 2; double value = 3; bool hit = 4; // Whether this parameter is hit when checking tensor. 
+ double actual_value = 5; } repeated Parameter params = 4; } @@ -129,4 +131,5 @@ message WatchpointHit { TensorProto tensor = 1; WatchCondition watch_condition = 2; int32 id = 3; + int32 error_code = 4; } diff --git a/mindspore/ccsrc/debug/debugger/debugger.cc b/mindspore/ccsrc/debug/debugger/debugger.cc index acf7d4c8a1..6beb21f92f 100644 --- a/mindspore/ccsrc/debug/debugger/debugger.cc +++ b/mindspore/ccsrc/debug/debugger/debugger.cc @@ -757,6 +757,7 @@ std::list Debugger::CheckWatchpoints(const std::string &watchnode std::vector watchpoint_id; std::vector overflow_ops; std::vector> parameters; + std::vector error_codes; #ifdef ENABLE_D overflow_ops = CheckOpOverflow(); #endif @@ -768,14 +769,14 @@ std::list Debugger::CheckWatchpoints(const std::string &watchnode tensor_list = tensor_loader->GetNodeTensorMap(watchnode); debug_services_->AddWeightsBiasInputs(&tensor_list, kernel); } - debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, ¶meters, overflow_ops, tensor_list, - initial_suspend_); + debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, ¶meters, &error_codes, overflow_ops, + tensor_list, initial_suspend_); std::list hits; for (unsigned int i = 0; i < name.size(); i++) { WatchpointHit hit; std::vector ¶meter = parameters[i]; hit.set_id(watchpoint_id[i]); - + hit.set_error_code(error_codes[i]); // here TensorProto act as a tensor indicator, not sending tensor content TensorProto *tensor_item = hit.mutable_tensor(); tensor_item->set_node_name(name[i]); @@ -790,6 +791,7 @@ std::list Debugger::CheckWatchpoints(const std::string &watchnode x->set_disabled(p.disabled); x->set_value(p.value); x->set_hit(p.hit); + x->set_actual_value(p.actual_value); } hits.push_back(hit); } diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.cc b/mindspore/ccsrc/debug/debugger/tensor_summary.cc new file mode 100644 index 0000000000..4cdeb1564a --- /dev/null +++ b/mindspore/ccsrc/debug/debugger/tensor_summary.cc @@ -0,0 +1,268 
@@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include "debug/debugger/tensor_summary.h" + +namespace mindspore { +using CONDITION_TYPE = DebugServices::CONDITION_TYPE; + +RangeCountCalculator::RangeCountCalculator() + : range_start_inclusive(-std::numeric_limits::infinity()), + range_end_inclusive(std::numeric_limits::infinity()), + count(0), + total(0) {} + +void RangeCountCalculator::ProcessElement(double element) { + count += (element >= range_start_inclusive && element <= range_end_inclusive); + total += 1; +} + +double RangeCountCalculator::GetPercentInRange() { + if (total == 0) { + return 0.0; + } + return 100.0 * count / total; +} + +AllCloseCalculator::AllCloseCalculator() : atol(1.0e-8), rtol(1.0e-5), result(true) {} + +void AllCloseCalculator::ProcessElement(double current, double previous) { + result &= (std::abs(current - previous) <= (atol + rtol * std::abs(previous))); +} + +bool AllCloseCalculator::IsAllClose() { return result; } + +MeanCalculator::MeanCalculator() : mean(0.0), count(0) {} + +void MeanCalculator::ProcessElement(double value) { + count += 1; + double delta = value - mean; + mean += delta / count; +} + +double MeanCalculator::GetMean() { return mean; } + +VarianceAndMeanCalculator::VarianceAndMeanCalculator() : mean(0.0), count(0), m2(0.0) {} + +void VarianceAndMeanCalculator::ProcessElement(double 
value) { + count += 1; + double delta = value - mean; + mean += delta / count; + m2 += delta * (value - mean); +} + +double VarianceAndMeanCalculator::GetMean() { return mean; } + +double VarianceAndMeanCalculator::GetVariance() { + if (count > 1) { + return m2 / (count - 1); + } else { + return 0.0; + } +} + +double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVariance()); } + +template +TensorSummary::TensorSummary(void *current_tensor_ptr, void *previous_tensor_ptr, uint32_t num_elements) + : current_tensor_ptr(reinterpret_cast(current_tensor_ptr)), + prev_tensor_ptr(reinterpret_cast(previous_tensor_ptr)), + num_elements(num_elements), + min(std::numeric_limits::max()), + max(std::numeric_limits::lowest()), + inf_count(0), + nan_count(0), + zero_count(0), + epsilon(1.0e-9), + mean_sd_cal_enabled(false) {} + +template +void TensorSummary::SummarizeTensor(const std::vector &wps) { + InitCalculators(wps); + for (size_t i = 0; i < num_elements; ++i) { + auto current_value = static_cast(current_tensor_ptr[i]); + double previous_value = + prev_tensor_ptr ? 
static_cast(prev_tensor_ptr[i]) : std::numeric_limits::quiet_NaN(); + inf_count += std::isinf(current_value); + nan_count += std::isnan(current_value); + zero_count += (current_value == 0); + max = std::max(max, current_value); + min = std::min(min, current_value); + if (mean_sd_cal_enabled) { + current_mean_variance.ProcessElement(current_value); + } + for (auto &it : all_close) { + it.second->ProcessElement(current_value, previous_value); + } + for (auto &range_count : range_counts) { + range_count.second->ProcessElement(current_value); + } + for (auto &mean : means) { + if (mean.first == "curr_prev_diff_mean") { + mean.second->ProcessElement(std::abs(current_value - previous_value)); + } else if (mean.first == "abs_prev_mean") { + mean.second->ProcessElement(std::abs(previous_value)); + } else if (mean.first == "abs_current_mean") { + mean.second->ProcessElement(std::abs(current_value)); + } + } + } +} + +template +std::tuple> TensorSummary::IsWatchpointHit( + DebugServices::watchpoint_t wp) { + auto parameter_list = wp.parameter_list; + bool hit = false; + std::bitset<32> error_code; + CONDITION_TYPE type = wp.condition.type; + + error_code.set(0, nan_count > 0); + error_code.set(1, inf_count > 0); + + if (type == CONDITION_TYPE::HAS_NAN) { + error_code.reset(); + hit = nan_count > 0; + } else if (type == CONDITION_TYPE::HAS_INF) { + error_code.reset(); + hit = inf_count > 0; + } else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) { + error_code.reset(); + hit = (nan_count + inf_count) > 0; + } else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr && error_code.none()) { + hit = all_close[wp.id]->IsAllClose(); + } + + for (auto ¶meter : parameter_list) { + if (parameter.disabled || error_code.any()) { + continue; + } + std::string inequality_type; + if (wp.is_gt_wp()) { + inequality_type = "gt"; + } else if (wp.is_lt_wp()) { + inequality_type = "lt"; + } + parameter.Evaluate(StatLookup(parameter.name, wp), inequality_type); + hit |= parameter.hit; + } 
+ return std::make_tuple(hit, static_cast(error_code.to_ulong()), parameter_list); +} + +template +double_t TensorSummary::StatLookup(const std::string ¶meter_name, const DebugServices::watchpoint_t &wp) { + if (parameter_name == "param") return StatLookup(wp); + std::string param_type; + auto pos = parameter_name.find_last_of('_'); + if (pos != std::string::npos) { + param_type = parameter_name.substr(0, pos); + } + + if (param_type == "max") { + return max; + } else if (param_type == "min") { + return min; + } else if (param_type == "max_min") { + return max - min; + } else if (param_type == "mean") { + return current_mean_variance.GetMean(); + } else if (param_type == "sd") { + return current_mean_variance.GetStandardDeviation(); + } else if (param_type == "abs_mean") { + return means["abs_current_mean"]->GetMean(); + } else if (param_type == "abs_mean_update_ratio") { + return means["curr_prev_diff_mean"]->GetMean() / (means["abs_prev_mean"]->GetMean() + epsilon); + } else if (param_type == "range_percentage") { + return range_counts[wp.id]->GetPercentInRange(); + } else if (param_type == "zero_percentage") { + return GetZeroValPercent(); + } + return std::numeric_limits::quiet_NaN(); +} + +template +double_t TensorSummary::StatLookup(const DebugServices::watchpoint_t &wp) { + CONDITION_TYPE type = wp.condition.type; + if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) { + return max; + } else if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) { + return min; + } else if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) { + return current_mean_variance.GetMean(); + } else if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) { + return current_mean_variance.GetStandardDeviation(); + } else if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) { + return max - min; + } + return std::numeric_limits::quiet_NaN(); +} + +template +double_t 
TensorSummary::GetZeroValPercent() {  // share of zero_count in num_elements, scaled to 0-100
+  if (num_elements == 0) {  // nothing counted: avoid dividing by zero
+    return 0;
+  }
+
+  return (zero_count * 100.0) / num_elements;
+}
+// Creates the calculators each watchpoint in `wps` needs; at most one branch runs per watchpoint.
+template
+void TensorSummary::InitCalculators(const std::vector &wps) {
+  for (auto &wp : wps) {
+    auto wp_id = wp.id;
+    mean_sd_cal_enabled |= wp.mean_sd_enabled();  // sticky: stays on once any watchpoint asks for it
+    if (wp.allclose_enabled() && prev_tensor_ptr) {  // only set up when a previous tensor is present
+      all_close[wp_id] = std::make_unique();
+      if (!wp.parameter_list[0].disabled) {  // NOTE(review): assumes parameter_list has >= 2 entries - confirm
+        all_close[wp_id]->set_atol(wp.parameter_list[0].value);
+      }
+      if (!wp.parameter_list[1].disabled) {
+        all_close[wp_id]->set_rtol(wp.parameter_list[1].value);
+      }
+    } else if (wp.range_enabled()) {
+      range_counts[wp_id] = std::make_unique();
+      if (!wp.parameter_list[0].disabled) {  // NOTE(review): same size assumption as above - confirm
+        range_counts[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
+      }
+      if (!wp.parameter_list[1].disabled) {
+        range_counts[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
+      }
+    } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr) {  // also needs a previous tensor
+      means.insert({"curr_prev_diff_mean", std::make_unique()});  // insert() keeps an already existing entry
+      means.insert({"abs_prev_mean", std::make_unique()});
+    } else if (wp.abs_mean_enabled()) {
+      means.insert({"abs_current_mean", std::make_unique()});  // shared by name, not per watchpoint id
+    }
+  }
+}
+template class TensorSummary;  // explicit instantiations of TensorSummary follow
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+template class TensorSummary;
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/debugger/tensor_summary.h b/mindspore/ccsrc/debug/debugger/tensor_summary.h
new file mode 100644
index 0000000000..84d98704a0
--- /dev/null
+++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h
@@ -0,0 +1,120 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in
compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_TENSOR_SUMMARY_H
+#define MINDSPORE_TENSOR_SUMMARY_H
+
+#include
+#include
+#include
+#include
+#include
+
+#include "debug/debug_services.h"
+
+namespace mindspore {
+class RangeCountCalculator {  // fed element by element; supplies the "range_percentage" statistic
+ public:
+  RangeCountCalculator();
+  void ProcessElement(double element);
+  double GetPercentInRange();  // NOTE(review): presumably derived from count and total - definition not shown here
+  void set_range_start_inclusive(double value) { range_start_inclusive = value; }
+  void set_range_end_inclusive(double value) { range_end_inclusive = value; }
+
+ private:
+  double range_start_inclusive;
+  double range_end_inclusive;
+  int count;  // NOTE(review): presumably elements inside the range - confirm in tensor_summary.cc
+  int total;  // NOTE(review): presumably elements seen; int although tensor sizes are uint32_t - confirm
+};
+// Fed (current, previous) value pairs; IsAllClose() backs the NOT_CHANGED watchpoint check.
+class AllCloseCalculator {
+ public:
+  AllCloseCalculator();
+  void ProcessElement(double current, double previous);
+  bool IsAllClose();
+  void set_atol(double value) { atol = value; }
+  void set_rtol(double value) { rtol = value; }
+
+ private:
+  double atol;  // presumably the absolute tolerance - confirm in tensor_summary.cc
+  double rtol;  // presumably the relative tolerance - confirm in tensor_summary.cc
+  bool result;
+};
+// Fed one value at a time; GetMean() supplies the abs-mean and update-ratio statistics.
+class MeanCalculator {
+ public:
+  MeanCalculator();
+  void ProcessElement(double value);
+  double GetMean();
+
+ protected:
+  double mean;
+  int count;
+};
+// Fed one value at a time; supplies the "mean" and "sd" statistics of the current tensor.
+class VarianceAndMeanCalculator {
+ public:
+  VarianceAndMeanCalculator();
+  void ProcessElement(double value);
+  double GetStandardDeviation();
+  double GetVariance();
+  double GetMean();
+
+ private:
+  double mean;
+  int count;
+  double m2;  // NOTE(review): presumably a running sum of squared deviations - confirm
+};
+// Element-type-independent interface that TensorSummary derives from.
+class ITensorSummary {
+ public:
+  virtual ~ITensorSummary() = default;
+  virtual void SummarizeTensor(const std::vector &) = 0;
+  virtual std::tuple> IsWatchpointHit(
+  DebugServices::watchpoint_t) = 0;
+};
+// Statistics and watchpoint evaluation for one tensor whose elements are of type T.
+template
+class TensorSummary :
public ITensorSummary { + public: + TensorSummary() = default; + TensorSummary(void *, void *, uint32_t); + void SummarizeTensor(const std::vector &) override; + // returns hit, error_code, parameter_list + std::tuple> IsWatchpointHit(DebugServices::watchpoint_t) override; + + private: + T *current_tensor_ptr; + T *prev_tensor_ptr; + uint32_t num_elements; + double min; + double max; + uint32_t inf_count; + uint32_t nan_count; + uint32_t zero_count; + double epsilon; + bool mean_sd_cal_enabled; + VarianceAndMeanCalculator current_mean_variance; + std::unordered_map> means; + std::unordered_map> all_close; + std::unordered_map> range_counts; + double_t StatLookup(const DebugServices::watchpoint_t &); + double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &); + double_t GetZeroValPercent(); + void InitCalculators(const std::vector &); +}; +} // namespace mindspore +#endif // MINDSPORE_TENSOR_SUMMARY_H diff --git a/mindspore/ccsrc/debug/tensor_load.h b/mindspore/ccsrc/debug/tensor_load.h index ff1289751a..cd690b1452 100644 --- a/mindspore/ccsrc/debug/tensor_load.h +++ b/mindspore/ccsrc/debug/tensor_load.h @@ -56,7 +56,12 @@ class TensorLoader { std::map> GetTensorMap() { return tensor_list_map; } - std::shared_ptr GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; } + std::shared_ptr GetPrevTensor(std::string tensor_name) { + if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) { + return tensor_list_map[tensor_name + ":prev"]; + } + return nullptr; + } std::vector> GetNodeTensorMap(std::string node_name) { std::vector> tensors;