pull/9033/head
Harshvardhan Gupta 4 years ago
parent c4d2f41829
commit 561f9082e9

@ -12,6 +12,7 @@ if (ENABLE_DEBUGGER)
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc" "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
) )
endif (ENABLE_DEBUGGER) endif (ENABLE_DEBUGGER)

File diff suppressed because it is too large Load Diff

@ -23,6 +23,7 @@
#include <tuple> #include <tuple>
#include <unordered_map> #include <unordered_map>
#include <mutex> #include <mutex>
#include <map>
#include <limits> #include <limits>
#include "debug/tensor_load.h" #include "debug/tensor_load.h"
#include "debug/tensor_data.h" #include "debug/tensor_data.h"
@ -60,23 +61,13 @@ class DebugServices {
ALL_ZERO, ALL_ZERO,
CHANGE_TOO_LARGE, CHANGE_TOO_LARGE,
CHANGE_TOO_SMALL, CHANGE_TOO_SMALL,
NOT_CHANGED NOT_CHANGED,
}; RANGE
enum STAT_TYPE {
STAT_MIN,
STAT_MAX,
STAT_MEAN,
STAT_ZERO_PERCENTAGE,
STAT_TENSOR_UPDATE_RATIO_MEAN,
STAT_ALLCLOSE,
STAT_ABS_MEAN
}; };
typedef struct condition { typedef struct condition {
CONDITION_TYPE type; CONDITION_TYPE type;
float parameter = 0; float parameter = 0;
std::string comparison;
} condition_t; } condition_t;
typedef struct parameter { typedef struct parameter {
@ -84,6 +75,25 @@ class DebugServices {
bool disabled; bool disabled;
double_t value; double_t value;
bool hit; bool hit;
double_t actual_value;
void Evaluate(double_t actualValue, std::string inequality_type) {
if (std::isnan(actualValue)) return;
actual_value = actualValue;
if (inequality_type.empty()) {
auto pos = name.find_last_of('_');
if (pos != std::string::npos) {
inequality_type = name.substr(pos + 1);
}
}
std::map<std::string, bool> condition_check{{"gt", actual_value > value},
{"lt", actual_value < value},
{"ge", actual_value >= value},
{"le", actual_value <= value}};
hit = condition_check[inequality_type];
}
} parameter_t; } parameter_t;
typedef struct watchpoint { typedef struct watchpoint {
@ -93,18 +103,28 @@ class DebugServices {
std::vector<parameter_t> parameter_list; std::vector<parameter_t> parameter_list;
size_t location = 0; size_t location = 0;
bool IsNodeIncluded(const std::string &tensor_name) { std::string FindQualifiedTensorName(const std::string &tensor_name) {
std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':')); std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
for (auto check_node : check_node_list) { for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node); std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node); bool w_type = std::get<1>(check_node);
auto found = w_name.find_last_of('/'); auto found = w_name.find_last_of('/');
if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true; if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return w_name;
if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) { if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
return true; return w_name;
}
}
return {};
} }
bool is_gt_wp() {
return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT ||
condition.type == SD_GT || condition.type == MAX_MIN_GT;
} }
return false;
bool is_lt_wp() {
return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT ||
condition.type == SD_LT || condition.type == MAX_MIN_LT;
} }
bool min_max_enabled() { bool min_max_enabled() {
@ -119,67 +139,26 @@ class DebugServices {
return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW; return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW;
} }
// mean or sd related condition set // mean or sd related condition set
bool mean_sd_enabled() { bool mean_sd_enabled() const {
return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT || return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) || condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) ||
(condition.type == TOO_SMALL && !parameter_list[3].disabled); (condition.type == TOO_SMALL && !parameter_list[3].disabled);
} }
bool abs_mean_enabled() { bool abs_mean_enabled() const {
return (condition.type == TOO_LARGE && !parameter_list[0].disabled) || return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
(condition.type == TOO_SMALL && !parameter_list[0].disabled); (condition.type == TOO_SMALL && !parameter_list[0].disabled);
} }
bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; } bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; }
bool tensor_update_ratio_mean_enabled() {
return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
}
bool allclose_enabled() { return condition.type == NOT_CHANGED; }
} watchpoint_t;
struct tensor_stats {
double min = std::numeric_limits<double>::max();
double max = std::numeric_limits<double>::lowest();
bool has_inf = false;
bool has_nan = false;
unsigned int n = 0;
double mean = 0.0;
double m2 = 0.0;
double zero_percentage = 0.0;
double tensor_update_ratio_mean = -1;
bool allclose = false;
double abs_mean = 0.0;
double statLookup(CONDITION_TYPE type) const {
if (type == MAX_GT || type == MAX_LT) return max;
if (type == MIN_GT || type == MIN_LT) return min;
if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min);
if (type == MEAN_GT || type == MEAN_LT) return mean;
if (type == SD_GT || type == SD_LT) return getStandardDeviation();
return std::numeric_limits<double>::quiet_NaN();
}
double parmLookup(STAT_TYPE type) const { bool tensor_update_ratio_mean_enabled() const {
if (type == STAT_MAX) return max; return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
if (type == STAT_MIN) return min;
if (type == STAT_MEAN) return mean;
if (type == STAT_ZERO_PERCENTAGE) return zero_percentage;
if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean;
if (type == STAT_ALLCLOSE) return allclose;
if (type == STAT_ABS_MEAN) return abs_mean;
return std::numeric_limits<double>::quiet_NaN();
} }
bool allclose_enabled() const { return condition.type == NOT_CHANGED; }
double getMean() const { return mean; } bool range_enabled() const {
return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
double getVariance() const {
if (n > 1) {
return m2 / (n - 1);
} else {
return 0.0;
} }
} } watchpoint_t;
double getStandardDeviation() const { return sqrt(getVariance()); }
};
void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter, void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<std::tuple<std::string, bool>> &check_node_list,
@ -189,7 +168,7 @@ class DebugServices {
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition, void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters, std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
const std::vector<std::string> &op_overflows, std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend); const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name, void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
@ -210,19 +189,8 @@ class DebugServices {
std::mutex lock_; std::mutex lock_;
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table; std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
std::vector<std::string> condition_label = {
"HAS_NAN", "HAS_INF", "IS_OVERFLOW", "MAX_GT", "MAX_LT",
"MIN_GT", "MIN_LT", "MAX_MIN_GT", "MAX_MIN_LT", "MEAN_GT",
"MEAN_LT", "SD_GT", "SD_LT", "GENERAL_OVERFLOW", "INIT",
"TOO_LARGE", "TOO_SMALL", "ALL_ZERO", "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL",
"NOT_CHANGED"};
TensorLoader *tensor_loader_; TensorLoader *tensor_loader_;
template <typename T>
static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max,
bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean,
bool need_allclose, bool need_abs_mean_sd);
}; };
} // namespace mindspore } // namespace mindspore

@ -108,6 +108,7 @@ message WatchCondition {
tensor_change_too_large = 18; tensor_change_too_large = 18;
tensor_change_too_small = 19; tensor_change_too_small = 19;
tensor_not_changed = 20; tensor_not_changed = 20;
tensor_range = 21;
} }
Condition condition = 1; Condition condition = 1;
float value = 2; float value = 2;
@ -116,6 +117,7 @@ message WatchCondition {
bool disabled = 2; bool disabled = 2;
double value = 3; double value = 3;
bool hit = 4; // Whether this parameter is hit when checking tensor. bool hit = 4; // Whether this parameter is hit when checking tensor.
double actual_value = 5;
} }
repeated Parameter params = 4; repeated Parameter params = 4;
} }
@ -129,4 +131,5 @@ message WatchpointHit {
TensorProto tensor = 1; TensorProto tensor = 1;
WatchCondition watch_condition = 2; WatchCondition watch_condition = 2;
int32 id = 3; int32 id = 3;
int32 error_code = 4;
} }

@ -757,6 +757,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
std::vector<unsigned int> watchpoint_id; std::vector<unsigned int> watchpoint_id;
std::vector<std::string> overflow_ops; std::vector<std::string> overflow_ops;
std::vector<std::vector<DebugServices::parameter_t>> parameters; std::vector<std::vector<DebugServices::parameter_t>> parameters;
std::vector<int32_t> error_codes;
#ifdef ENABLE_D #ifdef ENABLE_D
overflow_ops = CheckOpOverflow(); overflow_ops = CheckOpOverflow();
#endif #endif
@ -768,14 +769,14 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
tensor_list = tensor_loader->GetNodeTensorMap(watchnode); tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
debug_services_->AddWeightsBiasInputs(&tensor_list, kernel); debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
} }
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, overflow_ops, tensor_list, debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
initial_suspend_); tensor_list, initial_suspend_);
std::list<WatchpointHit> hits; std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) { for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit; WatchpointHit hit;
std::vector<DebugServices::parameter_t> &parameter = parameters[i]; std::vector<DebugServices::parameter_t> &parameter = parameters[i];
hit.set_id(watchpoint_id[i]); hit.set_id(watchpoint_id[i]);
hit.set_error_code(error_codes[i]);
// here TensorProto act as a tensor indicator, not sending tensor content // here TensorProto act as a tensor indicator, not sending tensor content
TensorProto *tensor_item = hit.mutable_tensor(); TensorProto *tensor_item = hit.mutable_tensor();
tensor_item->set_node_name(name[i]); tensor_item->set_node_name(name[i]);
@ -790,6 +791,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
x->set_disabled(p.disabled); x->set_disabled(p.disabled);
x->set_value(p.value); x->set_value(p.value);
x->set_hit(p.hit); x->set_hit(p.hit);
x->set_actual_value(p.actual_value);
} }
hits.push_back(hit); hits.push_back(hit);
} }

File diff suppressed because it is too large Load Diff

@ -0,0 +1,120 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_TENSOR_SUMMARY_H
#define MINDSPORE_TENSOR_SUMMARY_H
#include <vector>
#include <unordered_map>
#include <tuple>
#include <memory>
#include <string>
#include "debug/debug_services.h"
namespace mindspore {
// Calculator that is fed one element at a time through ProcessElement() and
// queried through GetPercentInRange(). Only the two setters are defined in
// this header; the constructor and the other members are defined out of line.
// NOTE(review): judging by the member names this reports the share of
// processed elements lying within [range_start_inclusive, range_end_inclusive]
// -- confirm against the implementation file.
class RangeCountCalculator {
public:
// Defined out of line; the initial member values are not visible from this header.
RangeCountCalculator();
// Consumes a single tensor element.
void ProcessElement(double element);
// NOTE(review): scale of the result (0-1 vs 0-100) is not visible here -- verify.
double GetPercentInRange();
// Store the lower / upper bound of the range.
void set_range_start_inclusive(double value) { range_start_inclusive = value; }
void set_range_end_inclusive(double value) { range_end_inclusive = value; }
private:
double range_start_inclusive;
double range_end_inclusive;
// NOTE(review): presumably `count` = elements inside the range and `total` =
// elements processed so far -- confirm.
int count;
int total;
};
// Calculator that is fed (current, previous) element pairs through
// ProcessElement() and exposes a single boolean verdict through IsAllClose().
// Only the two tolerance setters are defined in this header.
// NOTE(review): the atol/rtol names suggest an absolute/relative tolerance
// closeness test between the two values -- confirm the exact formula in the
// implementation file.
class AllCloseCalculator {
public:
// Defined out of line; default tolerances are not visible from this header.
AllCloseCalculator();
// Consumes one element of the current tensor together with its previous value.
void ProcessElement(double current, double previous);
bool IsAllClose();
// Store the tolerance values.
void set_atol(double value) { atol = value; }
void set_rtol(double value) { rtol = value; }
private:
double atol;
double rtol;
// Value reported by IsAllClose() -- presumably; verify in the implementation.
bool result;
};
// Calculator that is fed one value at a time through ProcessElement() and
// queried through GetMean(). All member functions are defined out of line.
// NOTE(review): whether the mean is maintained incrementally or computed in
// GetMean() cannot be determined from this header.
class MeanCalculator {
public:
MeanCalculator();
// Consumes a single value.
void ProcessElement(double value);
double GetMean();
protected:
// Protected rather than private, so derived classes can access the state directly.
double mean;
int count;
};
// Calculator that is fed one value at a time through ProcessElement() and
// exposes mean, variance and standard deviation getters. All member functions
// are defined out of line.
// NOTE(review): the mean/count/m2 members look like the state of a
// single-pass (Welford-style) variance computation -- confirm against the
// implementation, including whether the variance is sample or population.
class VarianceAndMeanCalculator {
public:
VarianceAndMeanCalculator();
// Consumes a single value.
void ProcessElement(double value);
double GetStandardDeviation();
double GetVariance();
double GetMean();
private:
double mean;
int count;
double m2;
};
// Type-erased interface over the per-element-type TensorSummary<T> template,
// so callers can hold a summary without knowing the tensor's element type.
class ITensorSummary {
public:
// Virtual so that a summary can be destroyed through a pointer to this interface.
virtual ~ITensorSummary() = default;
// Takes the list of watchpoints the summary is being computed for.
virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
// Evaluates one watchpoint (passed by value); the tuple holds, in order:
// hit, error_code, parameter_list.
virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
DebugServices::watchpoint_t) = 0;
};
// Implementation of ITensorSummary for tensors whose elements are of type T.
// All member functions are only declared here and are defined out of line.
template <typename T>
class TensorSummary : public ITensorSummary {
 public:
  // NOTE(review): none of the pointer/scalar members below has a default
  // member initializer, so a default-constructed summary holds indeterminate
  // values until they are assigned.
  TensorSummary() = default;
  // NOTE(review): presumably (current tensor data, previous tensor data,
  // number of elements), matching the members below -- confirm against the
  // out-of-line definition.
  TensorSummary(void *, void *, uint32_t);
  void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) override;
  // returns hit, error_code, parameter_list
  // The error code is spelled int32_t so that the return type is exactly the
  // one declared by ITensorSummary::IsWatchpointHit; an override must return
  // the same type, and std::tuple<bool, int, ...> only matches when int32_t
  // happens to be an alias of int.
  std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
    DebugServices::watchpoint_t) override;

 private:
  T *current_tensor_ptr;
  T *prev_tensor_ptr;
  uint32_t num_elements;
  double min;
  double max;
  uint32_t inf_count;
  uint32_t nan_count;
  uint32_t zero_count;
  double epsilon;
  bool mean_sd_cal_enabled;
  VarianceAndMeanCalculator current_mean_variance;
  // Per-statistic calculators. `means` is keyed by string, the other two by
  // an unsigned integer.
  // NOTE(review): the integer key is presumably the watchpoint id -- verify.
  std::unordered_map<std::string, std::unique_ptr<MeanCalculator>> means;
  std::unordered_map<uint32_t, std::unique_ptr<AllCloseCalculator>> all_close;
  std::unordered_map<uint32_t, std::unique_ptr<RangeCountCalculator>> range_counts;
  double_t StatLookup(const DebugServices::watchpoint_t &);
  double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &);
  double_t GetZeroValPercent();
  void InitCalculators(const std::vector<DebugServices::watchpoint_t> &);
};
} // namespace mindspore
#endif // MINDSPORE_TENSOR_SUMMARY_H

@ -56,7 +56,12 @@ class TensorLoader {
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; } std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; } std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) {
if (tensor_list_map.find(tensor_name + ":prev") != tensor_list_map.end()) {
return tensor_list_map[tensor_name + ":prev"];
}
return nullptr;
}
std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) { std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
std::vector<std::shared_ptr<TensorData>> tensors; std::vector<std::shared_ptr<TensorData>> tensors;

Loading…
Cancel
Save