init

5 years ago · 561f9082e9
parent c4d2f41829
commit 561f9082e9
8 changed files with 510 additions and 362 deletions
--- a/mindspore/ccsrc/debug/CMakeLists.txt
+++ b/mindspore/ccsrc/debug/CMakeLists.txt
@ -12,6 +12,7 @@ if (ENABLE_DEBUGGER)
        "${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc"
        "${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
        "${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
        "${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
        "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
        )
 endif (ENABLE_DEBUGGER)
--- a/mindspore/ccsrc/debug/debug_services.cc
+++ b/mindspore/ccsrc/debug/debug_services.cc
--- a/mindspore/ccsrc/debug/debug_services.h
+++ b/mindspore/ccsrc/debug/debug_services.h
@ -23,6 +23,7 @@
 #include <tuple>
 #include <unordered_map>
 #include <mutex>
 #include <map>
 #include <limits>
 #include "debug/tensor_load.h"
 #include "debug/tensor_data.h"
@ -60,23 +61,13 @@ class DebugServices {
    ALL_ZERO,
    CHANGE_TOO_LARGE,
    CHANGE_TOO_SMALL,
-    NOT_CHANGED
+    NOT_CHANGED,
-  };
+    RANGE
  enum STAT_TYPE {
    STAT_MIN,
    STAT_MAX,
    STAT_MEAN,
    STAT_ZERO_PERCENTAGE,
    STAT_TENSOR_UPDATE_RATIO_MEAN,
    STAT_ALLCLOSE,
    STAT_ABS_MEAN
  };
  typedef struct condition {
    CONDITION_TYPE type;
    float parameter = 0;
    std::string comparison;
  } condition_t;
  typedef struct parameter {
@ -84,6 +75,25 @@ class DebugServices {
    bool disabled;
    double_t value;
    bool hit;
    double_t actual_value;
    void Evaluate(double_t actualValue, std::string inequality_type) {
      if (std::isnan(actualValue)) return;
      actual_value = actualValue;
      if (inequality_type.empty()) {
        auto pos = name.find_last_of('_');
        if (pos != std::string::npos) {
          inequality_type = name.substr(pos + 1);
        }
      }
      std::map<std::string, bool> condition_check{{"gt", actual_value > value},
                                                  {"lt", actual_value < value},
                                                  {"ge", actual_value >= value},
                                                  {"le", actual_value <= value}};
      hit = condition_check[inequality_type];
    }
  } parameter_t;
  typedef struct watchpoint {
@ -93,18 +103,28 @@ class DebugServices {
    std::vector<parameter_t> parameter_list;
    size_t location = 0;
-    bool IsNodeIncluded(const std::string &tensor_name) {
+    std::string FindQualifiedTensorName(const std::string &tensor_name) {
      // Scans check_node_list and returns the first watched name that matches tensor_name;
      // returns an empty string when no entry matches.
      // node_name is the part of tensor_name before the first ':'.
      std::string node_name = tensor_name.substr(0, tensor_name.find_first_of(':'));
      for (auto check_node : check_node_list) {
        // Each entry is a (name, flag) tuple.
        // NOTE(review): the flag presumably marks the name as a scope/prefix rather than an exact node — confirm.
        std::string w_name = std::get<0>(check_node);
        bool w_type = std::get<1>(check_node);
        auto found = w_name.find_last_of('/');
        // Match when the segment of w_name after its last '/' equals the whole tensor_name.
-        if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return true;
+        if (found != std::string::npos && w_name.substr(found + 1) == tensor_name) return w_name;
        // Flag set: match when w_name occurs in tensor_name at offset `location`, or w_name is "*".
        // Flag clear: match only when node_name equals w_name exactly.
        if ((w_type && (tensor_name.find(w_name) == location || w_name == "*")) || (!w_type && node_name == w_name)) {
-          return true;
+          return w_name;
        }
      }
      return {};
    }
    // Reports whether this watchpoint's condition is one of the "*_GT" statistic checks
    // (MAX, MIN, MEAN, SD or MAX_MIN).
    bool is_gt_wp() {
      switch (condition.type) {
        case MAX_GT:
        case MIN_GT:
        case MEAN_GT:
        case SD_GT:
        case MAX_MIN_GT:
          return true;
        default:
          return false;
      }
    }
-      return false;
+
    // Reports whether this watchpoint's condition is one of the "*_LT" statistic checks
    // (MAX, MIN, MEAN, SD or MAX_MIN).
    bool is_lt_wp() {
      switch (condition.type) {
        case MAX_LT:
        case MIN_LT:
        case MEAN_LT:
        case SD_LT:
        case MAX_MIN_LT:
          return true;
        default:
          return false;
      }
    }
    bool min_max_enabled() {
@ -119,67 +139,26 @@ class DebugServices {
      return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW;
    }
    // mean or sd related condition set
-    bool mean_sd_enabled() {
+    bool mean_sd_enabled() const {
      return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
             condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) ||
             (condition.type == TOO_SMALL && !parameter_list[3].disabled);
    }
-    bool abs_mean_enabled() {
+    bool abs_mean_enabled() const {
      return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
             (condition.type == TOO_SMALL && !parameter_list[0].disabled);
    }
    bool zero_percentage_enabled() { return condition.type == ALL_ZERO || condition.type == INIT; }
    bool tensor_update_ratio_mean_enabled() {
      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
    }
    bool allclose_enabled() { return condition.type == NOT_CHANGED; }
  } watchpoint_t;
  struct tensor_stats {
    double min = std::numeric_limits<double>::max();
    double max = std::numeric_limits<double>::lowest();
    bool has_inf = false;
    bool has_nan = false;
    unsigned int n = 0;
    double mean = 0.0;
    double m2 = 0.0;
    double zero_percentage = 0.0;
    double tensor_update_ratio_mean = -1;
    bool allclose = false;
    double abs_mean = 0.0;
    double statLookup(CONDITION_TYPE type) const {
      if (type == MAX_GT || type == MAX_LT) return max;
      if (type == MIN_GT || type == MIN_LT) return min;
      if (type == MAX_MIN_GT || type == MAX_MIN_LT) return (max - min);
      if (type == MEAN_GT || type == MEAN_LT) return mean;
      if (type == SD_GT || type == SD_LT) return getStandardDeviation();
      return std::numeric_limits<double>::quiet_NaN();
    }
-    double parmLookup(STAT_TYPE type) const {
+    bool tensor_update_ratio_mean_enabled() const {
-      if (type == STAT_MAX) return max;
+      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
      if (type == STAT_MIN) return min;
      if (type == STAT_MEAN) return mean;
      if (type == STAT_ZERO_PERCENTAGE) return zero_percentage;
      if (type == STAT_TENSOR_UPDATE_RATIO_MEAN) return tensor_update_ratio_mean;
      if (type == STAT_ALLCLOSE) return allclose;
      if (type == STAT_ABS_MEAN) return abs_mean;
      return std::numeric_limits<double>::quiet_NaN();
    }
    bool allclose_enabled() const { return condition.type == NOT_CHANGED; }
-    double getMean() const { return mean; }
+    bool range_enabled() const {
-
+      return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
    double getVariance() const {
      if (n > 1) {
        return m2 / (n - 1);
      } else {
        return 0.0;
    }
-    }
+  } watchpoint_t;
    double getStandardDeviation() const { return sqrt(getVariance()); }
  };
  void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
                     const std::vector<std::tuple<std::string, bool>> &check_node_list,
@ -189,7 +168,7 @@ class DebugServices {
  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                        std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
-                        const std::vector<std::string> &op_overflows,
+                        std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
                        const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend);
  void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
@ -210,19 +189,8 @@ class DebugServices {
  std::mutex lock_;
  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
  std::vector<std::string> condition_label = {
    "HAS_NAN",    "HAS_INF",   "IS_OVERFLOW", "MAX_GT",           "MAX_LT",
    "MIN_GT",     "MIN_LT",    "MAX_MIN_GT",  "MAX_MIN_LT",       "MEAN_GT",
    "MEAN_LT",    "SD_GT",     "SD_LT",       "GENERAL_OVERFLOW", "INIT",
    "TOO_LARGE",  "TOO_SMALL", "ALL_ZERO",    "CHANGE_TOO_LARGE", "CHANGE_TOO_SMALL",
    "NOT_CHANGED"};
  TensorLoader *tensor_loader_;
  template <typename T>
  static tensor_stats SummarizeTensor(const T *start, const T *start_prev, unsigned int n, bool need_min_max,
                                      bool need_mean_sd, bool need_zero_percentage, bool need_tensor_update_ratio_mean,
                                      bool need_allclose, bool need_abs_mean_sd);
 };
 }  // namespace mindspore
--- a/mindspore/ccsrc/debug/debugger/debug_grpc.proto
+++ b/mindspore/ccsrc/debug/debugger/debug_grpc.proto
@ -108,6 +108,7 @@ message WatchCondition {
    tensor_change_too_large = 18;
    tensor_change_too_small = 19;
    tensor_not_changed = 20;
    tensor_range = 21;
  }
  Condition condition = 1;
  float value = 2;
@ -116,6 +117,7 @@ message WatchCondition {
    bool disabled = 2;
    double value = 3;
    bool hit = 4;  // Whether this parameter is hit when checking tensor.
    double actual_value = 5;
  }
  repeated Parameter params = 4;
 }
@ -129,4 +131,5 @@ message WatchpointHit {
  TensorProto tensor = 1;
  WatchCondition watch_condition = 2;
  int32 id = 3;
  int32 error_code = 4;
 }
--- a/mindspore/ccsrc/debug/debugger/debugger.cc
+++ b/mindspore/ccsrc/debug/debugger/debugger.cc
@ -757,6 +757,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
  std::vector<unsigned int> watchpoint_id;
  std::vector<std::string> overflow_ops;
  std::vector<std::vector<DebugServices::parameter_t>> parameters;
  std::vector<int32_t> error_codes;
 #ifdef ENABLE_D
  overflow_ops = CheckOpOverflow();
 #endif
@ -768,14 +769,14 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
    tensor_list = tensor_loader->GetNodeTensorMap(watchnode);
    debug_services_->AddWeightsBiasInputs(&tensor_list, kernel);
  }
-  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, overflow_ops, tensor_list,
+  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
-                                    initial_suspend_);
+                                    tensor_list, initial_suspend_);
  std::list<WatchpointHit> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
    WatchpointHit hit;
    std::vector<DebugServices::parameter_t> &parameter = parameters[i];
    hit.set_id(watchpoint_id[i]);
-
+    hit.set_error_code(error_codes[i]);
    // here TensorProto act as a tensor indicator, not sending tensor content
    TensorProto *tensor_item = hit.mutable_tensor();
    tensor_item->set_node_name(name[i]);
@ -790,6 +791,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
      x->set_disabled(p.disabled);
      x->set_value(p.value);
      x->set_hit(p.hit);
      x->set_actual_value(p.actual_value);
    }
    hits.push_back(hit);
  }
--- a/mindspore/ccsrc/debug/debugger/tensor_summary.cc
+++ b/mindspore/ccsrc/debug/debugger/tensor_summary.cc
--- a/mindspore/ccsrc/debug/debugger/tensor_summary.h
+++ b/mindspore/ccsrc/debug/debugger/tensor_summary.h
@ -0,0 +1,120 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_TENSOR_SUMMARY_H
 #define MINDSPORE_TENSOR_SUMMARY_H
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <unordered_map>
 #include <vector>
 #include "debug/debug_services.h"
 namespace mindspore {
 // Streaming accumulator: values are fed one at a time through ProcessElement() and the
 // result is read with GetPercentInRange(). The range bounds are configured via the setters.
 // NOTE(review): presumably reports the share of processed elements that fall inside
 // [range_start_inclusive, range_end_inclusive]; the member functions are defined out of
 // line and are not visible here — confirm.
 class RangeCountCalculator {
  public:
   RangeCountCalculator();
   void ProcessElement(double element);
   double GetPercentInRange();
   // Setters for the two range bounds; they only store the given value.
   void set_range_start_inclusive(double value) { range_start_inclusive = value; }
   void set_range_end_inclusive(double value) { range_end_inclusive = value; }
  private:
   double range_start_inclusive;
   double range_end_inclusive;
   // NOTE(review): count / total look like "elements in range" / "elements seen" — confirm.
   int count;
   int total;
 };
 // Streaming accumulator over (current, previous) value pairs; IsAllClose() returns the
 // verdict after the pairs have been fed through ProcessElement().
 // NOTE(review): atol / rtol are presumably absolute and relative tolerances in the
 // numpy.allclose sense; the comparison itself is defined out of line — confirm.
 class AllCloseCalculator {
  public:
   AllCloseCalculator();
   void ProcessElement(double current, double previous);
   bool IsAllClose();
   // Setters for the two tolerances; they only store the given value.
   void set_atol(double value) { atol = value; }
   void set_rtol(double value) { rtol = value; }
  private:
   double atol;
   double rtol;
   bool result;
 };
 // Streaming accumulator: values are fed through ProcessElement() and the mean is read
 // with GetMean(). Member functions are defined out of line.
 // NOTE(review): the data members are protected, which suggests subclasses are intended,
 // yet the class has no virtual destructor; deleting a derived object through a
 // MeanCalculator pointer would be undefined behaviour — confirm no such subclass exists.
 class MeanCalculator {
  public:
   MeanCalculator();
   void ProcessElement(double value);
   double GetMean();
  protected:
   double mean;
   int count;
 };
 // Streaming accumulator exposing mean, variance and standard deviation of the values fed
 // through ProcessElement(). Member functions are defined out of line.
 class VarianceAndMeanCalculator {
  public:
   VarianceAndMeanCalculator();
   void ProcessElement(double value);
   double GetStandardDeviation();
   double GetVariance();
   double GetMean();
  private:
   double mean;
   int count;
   // NOTE(review): presumably the running sum of squared deviations (Welford-style) — confirm.
   double m2;
 };
 // Abstract interface implemented by TensorSummary<T>; lets callers work with a tensor
 // summary without knowing the element type T.
 class ITensorSummary {
  public:
   virtual ~ITensorSummary() = default;
   // Takes the list of watchpoints the summary is computed for.
   virtual void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) = 0;
   // Evaluates one watchpoint; the tuple is (hit, error_code, parameter_list).
   virtual std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
     DebugServices::watchpoint_t) = 0;
 };
 // ITensorSummary implementation for tensors whose elements have type T.
 // NOTE(review): the three-argument constructor presumably receives the current tensor
 // buffer, the previous tensor buffer and the element count — confirm against the .cc file.
 // Every scalar member carries a default initializer so that a default-constructed
 // summary never holds indeterminate pointers, counters or flags.
 template <typename T>
 class TensorSummary : public ITensorSummary {
  public:
   TensorSummary() = default;
   TensorSummary(void *, void *, uint32_t);
   void SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &) override;
   // returns hit, error_code, parameter_list
   // (error_code is spelled int32_t to match the ITensorSummary declaration exactly)
   std::tuple<bool, int32_t, std::vector<DebugServices::parameter_t>> IsWatchpointHit(
     DebugServices::watchpoint_t) override;
  private:
   T *current_tensor_ptr = nullptr;
   T *prev_tensor_ptr = nullptr;
   uint32_t num_elements = 0;
   // Neutral starting points for a running minimum / maximum.
   double min = std::numeric_limits<double>::max();
   double max = std::numeric_limits<double>::lowest();
   uint32_t inf_count = 0;
   uint32_t nan_count = 0;
   uint32_t zero_count = 0;
   double epsilon = 0.0;
   bool mean_sd_cal_enabled = false;
   VarianceAndMeanCalculator current_mean_variance;
   // Helper calculators owned by this summary.
   // NOTE(review): the uint32_t keys are presumably watchpoint ids — confirm.
   std::unordered_map<std::string, std::unique_ptr<MeanCalculator>> means;
   std::unordered_map<uint32_t, std::unique_ptr<AllCloseCalculator>> all_close;
   std::unordered_map<uint32_t, std::unique_ptr<RangeCountCalculator>> range_counts;
   double_t StatLookup(const DebugServices::watchpoint_t &);
   double_t StatLookup(const std::string &, const DebugServices::watchpoint_t &);
   double_t GetZeroValPercent();
   void InitCalculators(const std::vector<DebugServices::watchpoint_t> &);
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_TENSOR_SUMMARY_H
--- a/mindspore/ccsrc/debug/tensor_load.h
+++ b/mindspore/ccsrc/debug/tensor_load.h
@ -56,7 +56,12 @@ class TensorLoader {
  std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
-  std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) { return tensor_list_map[tensor_name + ":prev"]; }
+  std::shared_ptr<TensorData> GetPrevTensor(std::string tensor_name) {
    // Returns the tensor stored under "<tensor_name>:prev", or nullptr when no such entry
    // exists. A single find() is used so the key is built and searched only once, and the
    // lookup never inserts an empty entry into tensor_list_map.
    const auto prev_entry = tensor_list_map.find(tensor_name + ":prev");
    if (prev_entry == tensor_list_map.end()) {
      return nullptr;
    }
    return prev_entry->second;
  }
  std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(std::string node_name) {
    std::vector<std::shared_ptr<TensorData>> tensors;