!13009 [debugger] offline debug feature

From: @islam_amin
Reviewed-by: 
Signed-off-by:
pull/13009/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 83b25e10e9

@ -63,6 +63,16 @@ install(
COMPONENT mindspore
)
# Install the offline debugger extension module on every platform except
# Windows, where only a notice is printed.
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
    install(
        TARGETS _mindspore_offline_debug
        DESTINATION ${INSTALL_BASE_DIR}
        COMPONENT mindspore
    )
else()
    message("offline debugger does not support windows system temporarily")
endif()
install(
TARGETS mindspore_shared_lib
DESTINATION ${INSTALL_LIB_DIR}
@ -317,6 +327,18 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/dataset)
)
endif()
# Ship the offline_debug Python package when it exists in the source tree.
# Windows is excluded and only prints a notice.
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
    if(EXISTS ${CMAKE_SOURCE_DIR}/mindspore/offline_debug)
        install(
            DIRECTORY ${CMAKE_SOURCE_DIR}/mindspore/offline_debug
            DESTINATION ${INSTALL_PY_DIR}
            COMPONENT mindspore
        )
    endif()
else()
    message("offline debugger does not support windows system temporarily")
endif()
## Public header files
install(
DIRECTORY ${CMAKE_SOURCE_DIR}/include

@ -1,3 +1,6 @@
# Add the debug source directory and the top-level build directory to the
# header search path of targets defined in this directory and below.
# NOTE(review): the build directory is presumably needed for generated headers - confirm.
include_directories(${CMAKE_SOURCE_DIR}/mindspore/ccsrc/debug/)
include_directories(${CMAKE_BINARY_DIR})
set(_DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_dump.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/anf_ir_utils.cc"
@ -8,6 +11,14 @@ set(_DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/env_config_parser.cc"
)
# Sources that make up the offline debugger shared library
# (_mindspore_offline_debug). The list is explicit rather than globbed.
# NOTE(review): debug_services.cc may also be compiled into the regular debug
# object library; its per-source COMPILE_DEFINITIONS would then be shared - confirm.
set(_OFFLINE_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/offline_logger.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/dbg_services.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/offline_debug/mi_pybind_register.cc"
)
if(ENABLE_DUMP_IR)
file(GLOB_RECURSE _RDR_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "rdr/*.cc")
if(NOT ENABLE_D)
@ -38,3 +49,13 @@ endif()
# Tag every debug and RDR source with the SM_DEBUG submodule id through a
# per-source compile definition, then compile them into the
# _mindspore_debug_obj object library.
set_property(SOURCE ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
SUBMODULE_ID=mindspore::SubModuleId::SM_DEBUG)
add_library(_mindspore_debug_obj OBJECT ${_DEBUG_SRC_LIST} ${_RDR_SRC_LIST})
# Build the offline debugger as a Python extension module. The target is not
# created on Windows.
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
    # Tag the offline sources with their own submodule id.
    set_property(SOURCE ${_OFFLINE_SRC_LIST} PROPERTY COMPILE_DEFINITIONS
        SUBMODULE_ID=mindspore::SubModuleId::SM_OFFLINE_DEBUG)
    add_library(_mindspore_offline_debug SHARED ${_OFFLINE_SRC_LIST})
    # Keep the offline-mode switch and flags on this target only. A
    # directory-scoped add_compile_options() would also apply to any target
    # created later in this directory or its subdirectories, and
    # OFFLINE_DBG_MODE changes how debug_services.h is compiled.
    target_compile_definitions(_mindspore_offline_debug PRIVATE OFFLINE_DBG_MODE)
    target_compile_options(_mindspore_offline_debug PRIVATE -Wall -O2)
    set_target_properties(_mindspore_offline_debug PROPERTIES
        # Replaces the hand-written -fPIC flag.
        POSITION_INDEPENDENT_CODE ON
        # Name the library like a Python extension module.
        PREFIX "${PYTHON_MODULE_PREFIX}"
        SUFFIX "${PYTHON_MODULE_EXTENSION}"
    )
endif()

File diff suppressed because it is too large Load Diff

@ -16,6 +16,17 @@
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
#define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
// Mode selection: unless the build defines OFFLINE_DBG_MODE, this header is
// compiled in ONLINE_DBG_MODE. The two macros gate the sections further down.
#ifndef OFFLINE_DBG_MODE
#define ONLINE_DBG_MODE
#endif
// Offline-only dependencies: Eigen's half type (aliased as float16) and the
// offline logger.
// NOTE(review): the alias is declared at global scope in a header, so every
// translation unit including this file in offline mode sees `float16`.
// NOTE(review): "Eigen/src/Core/arch/CUDA/Half.h" looks like an Eigen-internal
// path - confirm it exists in the Eigen version this project pins.
#ifdef OFFLINE_DBG_MODE
#include "Eigen/Core"
#include "Eigen/src/Core/arch/CUDA/Half.h"
using float16 = Eigen::half;
#include "debugger/offline_debug/offline_logger.h"
#endif
#include <math.h>
#include <vector>
#include <string>
@ -26,11 +37,13 @@
#include <mutex>
#include <map>
#include <limits>
#include <sstream>
#include "debug/tensor_load.h"
#include "debug/tensor_data.h"
#include "ir/dtype.h"
#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
class DebugServices {
public:
DebugServices();
@ -103,6 +116,8 @@ class DebugServices {
unsigned int id;
condition_t condition;
std::vector<std::tuple<std::string, bool>> check_node_list;
std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
std::vector<parameter_t> parameter_list;
size_t location = 0;
@ -167,30 +182,55 @@ class DebugServices {
}
} watchpoint_t;
void AddWatchpoint(unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list,
const std::vector<parameter_t> &parameter_list);
void AddWatchpoint(
unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr,
const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr);
void RemoveWatchpoint(unsigned int id);
void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id, std::vector<std::vector<parameter_t>> *parameters,
std::vector<int32_t> *error_code, const std::vector<std::string> &op_overflows,
const std::vector<std::shared_ptr<TensorData>> &tensor_list, bool init_dbg_suspend,
const bool step_end, const bool recheck);
std::vector<std::shared_ptr<TensorData>> *tensor_list, bool init_dbg_suspend,
const bool step_end, const bool recheck, std::vector<unsigned int> *device_id = nullptr,
std::vector<unsigned int> *root_graph_id = nullptr);
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size, std::vector<TypePtr> *dtype,
std::vector<std::vector<int64_t>> *shape);
void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck, const std::string &tensor_name,
const std::string &tensor_name_no_slot, bool *previous_iter_tensor_needed,
std::string *qualified_tensor_name, std::vector<watchpoint_t> *watchpoints_to_check);
// Offline-only members: declared only when OFFLINE_DBG_MODE is defined.
// Only the declarations are visible here; the behavioural notes below are
// assumptions to confirm against debug_services.cc.
#ifdef OFFLINE_DBG_MODE
// Reports its result through the slot_list out-parameter.
// Presumably the slots found for dump_name under specific_dump_dir - TODO confirm.
void GetSlotInfo(const std::string &file_name, const std::string &dump_name, const std::string &specific_dump_dir,
std::vector<size_t> *slot_list);
// Returns a std::size_t and fills file_name, type_name, out_dir and shape.
// NOTE(review): the meaning of the returned size is not visible here - confirm.
std::size_t GetShapeTypeInfo(const std::string &specific_dump_dir, std::size_t slot,
const std::string &prefix_dump_file_name, std::string *file_name, std::string *type_name,
std::string *out_dir, std::vector<int64_t> *shape);
// Reports loaded tensors through result_list.
// NOTE(review): all five input vectors are taken by value (copied per call) and
// are presumably index-aligned, one entry per requested tensor - confirm.
void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id, std::vector<std::shared_ptr<TensorData>> *result_list);
// Returns a list of tensors for the given iteration.
// Presumably those required by the registered watchpoints - TODO confirm.
std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration);
// Returns an untyped pointer related to `tensor`.
// NOTE(review): ownership and the null case are not visible here - confirm.
void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed);
#endif
void ReadNodesTensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *shape);
#ifdef ONLINE_DBG_MODE
bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;
bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
#endif
void EmptyTensor();
std::vector<std::shared_ptr<TensorData>> GetTensor() const;
void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);
std::vector<std::shared_ptr<TensorData>> GetNodeTensorMap(const std::string &node_name) const;
uint32_t GetTensorLoaderIterNum() const;
@ -201,31 +241,51 @@ class DebugServices {
void EmptyCurrentTensor();
#ifdef ONLINE_DBG_MODE
bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
TypeId addr_type_id, const std::string &addr_format, size_t slot) const;
#endif
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
void ResetLoadedTensors();
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
#endif
bool TensorExistsInCurrent(std::string tensor_name);
void MoveTensorCurrentToPrev(std::string tensor_name);
void SetNetName(std::string net_name);
std::string GetNetName();
void SetDumpDir(std::string dump_dir);
std::string GetDumpDir();
void SetSyncMode(bool is_sync_mode);
bool GetSyncMode();
private:
std::mutex lock_;
// to keep track of watchpoints that have been checked already for a tensor in current step
std::unordered_map<std::string, std::set<int32_t>> wp_id_cache;
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
std::string net_name;
std::string dump_dir;
bool is_sync_mode;
TensorLoader *tensor_loader_;
};
#ifdef ONLINE_DBG_MODE
} // namespace mindspore
#endif
#endif // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_

@ -755,7 +755,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
std::vector<std::string> ret_name;
std::vector<char *> data_ptr;
std::vector<ssize_t> data_size;
std::vector<TypePtr> dtype;
std::vector<unsigned int> dtype;
std::vector<std::vector<int64_t>> shape;
std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
@ -789,7 +789,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);
tensor_item.set_data_type(GetDebuggerNumberDataType(dtype[result_index]));
tensor_item.set_data_type((debugger::DataType)dtype[result_index]);
for (auto &elem : shape[result_index]) {
tensor_item.add_dims(elem);
}
@ -827,7 +827,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
tensor_list = debug_services_->GetNodeTensor(kernel);
}
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
tensor_list, initial_suspend_, watchnode.empty(), recheck);
&tensor_list, initial_suspend_, watchnode.empty(), recheck);
std::list<WatchpointHit> hits;
for (unsigned int i = 0; i < name.size(); i++) {
WatchpointHit hit;

@ -0,0 +1,28 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169
slot = 0
iteration = 2
device_id = None
root_graph_id = 1
is_parameter = False
tensor_data_1 attributes:
data (printed in uint8) = [149 167 124 ... 158 212 164]
size in bytes = 2076672
debugger dtype = 10
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/ReLUV2-op348
slot = 1
iteration = 2
device_id = None
root_graph_id = 1
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 20 21 18 ... 126 98 25]
size in bytes = 129792
debugger dtype = 6
shape = [32, 12, 13, 13, 2]

@ -0,0 +1,72 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
    """Read two dumped tensors through the offline debugger and print them.

    The dump directory and node names are hard-coded for the dump this script
    was written against.
    """
    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421")
    backend.initialize(net_name="alexnet", is_sync_mode=False)

    requested = [
        # output tensor with zero slot
        d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/"
                               "conv3-Conv2d/Conv2D-op169",
                     slot=0, iteration=2, device_id=0, root_graph_id=1, is_parameter=False),
        # output tensor with non-zero slot
        d.TensorInfo(node_name="Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/"
                               "ReLUV2-op348",
                     slot=1, iteration=2, device_id=0, root_graph_id=1, is_parameter=False),
    ]

    loaded = backend.read_tensors(requested)
    print_read_tensors(requested, loaded)
def print_read_tensors(tensor_info, tensor_data):
    """Print each requested tensor description followed by the data read for it.

    Args:
        tensor_info: sequence of objects exposing node_name, slot, iteration,
            device_id, root_graph_id and is_parameter.
        tensor_data: sequence indexed like tensor_info whose items expose
            data_ptr, data_size, dtype and shape.
    """
    info_fields = (
        ("node name = ", "node_name"),
        ("slot = ", "slot"),
        ("iteration = ", "iteration"),
        ("device_id = ", "device_id"),
        ("root_graph_id = ", "root_graph_id"),
        ("is_parameter = ", "is_parameter"),
    )
    for index, info in enumerate(tensor_info):
        ordinal = str(index + 1)
        print("-----------------------------------------------------------")
        print("tensor_info_" + ordinal + " attributes:")
        for label, attribute in info_fields:
            print(label, getattr(info, attribute))
        print()
        print("tensor_data_" + ordinal + " attributes:")
        data = tensor_data[index]
        # View the raw buffer as bytes; data_size bounds how many are shown.
        raw_view = np.frombuffer(data.data_ptr, np.uint8, data.data_size)
        print("data (printed in uint8) = ", raw_view)
        py_byte_size = len(data.data_ptr)
        c_byte_size = data.data_size
        # Report a disagreement between the Python buffer length and the
        # size reported by the backend.
        if c_byte_size != py_byte_size:
            print("The python byte size of ", py_byte_size,
                  " does not match the C++ byte size of ", c_byte_size)
        print("size in bytes = ", data.data_size)
        print("debugger dtype = ", data.dtype)
        print("shape = ", data.shape)
if __name__ == "__main__":
main()

@ -0,0 +1,14 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-TrainOneStepCell/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op169
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -0.1417236328125
error code = 0
device_id = 0
root_graph_id = 1

@ -0,0 +1,92 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
def main():
    """Exercise add/check/remove watchpoint calls of the offline debugger.

    Each scenario prints an ERROR line when the number of hits is not the one
    the scenario expects; the hits of test 1 are printed in full.
    """
    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/async_sink_true/032421")
    backend.initialize(net_name="alexnet", is_sync_mode=False)

    # NOTES:
    # -> watch_condition=6 is MIN_LT
    # -> watch_condition=18 is CHANGE_TOO_LARGE

    # test 1: watchpoint set and hit (watch_condition=6)
    min_below_zero = d.Parameter(name="param", disabled=False, value=0.0)
    backend.add_watchpoint(
        watchpoint_id=1, watch_condition=6,
        check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/"
                         "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169":
                         {"device_id": [0], "root_graph_id": [1], "is_parameter": False}},
        parameter_list=[min_below_zero])
    hits_test_1 = backend.check_watchpoints(iteration=2)
    if len(hits_test_1) != 1:
        print("ERROR -> test 1: watchpoint set but not hit just once")
    print_watchpoint_hits(hits_test_1, 1)

    # test 2: watchpoint remove and ensure it's not hit
    backend.remove_watchpoint(watchpoint_id=1)
    hits_test_2 = backend.check_watchpoints(iteration=2)
    if hits_test_2:
        print("ERROR -> test 2: watchpoint removed but hit")

    # test 3: watchpoint set and not hit, then remove
    min_below_minus_1000 = d.Parameter(name="param", disabled=False, value=-1000.0)
    backend.add_watchpoint(
        watchpoint_id=2, watch_condition=6,
        check_node_list={"Default/network-TrainOneStepCell/network-WithLossCell/"
                         "_backbone-AlexNet/conv3-Conv2d/Conv2D-op169":
                         {"device_id": [0], "root_graph_id": [1], "is_parameter": False}},
        parameter_list=[min_below_minus_1000])
    hits_test_3 = backend.check_watchpoints(iteration=2)
    if hits_test_3:
        print("ERROR -> test 3: watchpoint set but not supposed to be hit")
    backend.remove_watchpoint(watchpoint_id=2)
def print_watchpoint_hits(watchpoint_hits, test_id):
    """Print every watchpoint hit of one test scenario, one attribute per line.

    Args:
        watchpoint_hits: sequence of hit objects exposing name, slot, condition,
            watchpoint_id, parameters, error_code, device_id and root_graph_id.
        test_id: integer used in the per-hit header line.
    """
    leading_fields = (
        ("name = ", "name"),
        ("slot = ", "slot"),
        ("condition = ", "condition"),
        ("watchpoint_id = ", "watchpoint_id"),
    )
    parameter_fields = (
        (" name = ", "name"),
        (" disabled = ", "disabled"),
        (" value = ", "value"),
        (" hit = ", "hit"),
        (" actual_value = ", "actual_value"),
    )
    trailing_fields = (
        ("error code = ", "error_code"),
        ("device_id = ", "device_id"),
        ("root_graph_id = ", "root_graph_id"),
    )
    for hit in watchpoint_hits:
        print("-----------------------------------------------------------")
        print("watchpoint_hit for test_%u attributes:" % test_id)
        for label, attribute in leading_fields:
            print(label, getattr(hit, attribute))
        for position, parameter in enumerate(hit.parameters):
            for label, attribute in parameter_fields:
                print("parameter ", position, label, getattr(parameter, attribute))
        for label, attribute in trailing_fields:
            print(label, getattr(hit, attribute))
if __name__ == "__main__":
main()

@ -0,0 +1,49 @@
# Run every offline-debugger sample script, strip log noise from its captured
# output and compare the result with the checked-in .expected file.
for test_name in \
    sync_trans_false_read_tensors \
    sync_trans_true_read_tensors \
    sync_trans_false_watchpoints \
    async_sink_mode_true_read_tensors \
    async_sink_mode_true_watchpoints
do
    python "${test_name}.py" > "${test_name}.actual"
    # Drop lines containing [WARNING] or Deprecated before comparing.
    sed -i '/\[WARNING\]/d' "${test_name}.actual"
    sed -i '/Deprecated/d' "${test_name}.actual"
    if diff "${test_name}.actual" "${test_name}.expected"; then
        echo "${test_name} PASSED"
    else
        echo "${test_name} FAILED"
    fi
done

@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [170 19 44 181 254 212 16 52 52 162 148 180 130 115 226 180 183 243
101 52 224 79 189 51 10 70 69 51 199 75 159 52 79 98 104 52
106 77 19 52 129 183 8 180 252 58 48 180 35 219 9 52 240 201
179 51 142 151 158 51 210 145 182 53 140 219 0 53 140 219 22 181
46 33 87 180 238 90 122 180 166 10 38 179 202 195 4 53 166 10
150 51 214 120 209 52 235 115 37 180 92 177 215 180 0 136 84 51
72 114 145 180 43 169 255 180 114 27 61 52 76 225 122 50 126 72
159 51 58 35 202 51 114 61 106 51 60 223 63 52 209 179 1 52
232 217 44 178 130 158 109 179 213 231 10 179 37 40 94 179 208 68
64 53 6 52 249 52 162 35 1 181 231 29 155 52 30 201 69 180
229 131 126 51 18 165 109 180 164 112 163 181 116 172 11 178 6 129
37 52 54 205 203 180 115 104 145 52 232 106 219 179 36 40 214 52
202 50 204 52 76 89 38 179 230 140 232 178 168 53 77 52 180 191
108 51 128 183 64 51 56 137 161 180 247 6 143 180 126 63 197 180
198 177 94 52 140 185 139 51 150 178 228 180 255 67 150 52 134 201
164 52 107 43 14 53 174 216 63 179 40 160 41 53 120 88 72 179
218 172 234 52 234 38 25 52 85 159 155 180 254 67 138 180 34 253
118 180 218 61 17 52 242 133 253 52 175 37 180 52 171 62 163 52
202 195 86 53 160 171 45 52 34 31 176 180 156 85 5 53 178 191
68 180 42 203 140 52 248 117 72 52 248 253 212 176 195 100 202 51
87 14 141 52 91 100 235 51 48 221 136 52 143 117 17 180 51 196
25 52 127 29 112 180 152 144 207 178 219 104 64 52 21 174 251 52
164 78 138 181 20 63 6 52 10 249 96 179 163 146 18 53 200 186
236 52 2 188 85 52 124 140 121 179 246 185 22 181 246 74 249 51
70 182 135 53 189 227 76 52 249 160 159 180 134 235 65 53 64 164
255 51 224 156 41 53 142 117 69 181 247 151 101 53 185 175 35 52
164 112 21 53 30 31 212 179 142 151 110 179 176 148 29 181 206 204
88 53 116 215 214 180 172 173 216 51 106 222 153 180 200 152 19 181
176 3 7 52 215 52 87 52]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [181 167 46 ... 12 204 164]
size in bytes = 2076672
debugger dtype = 10
shape = [32, 12, 13, 13, 16]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [ 50 17 122 ... 94 42 90]
size in bytes = 129792
debugger dtype = 6
shape = [32, 12, 13, 13, 2]

@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main():
    """Read a parameter and two output tensors through the offline debugger and print them.

    The dump directory and node names are hard-coded for the dump this script
    was written against.
    """
    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet")
    backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    requested = [
        # parameter
        d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
                     slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True),
        # output tensor with zero slot
        d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168",
                     slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False),
        # output tensor with non-zero slot
        d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op346",
                     slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False),
    ]

    loaded = backend.read_tensors(requested)
    print_read_tensors(requested, loaded)
def print_read_tensors(tensor_info, tensor_data):
    """Print each requested tensor description followed by the data read for it.

    Args:
        tensor_info: sequence of objects exposing node_name, slot, iteration,
            device_id, root_graph_id and is_parameter.
        tensor_data: sequence indexed like tensor_info whose items expose
            data_ptr, data_size, dtype and shape.
    """
    info_fields = (
        ("node name = ", "node_name"),
        ("slot = ", "slot"),
        ("iteration = ", "iteration"),
        ("device_id = ", "device_id"),
        ("root_graph_id = ", "root_graph_id"),
        ("is_parameter = ", "is_parameter"),
    )
    for index, info in enumerate(tensor_info):
        ordinal = str(index + 1)
        print("-----------------------------------------------------------")
        print("tensor_info_" + ordinal + " attributes:")
        for label, attribute in info_fields:
            print(label, getattr(info, attribute))
        print()
        print("tensor_data_" + ordinal + " attributes:")
        data = tensor_data[index]
        # View the raw buffer as bytes; data_size bounds how many are shown.
        raw_view = np.frombuffer(data.data_ptr, np.uint8, data.data_size)
        print("data (printed in uint8) = ", raw_view)
        py_byte_size = len(data.data_ptr)
        c_byte_size = data.data_size
        # Report a disagreement between the Python buffer length and the
        # size reported by the backend.
        if c_byte_size != py_byte_size:
            print("The python byte size of ", py_byte_size,
                  " does not match the C++ byte size of ", c_byte_size)
        print("size in bytes = ", data.data_size)
        print("debugger dtype = ", data.dtype)
        print("shape = ", data.shape)
if __name__ == "__main__":
main()

@ -0,0 +1,33 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op168
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -0.14013671875
error code = 0
device_id = 0
root_graph_id = 0
-----------------------------------------------------------
watchpoint_hit for test_4 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias
slot = 0
condition = 18
watchpoint_id = 3
parameter 0 name = abs_mean_update_ratio_gt
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = 0.5243796973599475
parameter 1 name = epsilon
parameter 1 disabled = True
parameter 1 value = 0.0
parameter 1 hit = False
parameter 1 actual_value = 0.0
error code = 0
device_id = 0
root_graph_id = 0

@ -0,0 +1,109 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
def main():
    """Exercise add/check/remove watchpoint calls of the offline debugger.

    Each scenario prints an ERROR line when the number of hits is not the one
    the scenario expects; the hits of tests 1 and 4 are printed in full.
    """
    backend = d.DbgServices(
        dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_false/032421/alexnet")
    backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    # NOTES:
    # -> watch_condition=6 is MIN_LT
    # -> watch_condition=18 is CHANGE_TOO_LARGE

    # test 1: watchpoint set and hit (watch_condition=6)
    min_below_zero = d.Parameter(name="param", disabled=False, value=0.0)
    backend.add_watchpoint(
        watchpoint_id=1, watch_condition=6,
        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                         "Conv2D-op168":
                         {"device_id": [0], "root_graph_id": [0], "is_parameter": False}},
        parameter_list=[min_below_zero])
    hits_test_1 = backend.check_watchpoints(iteration=2)
    if len(hits_test_1) != 1:
        print("ERROR -> test 1: watchpoint set but not hit just once")
    print_watchpoint_hits(hits_test_1, 1)

    # test 2: watchpoint remove and ensure it's not hit
    backend.remove_watchpoint(watchpoint_id=1)
    hits_test_2 = backend.check_watchpoints(iteration=2)
    if hits_test_2:
        print("ERROR -> test 2: watchpoint removed but hit")

    # test 3: watchpoint set and not hit, then remove
    # NOTE(review): this scenario watches Conv2D-op308 while test 1 watches
    # Conv2D-op168; if op308 is absent from the dump, the "not hit" check
    # passes trivially - confirm the node name is intended.
    min_below_minus_1000 = d.Parameter(name="param", disabled=False, value=-1000.0)
    backend.add_watchpoint(
        watchpoint_id=2, watch_condition=6,
        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                         "Conv2D-op308":
                         {"device_id": [0], "root_graph_id": [0], "is_parameter": False}},
        parameter_list=[min_below_minus_1000])
    hits_test_3 = backend.check_watchpoints(iteration=2)
    if hits_test_3:
        print("ERROR -> test 3: watchpoint set but not supposed to be hit")
    backend.remove_watchpoint(watchpoint_id=2)

    # test 4: weight change watchpoint set and hit
    param_abs_mean_update_ratio_gt = d.Parameter(
        name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
    param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
    backend.add_watchpoint(
        watchpoint_id=3, watch_condition=18,
        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                         "Parameter[6]_11/fc3.bias":
                         {"device_id": [0], "root_graph_id": [0], "is_parameter": True}},
        parameter_list=[param_abs_mean_update_ratio_gt, param_epsilon])
    hits_test_4 = backend.check_watchpoints(iteration=3)
    if len(hits_test_4) != 1:
        print("ERROR -> test 4: watchpoint weight change set but not hit just once")
    print_watchpoint_hits(hits_test_4, 4)
def print_watchpoint_hits(watchpoint_hits, test_id):
    """Print every watchpoint hit of one test scenario, one attribute per line.

    Args:
        watchpoint_hits: sequence of hit objects exposing name, slot, condition,
            watchpoint_id, parameters, error_code, device_id and root_graph_id.
        test_id: integer used in the per-hit header line.
    """
    leading_fields = (
        ("name = ", "name"),
        ("slot = ", "slot"),
        ("condition = ", "condition"),
        ("watchpoint_id = ", "watchpoint_id"),
    )
    parameter_fields = (
        (" name = ", "name"),
        (" disabled = ", "disabled"),
        (" value = ", "value"),
        (" hit = ", "hit"),
        (" actual_value = ", "actual_value"),
    )
    trailing_fields = (
        ("error code = ", "error_code"),
        ("device_id = ", "device_id"),
        ("root_graph_id = ", "root_graph_id"),
    )
    for hit in watchpoint_hits:
        print("-----------------------------------------------------------")
        print("watchpoint_hit for test_%u attributes:" % test_id)
        for label, attribute in leading_fields:
            print(label, getattr(hit, attribute))
        for position, parameter in enumerate(hit.parameters):
            for label, attribute in parameter_fields:
                print("parameter ", position, label, getattr(parameter, attribute))
        for label, attribute in trailing_fields:
            print(label, getattr(hit, attribute))
if __name__ == "__main__":
main()

@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [230 208 10 52 104 34 252 52 4 231 144 52 188 150 64 180 88 236
15 180 254 135 180 51 131 226 147 52 88 202 62 53 2 43 55 53
231 29 87 180 220 249 30 180 157 17 177 180 81 107 140 181 8 95
192 180 89 134 112 180 96 238 90 178 156 196 212 180 206 25 15 181
212 154 6 180 91 211 116 52 191 14 140 51 128 106 124 53 28 158
70 181 182 21 251 50 100 204 157 179 88 202 42 180 7 95 8 53
128 251 238 52 241 133 241 52 111 86 157 179 48 221 148 180 200 7
141 180 236 226 182 51 190 82 158 180 140 108 179 180 195 134 215 179
103 213 39 179 89 168 149 180 42 58 58 180 64 53 62 179 250 126
158 52 38 83 117 52 0 0 136 180 136 133 122 51 110 18 131 179
238 13 94 51 102 136 15 181 134 90 227 180 16 11 117 180 35 74
163 52 105 0 87 181 112 18 131 50 226 233 67 181 217 172 10 52
206 25 217 52 208 213 22 52 146 203 87 180 74 46 207 52 178 191
4 180 100 93 216 52 119 190 171 180 223 2 5 181 128 72 207 179
58 146 11 179 224 79 137 52 143 228 154 180 246 219 215 179 14 79
195 52 126 29 64 52 132 192 42 51 94 220 86 52 94 109 1 181
72 37 117 178 110 197 94 180 160 94 153 179 118 224 80 181 156 17
37 50 120 156 162 53 26 115 135 180 228 20 29 53 145 126 147 52
99 16 48 180 211 188 199 180 52 51 99 180 93 254 227 52 152 126
123 49 6 18 16 181 5 163 130 51 27 158 98 53 134 235 189 52
119 45 9 180 130 115 110 52 158 128 162 52 232 251 197 180 178 46
158 179 57 214 157 52 172 207 161 180 208 0 222 49 242 99 32 53
20 174 135 50 247 117 176 52 194 57 43 180 140 108 135 51 243 65
175 51 187 73 156 51 63 232 217 50 180 234 115 52 194 168 148 52
27 192 183 180 45 178 157 52 125 208 17 53 236 192 65 53 190 193
7 53 254 246 57 53 3 43 199 51 64 164 215 180 220 104 240 51
23 72 24 180 68 173 9 51 72 114 29 53 105 0 57 181 188 150
8 53 229 97 131 53 0 34 189 51 163 146 74 53 31 244 204 51
86 193 220 180 156 51 146 179]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 99 26 69 ... 154 218 164]
size in bytes = 2076672
debugger dtype = 10
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [19 17 27 ... 94 42 90]
size in bytes = 129792
debugger dtype = 6
shape = [32, 12, 13, 13, 2]

@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main(dump_file_path="/opt/nvme2n1/j00455527/dumps/sync_trans_true/032421/alexnet"):
    """Read three tensors from a dump directory and print them.

    Args:
        dump_file_path: dump directory handed to ``DbgServices``. The default
            is the location this script was originally written against; pass
            a different directory to run the scenario on another machine.
    """
    debugger_backend = d.DbgServices(dump_file_path=dump_file_path)

    _ = debugger_backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    # parameter
    info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
    # output tensor with zero slot
    info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op171",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
    # output tensor with non-zero slot
    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op353",
                         slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)

    tensor_info = [info1, info2, info3]

    tensor_data = debugger_backend.read_tensors(tensor_info)

    print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
    """Print each requested tensor's metadata followed by its raw bytes.

    Args:
        tensor_info: iterable of request objects exposing ``node_name``,
            ``slot``, ``iteration``, ``device_id``, ``root_graph_id`` and
            ``is_parameter``.
        tensor_data: sequence, parallel to ``tensor_info``, of result objects
            exposing ``data_ptr`` (a bytes-like buffer), ``data_size``,
            ``dtype`` and ``shape``.
    """
    for x, info in enumerate(tensor_info):
        data = tensor_data[x]
        print("-----------------------------------------------------------")
        print("tensor_info_" + str(x+1) + " attributes:")
        print("node name = ", info.node_name)
        print("slot = ", info.slot)
        print("iteration = ", info.iteration)
        print("device_id = ", info.device_id)
        print("root_graph_id = ", info.root_graph_id)
        print("is_parameter = ", info.is_parameter)
        print()
        print("tensor_data_" + str(x+1) + " attributes:")
        py_byte_size = len(data.data_ptr)
        c_byte_size = data.data_size
        # Never ask numpy for more bytes than the buffer holds: frombuffer()
        # raises ValueError in that case, which used to abort the script
        # before the size-mismatch diagnostic below could be printed.
        readable = min(py_byte_size, c_byte_size)
        if readable:
            values = np.frombuffer(data.data_ptr, np.uint8, readable)
        else:
            values = np.empty(0, np.uint8)
        print("data (printed in uint8) = ", values)
        if c_byte_size != py_byte_size:
            print("The python byte size of ", py_byte_size,
                  " does not match the C++ byte size of ", c_byte_size)
        print("size in bytes = ", data.data_size)
        print("debugger dtype = ", data.dtype)
        print("shape = ", data.shape)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
main()

@ -0,0 +1,149 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef DEBUG_DBG_SERVICES_H_
#define DEBUG_DBG_SERVICES_H_
#include <vector>
#include <string>
#include <map>
#include <memory>
#include <tuple>
#include <iostream>
#include <variant>
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "pybind11/stl_bind.h"
#include "debug/debug_services.h"
namespace py = pybind11;
// Plain value type describing one watchpoint parameter: its name, whether it
// is disabled, the configured value, and the hit flag / actual value that are
// carried alongside it.
typedef struct parameter {
  parameter(const std::string &name, bool disabled, double value, bool hit, double actual_value)
      : name(name), disabled(disabled), value(value), hit(hit), actual_value(actual_value) {}

  // Getters return plain values. A top-level `const` on a by-value return is
  // ignored for scalars and prevents callers from moving out of a returned
  // std::string, so it is intentionally omitted.
  std::string get_name() const { return name; }
  bool get_disabled() const { return disabled; }
  double get_value() const { return value; }
  bool get_hit() const { return hit; }
  double get_actual_value() const { return actual_value; }

  std::string name;
  bool disabled;
  double value;
  bool hit;
  double actual_value;
} parameter_t;
// Plain value type describing one watchpoint hit: the node name and slot it
// refers to, the condition and watchpoint id, the list of parameters, an
// error code, and the device / root graph ids.
typedef struct watchpoint_hit {
  watchpoint_hit(const std::string &name, uint32_t slot, int condition, uint32_t watchpoint_id,
                 const std::vector<parameter_t> &parameters, int32_t error_code, uint32_t device_id,
                 uint32_t root_graph_id)
      : name(name),
        slot(slot),
        condition(condition),
        watchpoint_id(watchpoint_id),
        parameters(parameters),
        error_code(error_code),
        device_id(device_id),
        root_graph_id(root_graph_id) {}

  // Getters return plain values. A top-level `const` on a by-value return is
  // ignored for scalars and blocks move semantics for the string and vector
  // results, so it is intentionally omitted.
  std::string get_name() const { return name; }
  uint32_t get_slot() const { return slot; }
  int get_condition() const { return condition; }
  uint32_t get_watchpoint_id() const { return watchpoint_id; }
  std::vector<parameter_t> get_parameters() const { return parameters; }
  int32_t get_error_code() const { return error_code; }
  uint32_t get_device_id() const { return device_id; }
  uint32_t get_root_graph_id() const { return root_graph_id; }

  std::string name;
  uint32_t slot;
  int condition;
  uint32_t watchpoint_id;
  std::vector<parameter_t> parameters;
  int32_t error_code;
  uint32_t device_id;
  uint32_t root_graph_id;
} watchpoint_hit_t;
// Plain value type identifying one tensor to read: node name, output slot,
// iteration, device id, root graph id, and whether the node is a parameter.
typedef struct tensor_info {
  tensor_info(const std::string &node_name, uint32_t slot, uint32_t iteration, uint32_t device_id,
              uint32_t root_graph_id, bool is_parameter)
      : node_name(node_name),
        slot(slot),
        iteration(iteration),
        device_id(device_id),
        root_graph_id(root_graph_id),
        is_parameter(is_parameter) {}

  // Getters return plain values. A top-level `const` on a by-value return is
  // ignored for scalars and prevents callers from moving out of a returned
  // std::string, so it is intentionally omitted.
  std::string get_node_name() const { return node_name; }
  uint32_t get_slot() const { return slot; }
  uint32_t get_iteration() const { return iteration; }
  uint32_t get_device_id() const { return device_id; }
  uint32_t get_root_graph_id() const { return root_graph_id; }
  bool get_is_parameter() const { return is_parameter; }

  std::string node_name;
  uint32_t slot;
  uint32_t iteration;
  uint32_t device_id;
  uint32_t root_graph_id;
  bool is_parameter;
} tensor_info_t;
// Result of reading one tensor: the raw bytes (copied into a Python bytes
// object), the byte count reported by the caller, the dtype code and shape.
typedef struct tensor_data {
  // Copies `data_size` bytes starting at `data_ptr` into a py::bytes object.
  // When `data_ptr` is null an empty bytes object is stored instead; note
  // that `data_size` keeps the value passed in, so the two can disagree and
  // consumers should compare them before trusting `data_size`.
  tensor_data(char *data_ptr, uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
      : data_size(data_size), dtype(dtype), shape(shape) {
    if (data_ptr != nullptr) {
      this->data_ptr = py::bytes(data_ptr, data_size);
    } else {
      this->data_ptr = py::bytes();
    }
  }

  // Getters return plain values (or a const reference for the shape). A
  // top-level `const` on a by-value return is ignored for scalars and blocks
  // moves for class types, so it is intentionally omitted.
  py::bytes get_data_ptr() const { return data_ptr; }
  uint64_t get_data_size() const { return data_size; }
  int get_dtype() const { return dtype; }
  const std::vector<int64_t> &get_shape() const { return shape; }

  py::bytes data_ptr;
  uint64_t data_size;
  int dtype;
  std::vector<int64_t> shape;
} tensor_data_t;
// Entry-point class of the offline debugger: declares initialization,
// watchpoint management, watchpoint checking and tensor reading on top of a
// DebugServices object. Only declarations are visible in this header; the
// behavior lives in the implementation file.
class DbgServices {
private:
// Underlying DebugServices instance.
// NOTE(review): the user-declared copy constructor, copy assignment and
// destructor suggest this pointer is owned by the object -- confirm in the
// implementation file.
DebugServices *debug_services;
public:
// `verbose` defaults to false; its effect is defined in the implementation.
explicit DbgServices(bool verbose = false);
DbgServices(const DbgServices &other);
DbgServices &operator=(const DbgServices &other);
~DbgServices();
// Takes the network name, the dump folder path and the sync-mode flag;
// returns an int32_t status code (meaning not visible in this header).
int32_t Initialize(std::string net_name, std::string dump_folder_path, bool is_sync_mode);
// Registers a watchpoint under `id` with the given condition code.
// `check_nodes` maps a node name to a map of attribute name -> either a bool
// or a list of strings; `parameter_list` carries the condition parameters.
// Returns an int32_t status code (meaning not visible in this header).
int32_t AddWatchpoint(
unsigned int id, unsigned int watch_condition,
std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
std::vector<parameter_t> parameter_list);
// Removes the watchpoint registered under `id`; returns an int32_t status code.
int32_t RemoveWatchpoint(unsigned int id);
// Returns the watchpoint hits for the given iteration.
std::vector<watchpoint_hit_t> CheckWatchpoints(unsigned int iteration);
// Returns tensor data for the requested tensors.
// NOTE(review): presumably one result per entry of `info`, in order -- confirm.
std::vector<tensor_data_t> ReadTensors(std::vector<tensor_info_t> info);
// Returns a version string.
std::string GetVersion();
};
#endif // DEBUG_DBG_SERVICES_H_

@ -0,0 +1,24 @@
# Run each offline-debugger test script, capture its stdout in <name>.actual
# and compare it with the checked-in <name>.expected. diff's own output is
# left on stdout so mismatches are visible, followed by a PASSED/FAILED line.
for test_name in \
    sync_trans_false_read_tensors \
    sync_trans_true_read_tensors \
    sync_trans_false_watchpoints; do
    python "${test_name}.py" > "${test_name}.actual"
    # Test diff's exit status directly rather than inspecting $? afterwards.
    if diff "${test_name}.actual" "${test_name}.expected"; then
        echo "${test_name} PASSED"
    else
        echo "${test_name} FAILED"
    fi
done

@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [ 0 0 0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 58 196 248
194 127 0 0 17 0 0 0 0 0 0 0 160 76 6 140 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
64 195 195 248 194 127 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 88 1 196 248 194 127 0 0 18 0 0 0
0 0 0 0 160 47 6 140 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 176 203 195 248 194 127 0 0
176 204 195 248 194 127 0 0 0 0 0 0 0 0 0 0 216 241
195 248 194 127 0 0 19 0 0 0 0 0 0 0 96 39 6 140
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 112 52 196 248 194 127 0 0 176 52 196 248 194 127 0 0
0 0 0 0 0 0 0 0 88 250 195 248 194 127 0 0 20 0
0 0 0 0 0 0 128 130 5 140 195 127 0 0 69 0 0 0
0 0 0 0 0 0 0 0 195 127 0 0 208 136 195 248 194 127
0 0 176 202 195 248 194 127 0 0 48 52 196 248 194 127 0 0
184 247 195 248 194 127 0 0 21 0 0 0 0 0 0 0 176 213
4 140 195 127 0 0 69 0 0 0 0 0 0 0 0 0 0 0
195 127 0 0 48 52 196 248 194 127 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 8 249 195 248 194 127 0 0
22 0 0 0 0 0 0 0 16 46 4 140 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 64 137 195 248
194 127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 88 12 196 248 194 127 0 0 23 0 0 0 0 0 0 0
32 137 3 140 195 127 0 0 85 0 0 0 0 0 0 0 0 0
0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 104 246 195 248 194 127
0 0 24 0 0 0 0 0 0 0 48 104 15 140 195 127 0 0
32 104 15 140 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [ 0 169 0 ... 152 242 63]
size in bytes = 4153344
debugger dtype = 11
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [ 0 169 0 ... 217 4 52]
size in bytes = 831744
debugger dtype = 8
shape = [207936]

@ -0,0 +1,74 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
import numpy as np
def main(dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet"):
    """Read three tensors from a dump directory and print them.

    Args:
        dump_file_path: dump directory handed to ``DbgServices``. The default
            is the location this script was originally written against; pass
            a different directory to run the scenario on another machine.
    """
    debugger_backend = d.DbgServices(dump_file_path=dump_file_path)

    _ = debugger_backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    # parameter
    info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=True)
    # output tensor with zero slot
    info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308",
                         slot=0, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)
    # output tensor with non-zero slot
    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300",
                         slot=1, iteration=2, device_id=0, root_graph_id=0, is_parameter=False)

    tensor_info = [info1, info2, info3]

    tensor_data = debugger_backend.read_tensors(tensor_info)

    print_read_tensors(tensor_info, tensor_data)
def print_read_tensors(tensor_info, tensor_data):
    """Print each requested tensor's metadata followed by its raw bytes.

    Args:
        tensor_info: iterable of request objects exposing ``node_name``,
            ``slot``, ``iteration``, ``device_id``, ``root_graph_id`` and
            ``is_parameter``.
        tensor_data: sequence, parallel to ``tensor_info``, of result objects
            exposing ``data_ptr`` (a bytes-like buffer), ``data_size``,
            ``dtype`` and ``shape``.
    """
    for x, info in enumerate(tensor_info):
        data = tensor_data[x]
        print("-----------------------------------------------------------")
        print("tensor_info_" + str(x+1) + " attributes:")
        print("node name = ", info.node_name)
        print("slot = ", info.slot)
        print("iteration = ", info.iteration)
        print("device_id = ", info.device_id)
        print("root_graph_id = ", info.root_graph_id)
        print("is_parameter = ", info.is_parameter)
        print()
        print("tensor_data_" + str(x+1) + " attributes:")
        py_byte_size = len(data.data_ptr)
        c_byte_size = data.data_size
        # Never ask numpy for more bytes than the buffer holds: frombuffer()
        # raises ValueError in that case, which used to abort the script
        # before the size-mismatch diagnostic below could be printed.
        readable = min(py_byte_size, c_byte_size)
        if readable:
            values = np.frombuffer(data.data_ptr, np.uint8, readable)
        else:
            values = np.empty(0, np.uint8)
        print("data (printed in uint8) = ", values)
        if c_byte_size != py_byte_size:
            print("The python byte size of ", py_byte_size,
                  " does not match the C++ byte size of ", c_byte_size)
        print("size in bytes = ", data.data_size)
        print("debugger dtype = ", data.dtype)
        print("shape = ", data.shape)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
main()

@ -0,0 +1,33 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -2.429065704345703
error code = 0
device_id = 0
root_graph_id = 0
-----------------------------------------------------------
watchpoint_hit for test_4 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias
slot = 0
condition = 18
watchpoint_id = 3
parameter 0 name = abs_mean_update_ratio_gt
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = 1.793662034335766e-35
parameter 1 name = epsilon
parameter 1 disabled = True
parameter 1 value = 0.0
parameter 1 hit = False
parameter 1 actual_value = 0.0
error code = 0
device_id = 0
root_graph_id = 0

@ -0,0 +1,109 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""
import mindspore.offline_debug.dbg_services as d
def main(dump_file_path="/home/jtzanaka/dumps/sync_trans_false/032421/alexnet"):
    """Exercise adding, checking and removing watchpoints on a dump directory.

    Runs four scenarios in order and prints an ``ERROR -> test N`` line when a
    scenario does not produce the expected number of hits.

    Args:
        dump_file_path: dump directory handed to ``DbgServices``. The default
            is the location this script was originally written against; pass
            a different directory to run the scenario on another machine.
    """
    debugger_backend = d.DbgServices(dump_file_path=dump_file_path)

    _ = debugger_backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    # NOTES:
    # -> watch_condition=6 is MIN_LT
    # -> watch_condition=18 is CHANGE_TOO_LARGE

    # test 1: watchpoint set and hit (watch_condition=6)
    param1 = d.Parameter(name="param", disabled=False, value=0.0)
    _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                                                         "Conv2D-op308":
                                                         {"device_id": [0], "root_graph_id": [0], "is_parameter": False
                                                          }}, parameter_list=[param1])

    watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
    if len(watchpoint_hits_test_1) != 1:
        print("ERROR -> test 1: watchpoint set but not hit just once")
    print_watchpoint_hits(watchpoint_hits_test_1, 1)

    # test 2: watchpoint remove and ensure it's not hit
    _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
    watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2)
    if watchpoint_hits_test_2:
        print("ERROR -> test 2: watchpoint removed but hit")

    # test 3: watchpoint set and not hit, then remove
    param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
    _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                                                         "Conv2D-op308":
                                                         {"device_id": [0], "root_graph_id": [0], "is_parameter": False
                                                          }}, parameter_list=[param2])

    watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
    if watchpoint_hits_test_3:
        print("ERROR -> test 3: watchpoint set but not supposed to be hit")
    _ = debugger_backend.remove_watchpoint(watchpoint_id=2)

    # test 4: weight change watchpoint set and hit
    param_abs_mean_update_ratio_gt = d.Parameter(
        name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
    param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
    _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
                                        check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                                                         "Parameter[6]_11/fc3.bias":
                                                         {"device_id": [0], "root_graph_id": [0], "is_parameter": True
                                                          }}, parameter_list=[param_abs_mean_update_ratio_gt,
                                                                              param_epsilon])

    watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3)
    if len(watchpoint_hits_test_4) != 1:
        print("ERROR -> test 4: watchpoint weight change set but not hit just once")
    print_watchpoint_hits(watchpoint_hits_test_4, 4)
def print_watchpoint_hits(watchpoint_hits, test_id):
    """Print every watchpoint hit, one attribute per line.

    The labels and separators are reproduced verbatim because the printed
    text is presumably compared against a stored expected-output file.

    Args:
        watchpoint_hits: iterable of hit objects exposing ``name``, ``slot``,
            ``condition``, ``watchpoint_id``, ``parameters``, ``error_code``,
            ``device_id`` and ``root_graph_id``. Each entry of ``parameters``
            exposes ``name``, ``disabled``, ``value``, ``hit`` and
            ``actual_value``.
        test_id: test number interpolated into the header line.
    """
    # Iterate over the hits directly instead of indexing by position, so any
    # iterable is accepted and each element is looked up exactly once.
    for hit in watchpoint_hits:
        print("-----------------------------------------------------------")
        print("watchpoint_hit for test_%u attributes:" % test_id)
        print("name = ", hit.name)
        print("slot = ", hit.slot)
        print("condition = ", hit.condition)
        print("watchpoint_id = ", hit.watchpoint_id)
        # The positional index is still printed, so keep enumerate() here.
        for p, param in enumerate(hit.parameters):
            print("parameter ", p, " name = ", param.name)
            print("parameter ", p, " disabled = ", param.disabled)
            print("parameter ", p, " value = ", param.value)
            print("parameter ", p, " hit = ", param.hit)
            print("parameter ", p, " actual_value = ", param.actual_value)
        print("error code = ", hit.error_code)
        print("device_id = ", hit.device_id)
        print("root_graph_id = ", hit.root_graph_id)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
main()

@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True
tensor_data_1 attributes:
data (printed in uint8) = [ 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 186 117 65
195 127 0 0 5 0 0 0 0 0 0 0 160 76 6 204 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
48 135 117 65 195 127 0 0 16 58 118 65 195 127 0 0 144 58
118 65 195 127 0 0 168 186 117 65 195 127 0 0 6 0 0 0
0 0 0 0 160 47 6 204 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 184 249
117 65 195 127 0 0 7 0 0 0 0 0 0 0 96 39 6 204
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 224 218 117 65 195 127 0 0 0 0 0 0 0 0 0 0
224 219 117 65 195 127 0 0 200 17 118 65 195 127 0 0 8 0
0 0 0 0 0 0 128 130 5 204 195 127 0 0 69 0 0 0
0 0 0 0 1 0 0 0 195 127 0 0 120 233 255 59 196 127
0 0 224 217 117 65 195 127 0 0 224 214 117 65 195 127 0 0
120 250 117 65 195 127 0 0 9 0 0 0 0 0 0 0 176 213
4 204 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0
195 127 0 0 240 66 118 65 195 127 0 0 160 218 117 65 195 127
0 0 224 215 117 65 195 127 0 0 40 9 118 65 195 127 0 0
10 0 0 0 0 0 0 0 16 46 4 204 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 208 59 118 65
195 127 0 0 0 0 0 0 0 0 0 0 96 218 117 65 195 127
0 0 56 251 117 65 195 127 0 0 11 0 0 0 0 0 0 0
32 137 3 204 195 127 0 0 85 0 0 0 0 0 0 0 1 0
0 0 195 127 0 0 224 214 117 65 195 127 0 0 144 59 118 65
195 127 0 0 160 214 117 65 195 127 0 0 136 62 118 65 195 127
0 0 12 0 0 0 0 0 0 0 48 104 15 204 195 127 0 0
32 104 15 204 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_2 attributes:
data (printed in uint8) = [206 239 74 ... 53 201 62]
size in bytes = 4153344
debugger dtype = 11
shape = [32, 192, 13, 13]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False
tensor_data_3 attributes:
data (printed in uint8) = [206 239 74 ... 16 239 51]
size in bytes = 831744
debugger dtype = 8
shape = [207936]

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save