Implementation for mindspore debugger

pull/2140/head
Shida He 5 years ago
parent 73f440a54d
commit 4c056855e0

@ -12,7 +12,7 @@ if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Werror -Wno-return-std-move -Wno-unused-private-field -Wno-unused-lambda-capture -Wno-sign-compare -Wno-overloaded-virtual -Wno-unneeded-internal-declaration -Wno-unused-variable -Wno-pessimizing-move -Wno-inconsistent-missing-override -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
else()
set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O2 -Wl,--allow-shlib-undefined -DHALF_ENABLE_CPP11_USER_LITERALS=0 -D_FORTIFY_SOURCE=2")
endif()

@ -25,7 +25,7 @@ usage()
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K]"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off]"
echo ""
echo "Options:"
echo " -d Debug mode"
@ -54,6 +54,7 @@ usage()
echo " -I Compile predict, default off"
echo " -K Compile with AKG, default off"
echo " -s Enable serving module, default off"
echo " -B Enable debugger, default off"
}
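# Example (illustrative): a GPU build with the debugger enabled, leaving all
# other options at their defaults:
#   bash build.sh -e gpu -B on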
# check value of input is 'on' or 'off'
@ -94,8 +95,10 @@ checkopts()
PREDICT_PLATFORM=""
ENABLE_AKG="on"
ENABLE_SERVING="off"
ENABLE_DEBUGGER="off"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:s' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:I:LRP:Q:D:zM:V:K:sB:' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@ -240,6 +243,11 @@ checkopts()
ENABLE_SERVING="on"
echo "enable serving"
;;
B)
check_on_off $OPTARG B
ENABLE_DEBUGGER="on"
echo "enable debugger"
;;
*)
echo "Unknown option ${opt}!"
usage
@ -322,6 +330,9 @@ build_mindspore()
if [[ "X$ENABLE_SERVING" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_SERVING=ON"
fi
if [[ "X$ENABLE_DEBUGGER" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DEBUGGER=ON"
fi
echo "${CMAKE_ARGS}"
if [[ "X$INC_BUILD" = "Xoff" ]]; then

@ -0,0 +1,14 @@
mindspore_add_pkg(absl
VER 20200225.2
LIBS absl_strings absl_throw_delegate absl_raw_logging_internal absl_int128 absl_bad_optional_access
URL https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz
MD5 73f2b6e72f1599a9139170c29482ddc4
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=TRUE)
include_directories(${absl_INC})
add_library(mindspore::absl_strings ALIAS absl::absl_strings)
add_library(mindspore::absl_throw_delegate ALIAS absl::absl_throw_delegate)
add_library(mindspore::absl_raw_logging_internal ALIAS absl::absl_raw_logging_internal)
add_library(mindspore::absl_int128 ALIAS absl::absl_int128)
add_library(mindspore::absl_bad_optional_access ALIAS absl::absl_bad_optional_access)

@ -0,0 +1,12 @@
mindspore_add_pkg(c-ares
VER 1.15.0
LIBS cares
URL https://github.com/c-ares/c-ares/releases/download/cares-1_15_0/c-ares-1.15.0.tar.gz
MD5 d2391da274653f7643270623e822dff7
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
-DCARES_SHARED:BOOL=OFF
-DCARES_STATIC:BOOL=ON
-DCARES_STATIC_PIC:BOOL=ON)
include_directories(${c-ares_INC})
add_library(mindspore::cares ALIAS c-ares::cares)

@ -0,0 +1,110 @@
set(grpc_USE_STATIC_LIBS ON)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(grpc_CXXFLAGS "-fstack-protector-all -Wno-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
else()
set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
endif()
set(grpc_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
if (EXISTS ${protobuf_ROOT}/lib64)
set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib64/cmake/protobuf")
else()
set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib/cmake/protobuf")
endif()
message("grpc using Protobuf_DIR : " ${_FINDPACKAGE_PROTOBUF_CONFIG_DIR})
if (EXISTS ${absl_ROOT}/lib64)
set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib64/cmake/absl")
else()
set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib/cmake/absl")
endif()
message("grpc using absl_DIR : " ${_FINDPACKAGE_ABSL_CONFIG_DIR})
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "")
if (OPENSSL_ROOT_DIR)
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}")
endif()
mindspore_add_pkg(grpc
VER 1.27.3
LIBS grpc++ grpc gpr upb address_sorting
EXE grpc_cpp_plugin
URL https://github.com/grpc/grpc/archive/v1.27.3.tar.gz
MD5 0c6c3fc8682d4262dd0e5e6fabe1a7e2
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
-DgRPC_INSTALL:BOOL=ON
-DgRPC_BUILD_TESTS:BOOL=OFF
-DgRPC_PROTOBUF_PROVIDER:STRING=package
-DgRPC_PROTOBUF_PACKAGE_TYPE:STRING=CONFIG
-DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
-DgRPC_ZLIB_PROVIDER:STRING=package
-DZLIB_ROOT:PATH=${zlib_ROOT}
-DgRPC_ABSL_PROVIDER:STRING=package
-Dabsl_DIR:PATH=${_FINDPACKAGE_ABSL_CONFIG_DIR}
-DgRPC_CARES_PROVIDER:STRING=package
-Dc-ares_DIR:PATH=${c-ares_ROOT}/lib/cmake/c-ares
-DgRPC_SSL_PROVIDER:STRING=package
${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
)
include_directories(${grpc_INC})
add_library(mindspore::grpc++ ALIAS grpc::grpc++)
# link other grpc libs
target_link_libraries(grpc::grpc++ INTERFACE grpc::grpc grpc::gpr grpc::upb grpc::address_sorting)
# link built dependencies
target_link_libraries(grpc::grpc++ INTERFACE mindspore::z)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::cares)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::absl_strings mindspore::absl_throw_delegate
mindspore::absl_raw_logging_internal mindspore::absl_int128 mindspore::absl_bad_optional_access)
# link system openssl
find_package(OpenSSL REQUIRED)
target_link_libraries(grpc::grpc++ INTERFACE OpenSSL::SSL OpenSSL::Crypto)
function(ms_grpc_generate c_var h_var)
if(NOT ARGN)
message(SEND_ERROR "Error: ms_grpc_generate() called without any proto files")
return()
endif()
set(${c_var})
set(${h_var})
foreach(file ${ARGN})
get_filename_component(abs_file ${file} ABSOLUTE)
get_filename_component(file_name ${file} NAME_WE)
get_filename_component(file_dir ${abs_file} PATH)
file(RELATIVE_PATH rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${file_dir})
list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc")
list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h")
list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc")
list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h")
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc"
"${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h"
"${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc"
"${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h"
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_BINARY_DIR}/proto"
COMMAND protobuf::protoc --version
COMMAND protobuf::protoc -I${file_dir} --cpp_out=${CMAKE_BINARY_DIR}/proto
--grpc_out=${CMAKE_BINARY_DIR}/proto --plugin=protoc-gen-grpc=$<TARGET_FILE:grpc::grpc_cpp_plugin> ${abs_file}
DEPENDS protobuf::protoc grpc::grpc_cpp_plugin ${abs_file}
COMMENT "Running C++ gRPC compiler on ${file}" VERBATIM)
endforeach()
set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE)
set(${c_var} ${${c_var}} PARENT_SCOPE)
set(${h_var} ${${h_var}} PARENT_SCOPE)
endfunction()
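# Usage sketch: this mirrors how the debugger protos are wired up later in
# this commit (variable and file names are taken from those hunks):
#   ms_grpc_generate(DEBUGGER_GRPC_SRCS DEBUGGER_GRPC_HDRS "debug/debugger/debug_grpc.proto")
#   list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_GRPC_SRCS})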

@ -0,0 +1,9 @@
mindspore_add_pkg(zlib
VER 1.2.11
LIBS z
URL https://github.com/madler/zlib/archive/v1.2.11.tar.gz
MD5 0095d2d2d1f3442ce1318336637b695f
CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release)
include_directories(${zlib_INC})
add_library(mindspore::z ALIAS zlib::z)

@ -14,6 +14,16 @@ include(${CMAKE_SOURCE_DIR}/cmake/external_libs/eigen.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/json.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/dependency_securec.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/protobuf.cmake)
if (ENABLE_DEBUGGER)
# build dependencies of gRPC
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/absl.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/c-ares.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/zlib.cmake)
# build gRPC
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/grpc.cmake)
endif()
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/pybind11.cmake)
MESSAGE("go to link flatbuffers")
include(${CMAKE_SOURCE_DIR}/cmake/external_libs/flatbuffers.cmake)

@ -17,6 +17,7 @@ option(ENABLE_DUMP_E2E "Enable dump e2e file, default on" OFF)
option(ENABLE_DUMP_IR "Enable dump function graph ir, default on" ON)
option(ENABLE_MPI "enable mpi" OFF)
option(ENABLE_AKG "enable akg" OFF)
option(ENABLE_DEBUGGER "enable debugger" OFF)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if (WIN32)
@ -112,3 +113,7 @@ endif()
if(ENABLE_DUMP_E2E)
add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DEBUGGER)
add_compile_definitions(ENABLE_DEBUGGER)
endif()

@ -71,6 +71,17 @@ message("onnx proto path is :" ${ONNX_PROTO})
ms_protobuf_generate(ONNX_PROTO_SRCS ONNX_PROTO_HDRS ${ONNX_PROTO})
list(APPEND MINDSPORE_PROTO_LIST ${ONNX_PROTO_SRCS})
if (ENABLE_DEBUGGER)
# debugger: compile proto files
include_directories("${CMAKE_BINARY_DIR}/debug/debugger")
file(GLOB_RECURSE DEBUGGER_PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/debug_graph.proto")
ms_protobuf_generate(DEBUGGER_PROTO_SRCS DEBUGGER_PROTO_HDRS ${DEBUGGER_PROTO_LIST})
file(GLOB_RECURSE DEBUGGER_GRPC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "debug/debugger/debug_grpc.proto")
ms_grpc_generate(DEBUGGER_GRPC_SRCS DEBUGGER_GRPC_HDRS ${DEBUGGER_GRPC_LIST})
list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_PROTO_SRCS})
list(APPEND MINDSPORE_PROTO_LIST ${DEBUGGER_GRPC_SRCS})
endif ()
if (ENABLE_DUMP_PROTO)
include_directories(${CMAKE_BINARY_DIR})
file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "utils/node_strategy.proto")
@ -125,6 +136,14 @@ endforeach ()
set_property(SOURCE ${SUB_OBJECTS_SRC} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_ME)
add_library(mindspore STATIC ${SUB_OBJECTS_SRC})
target_link_libraries(proto_input mindspore::protobuf)
if (ENABLE_DEBUGGER)
# debugger: link grpc
target_link_libraries(proto_input mindspore::grpc++)
endif()
target_link_libraries(mindspore proto_input)
if (ENABLE_CPU AND ENABLE_MPI)
target_link_libraries(mindspore securec mindspore::flatbuffers mindspore::ompi)
@ -217,6 +236,7 @@ if (USE_GLOG)
endif ()
if (ENABLE_DUMP_PROTO)
message("add protobuf lib to c_expression")
target_link_libraries(_c_expression PRIVATE mindspore::protobuf)
endif ()

@ -10,6 +10,15 @@ set(_DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/trace.cc"
)
if (ENABLE_DEBUGGER)
list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
)
endif (ENABLE_DEBUGGER)
if (ENABLE_DUMP_E2E)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/e2e_dump.cc")
endif (ENABLE_DUMP_E2E)

@ -0,0 +1,194 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "debug/debug_services.h"
namespace mindspore {
DebugServices::DebugServices() {
tensor_loader_ = new TensorLoader();
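// iter_num is unsigned, so -1 intentionally wraps to UINT32_MAX and serves as an "iteration not set yet" sentinel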
uint32_t iter_num = -1;
tensor_loader_->set_iter_num(iter_num);
}
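// Note: copies share the original's TensorLoader pointer, and the destructor deletes it; a copy must therefore not outlive the object it was copied from.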
DebugServices::DebugServices(const DebugServices &other) {
tensor_loader_ = other.tensor_loader_;
watchpoint_table = other.watchpoint_table;
}
DebugServices &DebugServices::operator=(const DebugServices &other) {
if (this != &other) {
tensor_loader_ = other.tensor_loader_;
watchpoint_table = other.watchpoint_table;
}
return *this;
}
DebugServices::~DebugServices() { delete tensor_loader_; }
void DebugServices::add_watchpoint(unsigned int id, unsigned int watch_condition,
const std::vector<std::tuple<std::string, bool>> &check_node_list) {
std::lock_guard<std::mutex> lg(lock_);
watchpoint_t watchpoint_item;
watchpoint_item.id = id;
if (watch_condition == 0) {
watchpoint_item.conditions.nan.enabled = true;
} else if (watch_condition == 1) {
watchpoint_item.conditions.inf.enabled = true;
watchpoint_item.conditions.neg_inf.enabled = true;
}
watchpoint_item.check_node_list = check_node_list;
watchpoint_table[id] = watchpoint_item;
}
void DebugServices::remove_watchpoint(unsigned int id) {
std::lock_guard<std::mutex> lg(lock_);
watchpoint_table.erase(id);
}
void DebugServices::check_watchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<int> *condition, std::vector<unsigned int> *watchpoint_id) {
std::lock_guard<std::mutex> lg(lock_);
std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
std::string current_tensor_name;
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
for (std::size_t i = 0; i < tensor_list.size(); i++) {
current_tensor_name = tensor_list[i]->GetName();
mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
int tensor_data_type = tensor_ptr->data_type_c();
// check if we need to analyze this node and for which watchpoints we will check
// create a list of watchpoints to check
watchpoints_to_check_table.clear();
for (const auto &w_table_item : watchpoint_table) {
// if the watchpoint is checking for a nan or inf and the current tensor is not of a float type, then
// don't check the watchpoint for this tensor
if (std::get<1>(w_table_item).conditions.inf.enabled || std::get<1>(w_table_item).conditions.neg_inf.enabled ||
std::get<1>(w_table_item).conditions.nan.enabled) {
if (tensor_data_type != kNumberTypeFloat16 && tensor_data_type != kNumberTypeFloat &&
tensor_data_type != kNumberTypeFloat32 && tensor_data_type != kNumberTypeFloat64) {
continue;
}
}
auto check_node_list = std::get<1>(w_table_item).check_node_list;
for (auto check_node : check_node_list) {
std::string w_name = std::get<0>(check_node);
bool w_type = std::get<1>(check_node);
// check whether this watchpoint covers the current tensor, either by scope/wildcard match on the full tensor name or by exact node name
std::string current_node_name = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
if ((w_type == true && (current_tensor_name.find(w_name) != string::npos || w_name == "*")) ||
(w_type == false && current_node_name == w_name)) {
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
break;
}
}
}
// check if no watchpoints are valid for the current tensor
if (watchpoints_to_check_table.empty()) {
continue;
}
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
continue;
}
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c(false));
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
std::vector<unsigned int> hit_encountered;
for (unsigned int index = 0; index < num_elements; index++) {
float x = start_addr[index];
it_w_table_check = watchpoints_to_check_table.begin();
while (it_w_table_check != watchpoints_to_check_table.end()) {
if ((it_w_table_check->second.conditions.inf.enabled || it_w_table_check->second.conditions.neg_inf.enabled) &&
isinf(x)) {
hit_encountered.push_back(it_w_table_check->second.id);
} else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
hit_encountered.push_back(it_w_table_check->second.id);
}
++it_w_table_check;
}
if (hit_encountered.size()) {
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
name->push_back(name_no_slot);
slot->push_back(std::to_string(tensor_list[i]->GetSlot()));
data_ptr->push_back(reinterpret_cast<char *>(tensor_ptr->data_c(false)));
data_size->push_back(tensor_ptr->data().nbytes());
int condition_item = -1;
if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
condition_item = 0;
} else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
condition_item = 1;
}
condition->push_back(condition_item);
watchpoint_id->push_back(*it_hit_id);
watchpoints_to_check_table.erase(*it_hit_id);
}
hit_encountered.clear();
}
if (watchpoints_to_check_table.empty()) {
break;
}
}
}
}
void DebugServices::read_nodes_tensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
tensor_loader_->SearchTensors(name, &result_list);
for (auto result : result_list) {
if (!std::get<1>(result)) {
continue;
}
ret_name->push_back(std::get<0>(result));
data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetTensor()->data_c(false)));
data_size->push_back(std::get<1>(result)->GetTensor()->data().nbytes());
dtype->push_back(std::get<1>(result)->GetTensor()->Dtype());
shape->push_back(std::get<1>(result)->GetTensor()->shape());
}
}
TensorLoader *DebugServices::get_tensor_loader() const { return tensor_loader_; }
} // namespace mindspore
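A minimal caller-side sketch of the watchpoint API above (hypothetical driver code, not part of this commit):

  // register watchpoint 1 with condition 0 (NaN) on every node; "*" is the
  // wildcard handled in check_watchpoints above
  DebugServices services;
  services.add_watchpoint(1, 0, {std::make_tuple(std::string("*"), true)});
  // scan all tensors currently held by the TensorLoader and collect any hits
  std::vector<std::string> names, slots;
  std::vector<char *> data_ptrs;
  std::vector<unsigned int> data_sizes;
  std::vector<int> conditions;
  std::vector<unsigned int> watchpoint_ids;
  services.check_watchpoints(&names, &slots, &data_ptrs, &data_sizes, &conditions, &watchpoint_ids);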

@ -0,0 +1,95 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
#define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
#include <vector>
#include <string>
#include <memory>
#include <tuple>
#include <unordered_map>
#include <mutex>
#include "debug/tensor_load.h"
#include "debug/tensor_data.h"
#include "ir/dtype.h"
namespace mindspore {
class DebugServices {
public:
DebugServices();
DebugServices(const DebugServices &other);
DebugServices &operator=(const DebugServices &other);
~DebugServices();
void add_watchpoint(unsigned int id, unsigned int watch_condition,
const std::vector<std::tuple<std::string, bool>> &check_node_list);
void remove_watchpoint(unsigned int id);
void check_watchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr,
std::vector<unsigned int> *data_size, std::vector<int> *condition,
std::vector<unsigned int> *watchpoint_id);
void read_nodes_tensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);
TensorLoader *get_tensor_loader() const;
private:
typedef struct condition_no_param {
bool enabled = false;
} condition_no_param_t;
typedef struct condition_with_param {
bool enabled = false;
float parameter = 0;
} condition_with_param_t;
typedef struct conditions {
condition_no_param_t inf;
condition_no_param_t neg_inf;
condition_no_param_t nan;
condition_with_param_t max_below;
condition_with_param_t max_above;
condition_with_param_t min_below;
condition_with_param_t min_above;
condition_with_param_t max_minus_min_below;
condition_with_param_t max_minus_min_above;
condition_with_param_t mean_below;
condition_with_param_t mean_above;
condition_with_param_t std_dev_below;
condition_with_param_t std_dev_above;
} conditions_t;
typedef struct watchpoint {
unsigned int id;
conditions_t conditions;
std::vector<std::tuple<std::string, bool>> check_node_list;
} watchpoint_t;
std::mutex lock_;
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
TensorLoader *tensor_loader_;
};
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_

File diff suppressed because it is too large

@ -0,0 +1,81 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
syntax = "proto3";
package debugger;
import "debug_graph.proto";
service EventListener {
rpc WaitCMD (Metadata) returns (EventReply) {};
rpc SendMetadata (Metadata) returns (EventReply) {};
rpc SendGraph (GraphProto) returns (EventReply) {};
rpc SendTensors (stream TensorProto) returns (EventReply) {};
rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
}
message Metadata {
string device_name = 1;
int32 cur_step = 2;
}
message EventReply {
enum Status {
OK = 0;
FAILED = 1;
PENDING = 2;
}
Status status = 1;
oneof cmd {
bool exit = 2;
int32 run_cmd = 3;
SetCMD set_cmd = 4;
ViewCMD view_cmd = 5;
}
}
message SetCMD {
repeated WatchNode watch_nodes = 1;
WatchCondition watch_condition = 2;
bool delete = 3;
int32 id = 4;
}
message ViewCMD {
repeated TensorProto tensors = 1;
}
message WatchCondition {
enum Condition {
nan = 0;
inf = 1;
}
Condition condition = 1;
}
message WatchNode {
string node_name = 1;
string node_type = 2;
}
message WatchpointHit {
TensorProto tensor = 1;
WatchCondition watch_condition = 2;
int32 id = 3;
}
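For reference, an EventReply carrying a SetCMD that registers a NaN watchpoint on every node could look like this in proto text format (illustrative values; the node_type string is a placeholder):

  status: OK
  set_cmd {
    watch_nodes { node_name: "*" node_type: "scope" }
    watch_condition { condition: nan }
    id: 1
  }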

File diff suppressed because it is too large

@ -0,0 +1,159 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
#define MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
#include <list>
#include <memory>
#include <string>
#include "session/kernel_graph.h"
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
using debugger::DataType;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::ModelProto;
using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchNode;
using debugger::WatchpointHit;
template <class T>
using ProtoVector = google::protobuf::RepeatedPtrField<T>;
namespace mindspore {
// different types of command received by debugger
// need to keep sync with client-side proto and server-side proto
enum class DebuggerCommand { kExitCMD = 2, kRunCMD = 3, kSetCMD = 4, kViewCMD = 5, kUnknownCMD = -1 };
class Debugger : public std::enable_shared_from_this<Debugger> {
public:
static std::shared_ptr<Debugger> GetInstance() {
std::lock_guard<std::mutex> i_lock(instance_lock_);
if (debugger_ == nullptr) {
debugger_ = std::shared_ptr<Debugger>(new (std::nothrow) Debugger());
}
return debugger_;
}
// destructor
~Debugger() = default;
// init
// only save device_id
void Init(const uint32_t device_id);
// reset debugger
void Reset();
// enable debugger
// send graph and wait for command
// do nothing if graph is set already
void PreExecute(const KernelGraphPtr &graph_ptr);
// analyze tensors and wait for command
// don't need a graph_ptr because it is saved during pre_execute
void PostExecute();
// suspend the execution after a debug_op
void PostDebugOp();
DebugServices *get_debug_services();
bool debugger_enabled();
private:
// private constructor for singleton
Debugger();
// enable debugger
// instantiate class members
// read env variable for grpc client
void EnableDebugger();
// check and save graph pointer
void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
// check if the graph is a dataset graph
void CheckDatasetGraph();
// serialize graph and get proto
GraphProto GetGraphProto();
// send graph and enter command wait loop
void SendGraphAndSuspend(const GraphProto &graph_proto);
// wait for command and process command
// send command request and process reply in a loop
// break if RunCMD
void CommandLoop();
// process reply and command type
DebuggerCommand GetCommand(const EventReply &reply);
// parse other data out of EventReply
ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply);
WatchCondition GetWatchcondition(const EventReply &reply);
int32_t GetWatchpointID(const EventReply &reply);
bool GetWatchpointDelete(const EventReply &reply);
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
// set what nodes and conditions to watch
void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id);
// remove watchpoint with id
void RemoveWatchpoint(const int32_t id);
// load tensor for view command
std::list<TensorProto> LoadTensors(const ProtoVector<TensorProto> &tensors);
// terminate training process
void Exit();
// analyze tensors and check watchpoint conditions
// return names of tensors and what condition they hit
std::list<WatchpointHit> CheckWatchpoints();
// send watchpoints that hit and enter command wait loop
void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);
// class members
std::unique_ptr<GrpcClient> grpc_client_;
std::unique_ptr<DebugServices> debug_services_;
KernelGraphPtr graph_ptr_;
uint32_t device_id_;
int32_t num_step_;
bool debugger_enabled_;
bool is_dataset_graph_;
std::mutex access_lock_;
// singleton
static std::mutex instance_lock_;
static std::shared_ptr<Debugger> debugger_;
};
using DebuggerPtr = std::shared_ptr<Debugger>;
// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);
ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);
// for getting proto DataType from Type of Tensor
DataType GetDebuggerNumberDataType(const TypePtr &type);
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
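The debugger.cc diff is suppressed above, so as a sketch only (not the committed implementation), GetCommand plausibly maps the protobuf oneof onto DebuggerCommand like this:

  DebuggerCommand GetCommandSketch(const EventReply &reply) {
    // protobuf generates an EventReply::CmdCase enum for the oneof cmd field
    switch (reply.cmd_case()) {
      case EventReply::kExit: return DebuggerCommand::kExitCMD;
      case EventReply::kRunCmd: return DebuggerCommand::kRunCMD;
      case EventReply::kSetCmd: return DebuggerCommand::kSetCMD;
      case EventReply::kViewCmd: return DebuggerCommand::kViewCMD;
      default: return DebuggerCommand::kUnknownCMD;
    }
  }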

@ -0,0 +1,124 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thread>
#include "debug/debugger/grpc_client.h"
#include "utils/log_adapter.h"
using debugger::EventListener;
using debugger::EventReply;
using debugger::EventReply_Status_FAILED;
using debugger::GraphProto;
using debugger::Metadata;
using debugger::TensorProto;
using debugger::WatchpointHit;
namespace mindspore {
GrpcClient::GrpcClient(const std::string &host, const std::string &port) : stub_(nullptr) { Init(host, port); }
void GrpcClient::Init(const std::string &host, const std::string &port) {
std::string target_str = host + ":" + port;
MS_LOG(INFO) << "GrpcClient connecting to: " << target_str;
std::shared_ptr<grpc::Channel> channel = grpc::CreateChannel(target_str, grpc::InsecureChannelCredentials());
stub_ = EventListener::NewStub(channel);
}
void GrpcClient::Reset() { stub_ = nullptr; }
EventReply GrpcClient::WaitForCommand(const Metadata &metadata) {
EventReply reply;
grpc::ClientContext context;
grpc::Status status = stub_->WaitCMD(&context, metadata, &reply);
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: WaitForCommand";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
reply.set_status(EventReply_Status_FAILED);
}
return reply;
}
EventReply GrpcClient::SendMetadata(const Metadata &metadata) {
EventReply reply;
grpc::ClientContext context;
grpc::Status status = stub_->SendMetadata(&context, metadata, &reply);
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendMetadata";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
reply.set_status(EventReply_Status_FAILED);
}
return reply;
}
EventReply GrpcClient::SendGraph(const GraphProto &graph) {
EventReply reply;
grpc::ClientContext context;
grpc::Status status = stub_->SendGraph(&context, graph, &reply);
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendGraph";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
reply.set_status(EventReply_Status_FAILED);
}
return reply;
}
EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) {
EventReply reply;
grpc::ClientContext context;
std::unique_ptr<grpc::ClientWriter<TensorProto> > writer(stub_->SendTensors(&context, &reply));
for (const auto &tensor : tensors) {
if (!writer->Write(tensor)) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
writer->WritesDone();
grpc::Status status = writer->Finish();
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendTensors";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
reply.set_status(EventReply_Status_FAILED);
}
return reply;
}
EventReply GrpcClient::SendWatchpointHits(const std::list<WatchpointHit> &watchpoints) {
EventReply reply;
grpc::ClientContext context;
std::unique_ptr<grpc::ClientWriter<WatchpointHit> > writer(stub_->SendWatchpointHits(&context, &reply));
for (const auto &watchpoint : watchpoints) {
if (!writer->Write(watchpoint)) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
writer->WritesDone();
grpc::Status status = writer->Finish();
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendWatchpointHits";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
reply.set_status(EventReply_Status_FAILED);
}
return reply;
}
} // namespace mindspore
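A hypothetical caller (host, port, and device name are placeholder values; in this commit the Debugger reads the gRPC client settings from environment variables, per EnableDebugger in debugger.h):

  GrpcClient client("localhost", "50051");
  debugger::Metadata metadata;
  metadata.set_device_name("Ascend/device_0");  // placeholder device name
  metadata.set_cur_step(0);
  EventReply reply = client.SendMetadata(metadata);
  if (reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "SendMetadata was not acknowledged";
  }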

@ -0,0 +1,61 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUGGER_GRPC_CLIENT_H_
#define MINDSPORE_CCSRC_DEBUG_DEBUGGER_GRPC_CLIENT_H_
#include <grpcpp/grpcpp.h>
#include <string>
#include <list>
#include <memory>
#include "proto/debug_grpc.grpc.pb.h"
using debugger::EventListener;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::Metadata;
using debugger::TensorProto;
using debugger::WatchpointHit;
namespace mindspore {
class GrpcClient {
public:
// constructor
GrpcClient(const std::string &host, const std::string &port);
// destructor
~GrpcClient() = default;
// init
void Init(const std::string &host, const std::string &port);
// reset
void Reset();
EventReply WaitForCommand(const Metadata &metadata);
EventReply SendMetadata(const Metadata &metadata);
EventReply SendGraph(const GraphProto &graph);
EventReply SendTensors(const std::list<TensorProto> &tensors);
EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);
private:
std::unique_ptr<EventListener::Stub> stub_;
};
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_GRPC_CLIENT_H_

File diff suppressed because it is too large

@ -0,0 +1,75 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
#define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
#include <vector>
#include <string>
#include <cstring>
#include <iostream>
#include "ir/tensor.h"
namespace mindspore {
class TensorData {
private:
mindspore::tensor::TensorPtr tensor_ptr;
std::string name;
size_t slot;
int execution_order;
public:
TensorData() : slot(0), execution_order(-1) {}
TensorData(const TensorData &obj) {
std::cout << "Copy Constructor" << std::endl;
this->name = obj.name;
this->execution_order = obj.execution_order;
this->slot = obj.slot;
this->tensor_ptr = obj.tensor_ptr;
}
~TensorData() {}
std::string GetName() { return this->name; }
mindspore::tensor::TensorPtr GetTensor() { return this->tensor_ptr; }
size_t GetSlot() { return this->slot; }
int GetExecutionOrder() { return this->execution_order; }
bool SetExecutionOrder(int execution_order) {
this->execution_order = execution_order;
return true;
}
bool SetName(const std::string &name) {
this->name = name;
return true;
}
bool SetTensor(mindspore::tensor::TensorPtr out_tensor) {
this->tensor_ptr = out_tensor;
return true;
}
bool SetSlot(size_t slot) {
this->slot = slot;
return true;
}
};
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_

@ -0,0 +1,69 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
#define MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
#include <memory>
#include <vector>
#include <map>
#include <tuple>
#include <string>
#include "debug/tensor_data.h"
namespace mindspore {
class TensorLoader {
public:
TensorLoader() : iter_num(-1) {}
~TensorLoader() {}
bool LoadNewTensor(std::shared_ptr<TensorData> tensor) {
tensor_list.push_back(tensor);
tensor_list_map.insert({tensor->GetName(), tensor});
return true;
}
std::vector<std::shared_ptr<TensorData>> GetTensor() { return tensor_list; }
uint32_t GetIterNum() { return iter_num; }
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
void SearchTensors(const std::vector<std::string> &search_list,
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
for (auto i : search_list) {
std::map<std::string, std::shared_ptr<TensorData>>::iterator iter;
iter = tensor_list_map.find(i);
if (iter != tensor_list_map.end()) {
result_list->push_back(std::make_tuple(i, iter->second));
} else {
result_list->push_back(std::make_tuple(i, nullptr));
}
}
}
bool EmptyTensor() {
tensor_list_map.clear();
tensor_list.clear();
return true;
}
void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
private:
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
uint32_t iter_num;
};
} // namespace mindspore
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
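A minimal sketch of the loader round trip (the tensor name is illustrative; LoadMemToHost below constructs real names as kernel_name:slot):

  auto data = std::make_shared<TensorData>();
  data->SetName("Default/network/Conv2D-op1:0");  // illustrative name
  data->SetSlot(0);
  // a real caller would also SetTensor(...) with the host copy of device data
  TensorLoader loader;
  loader.LoadNewTensor(data);
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> results;
  loader.SearchTensors({"Default/network/Conv2D-op1:0"}, &results);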

@ -30,6 +30,10 @@
#ifdef ENABLE_DUMP_E2E
#include "debug/e2e_dump.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "debug/tensor_load.h"
#endif
namespace mindspore {
namespace device {
namespace ascend {
@ -346,6 +350,52 @@ bool AscendDeviceAddress::DumpMemToFile(bool trans_flag, const std::string &file
return ret;
}
#endif
#ifdef ENABLE_DEBUGGER
bool AscendDeviceAddress::LoadMemToHost(bool trans_flag, const std::string &tensor_name, int execution_order,
const std::string &host_fmt, const std::vector<int> &host_shape,
TypeId host_type, size_t slot, Debugger *debugger) const {
bool ret = false;
DebugServices *debug_services = debugger->get_debug_services();
TensorLoader *tensor_loader = debug_services->get_tensor_loader();
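// two load paths: with trans_flag the tensor is converted to host format via SyncDeviceToHost; otherwise the raw device bytes are copied as-is with rtMemcpy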
if (trans_flag) {
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
size_t host_size = out_tensor->data().nbytes();
ret = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c(true));
if (!ret) {
MS_LOG(ERROR) << "Copy device mem to host failed";
return ret;
}
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetTensor(out_tensor);
tensor_data->SetSlot(slot);
ret = tensor_loader->LoadNewTensor(tensor_data);
} else {
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = rtMemcpy(out_tensor->data_c(true), host_size, ptr_, host_size, RT_MEMCPY_DEVICE_TO_HOST);
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetTensor(out_tensor);
tensor_data->SetSlot(slot);
ret = tensor_loader->LoadNewTensor(tensor_data);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_LOG(ERROR) << "SyncDeviceToHost: rtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
}
return ret;
}
#endif
} // namespace ascend
} // namespace device
} // namespace mindspore

@ -25,6 +25,9 @@
#include "ir/dtype.h"
namespace mindspore {
#ifdef ENABLE_DEBUGGER
class Debugger;
#endif
namespace device {
namespace ascend {
class AscendDeviceAddress : public DeviceAddress {
@ -39,6 +42,10 @@ class AscendDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DUMP_E2E
bool DumpMemToFile(bool dump_mode, const std::string &filepath, const std::string &host_fmt,
const std::vector<int> &host_shape, TypeId host_type) const;
#endif
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(bool dump_mode, const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const std::vector<int> &host_shape, TypeId host_type, size_t slot, Debugger *debugger) const;
#endif
private:
bool SyncDeviceToHostAndConvertFormat(const std::vector<int> &shape, size_t size, TypeId type, void *host_ptr) const;

@ -41,6 +41,7 @@
#include "kernel/tbe/tbe_python_funcs.h"
#include "pre_activate/mem_reuse/mem_reuse_checker.h"
#include "device/ascend/ascend_memory_manager.h"
#include "debug/tensor_load.h"
using mindspore::device::ascend::ProfilingManager;
using mindspore::device::ascend::ProfilingUtils;
@ -293,6 +294,91 @@ bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
return true;
}
#ifdef ENABLE_DEBUGGER
namespace {
void LoadOutput(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
bool trans_flag = false;
const auto &apply_kernels = graph->execution_order();
// for kernels, execution order starts from 1
int exec_order = 1;
for (const auto &node : apply_kernels) {
MS_EXCEPTION_IF_NULL(node);
auto node_name = AnfAlgo::GetCNodeName(node);
std::string kernel_name = node->fullname_with_scope();
auto output_size = AnfAlgo::GetOutputTensorNum(node);
for (size_t j = 0; j < output_size; ++j) {
auto addr = AnfAlgo::GetOutputAddr(node, j);
auto type = AnfAlgo::GetOutputInferDataType(node, j);
auto format = kOpFormat_DEFAULT;
string tensor_name = kernel_name + ':' + std::to_string(j);
auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
std::vector<int> int_shapes;
if (trans_flag) {
int_shapes = trans::GetRuntimePaddingShape(node, j);
} else {
auto shape = AnfAlgo::GetOutputDeviceShape(node, j);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
}
auto ret = ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, j, debugger);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost: flag:" << trans_flag << ", tensor_name:" << tensor_name
<< ", host_format:" << format << ".!";
}
}
exec_order = exec_order + 1;
}
}
void LoadParameters(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
bool trans_flag = false;
const auto &parameters = graph->inputs();
// for parameters, set its execution order to be 0;
int exec_order = 0;
for (auto &item : parameters) {
if (!item->isa<Parameter>()) {
continue;
}
std::string parameter_name = item->fullname_with_scope();
auto addr = AnfAlgo::GetOutputAddr(item, PRAMATER_OUTPUT_INDEX);
auto type = AnfAlgo::GetOutputInferDataType(item, PRAMATER_OUTPUT_INDEX);
auto format = kOpFormat_DEFAULT;
string tensor_name = parameter_name + ':' + "0";
auto ascend_addr = dynamic_cast<const mindspore::device::ascend::AscendDeviceAddress *>(addr);
std::vector<int> int_shapes;
if (trans_flag) {
int_shapes = trans::GetRuntimePaddingShape(item, PRAMATER_OUTPUT_INDEX);
} else {
auto shape = AnfAlgo::GetOutputDeviceShape(item, PRAMATER_OUTPUT_INDEX);
(void)std::transform(shape.begin(), shape.end(), std::back_inserter(int_shapes),
[](size_t inner_item) { return SizeToInt(inner_item); });
}
auto ret = ascend_addr->LoadMemToHost(trans_flag, tensor_name, exec_order, format, int_shapes, type, 0, debugger);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost Failed: flag:" << trans_flag << ", path:" << tensor_name
<< ", host_format:" << format << ".!";
}
}
}
} // namespace
#endif
bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
#ifdef ENABLE_DEBUGGER
MS_LOG(INFO) << "start load step";
uint32_t cur_iter = 0;
MS_LOG(INFO) << "cur iter is " << cur_iter;
// load output
LoadOutput(graph, debugger);
// load parameters
LoadParameters(graph, debugger);
#endif
return true;
}
bool AscendKernelRuntime::NodeOutputDeviceAddressExist(const AnfNodePtr &kernel, size_t index) {
if (AnfAlgo::OutputAddrExist(kernel, index)) {
auto address = AnfAlgo::GetOutputAddr(kernel, index);

@ -37,6 +37,7 @@ class AscendKernelRuntime : public KernelRuntime {
~AscendKernelRuntime() override;
bool Init() override;
bool DumpData(session::KernelGraph *graph) override;
bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
bool GenTask(const session::KernelGraph *graph) override;
bool RunTask(const session::KernelGraph *graph) override;
bool LoadTask(const session::KernelGraph *graph) override;

Some files were not shown because too many files have changed in this diff
