parent
73f440a54d
commit
4c056855e0
@ -0,0 +1,14 @@
|
||||
# Abseil (absl) third-party package.
# The package is declared through the project's mindspore_add_pkg() helper and the
# listed libraries are then re-exported under the mindspore:: namespace.
mindspore_add_pkg(absl
        VER 20200225.2
        LIBS
            absl_strings
            absl_throw_delegate
            absl_raw_logging_internal
            absl_int128
            absl_bad_optional_access
        URL https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz
        MD5 73f2b6e72f1599a9139170c29482ddc4
        CMAKE_OPTION
            -DCMAKE_BUILD_TYPE:STRING=Release
            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=TRUE)

include_directories(${absl_INC})

# One mindspore::<lib> alias per absl::<lib> target, in the same order as LIBS above.
foreach(absl_lib_name
        absl_strings
        absl_throw_delegate
        absl_raw_logging_internal
        absl_int128
        absl_bad_optional_access)
    add_library(mindspore::${absl_lib_name} ALIAS absl::${absl_lib_name})
endforeach()
|
@ -0,0 +1,12 @@
|
||||
# c-ares third-party package, declared through the project's mindspore_add_pkg() helper.
# The CARES_* options request a static, position-independent library only.
mindspore_add_pkg(c-ares
        VER 1.15.0
        LIBS cares
        URL https://github.com/c-ares/c-ares/releases/download/cares-1_15_0/c-ares-1.15.0.tar.gz
        MD5 d2391da274653f7643270623e822dff7
        CMAKE_OPTION
            -DCMAKE_BUILD_TYPE:STRING=Release
            -DCARES_SHARED:BOOL=OFF
            -DCARES_STATIC:BOOL=ON
            -DCARES_STATIC_PIC:BOOL=ON)

include_directories(${c-ares_INC})

# Re-export the library under the mindspore:: namespace.
add_library(mindspore::cares ALIAS c-ares::cares)
|
@ -0,0 +1,110 @@
|
||||
# gRPC third-party package.
# Declares gRPC through the project's mindspore_add_pkg() helper, pointing its CMake build
# at the protobuf, zlib, absl and c-ares packages referenced by the *_ROOT variables, and
# at an OpenSSL located by find_package().
set(grpc_USE_STATIC_LIBS ON)

# Per-platform compiler flags for the gRPC build.
if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
    set(grpc_CXXFLAGS "-fstack-protector-all -Wno-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
    set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -O2")
else()
    set(grpc_CXXFLAGS "-fstack-protector-all -Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fvisibility=hidden -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
endif()

# "-z relro/now/noexecstack" are ELF linker keywords; they are not passed on Darwin,
# whose linker does not accept "-z" options.
if(NOT CMAKE_SYSTEM_NAME MATCHES "Darwin")
    set(grpc_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
endif()

# Locate the protobuf CMake package config directory (lib64 layout first, then lib).
if(EXISTS ${protobuf_ROOT}/lib64)
    set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib64/cmake/protobuf")
else()
    set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${protobuf_ROOT}/lib/cmake/protobuf")
endif()
message("grpc using Protobuf_DIR : " ${_FINDPACKAGE_PROTOBUF_CONFIG_DIR})

# Locate the absl CMake package config directory (lib64 layout first, then lib).
if(EXISTS ${absl_ROOT}/lib64)
    set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib64/cmake/absl")
else()
    set(_FINDPACKAGE_ABSL_CONFIG_DIR "${absl_ROOT}/lib/cmake/absl")
endif()
message("grpc using absl_DIR : " ${_FINDPACKAGE_ABSL_CONFIG_DIR})

# Locate the c-ares CMake package config directory. The lib64 location is used only when
# the config directory actually exists there; otherwise the lib location is kept.
if(EXISTS ${c-ares_ROOT}/lib64/cmake/c-ares)
    set(_FINDPACKAGE_CARES_CONFIG_DIR "${c-ares_ROOT}/lib64/cmake/c-ares")
else()
    set(_FINDPACKAGE_CARES_CONFIG_DIR "${c-ares_ROOT}/lib/cmake/c-ares")
endif()
message("grpc using c-ares_DIR : " ${_FINDPACKAGE_CARES_CONFIG_DIR})

# Forward OPENSSL_ROOT_DIR to the gRPC build only when the caller provided one.
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "")
if(OPENSSL_ROOT_DIR)
    set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}")
endif()

mindspore_add_pkg(grpc
        VER 1.27.3
        LIBS grpc++ grpc gpr upb address_sorting
        EXE grpc_cpp_plugin
        URL https://github.com/grpc/grpc/archive/v1.27.3.tar.gz
        MD5 0c6c3fc8682d4262dd0e5e6fabe1a7e2
        CMAKE_OPTION -DCMAKE_BUILD_TYPE:STRING=Release
        -DgRPC_INSTALL:BOOL=ON
        -DgRPC_BUILD_TESTS:BOOL=OFF
        -DgRPC_PROTOBUF_PROVIDER:STRING=package
        -DgRPC_PROTOBUF_PACKAGE_TYPE:STRING=CONFIG
        -DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
        -DgRPC_ZLIB_PROVIDER:STRING=package
        -DZLIB_ROOT:PATH=${zlib_ROOT}
        -DgRPC_ABSL_PROVIDER:STRING=package
        -Dabsl_DIR:PATH=${_FINDPACKAGE_ABSL_CONFIG_DIR}
        -DgRPC_CARES_PROVIDER:STRING=package
        -Dc-ares_DIR:PATH=${_FINDPACKAGE_CARES_CONFIG_DIR}
        -DgRPC_SSL_PROVIDER:STRING=package
        ${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
        )

include_directories(${grpc_INC})

add_library(mindspore::grpc++ ALIAS grpc::grpc++)

# link other grpc libs
target_link_libraries(grpc::grpc++ INTERFACE grpc::grpc grpc::gpr grpc::upb grpc::address_sorting)

# link built dependencies
target_link_libraries(grpc::grpc++ INTERFACE mindspore::z)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::cares)
target_link_libraries(grpc::grpc++ INTERFACE mindspore::absl_strings mindspore::absl_throw_delegate
        mindspore::absl_raw_logging_internal mindspore::absl_int128 mindspore::absl_bad_optional_access)

# link system openssl
find_package(OpenSSL REQUIRED)
target_link_libraries(grpc::grpc++ INTERFACE OpenSSL::SSL OpenSSL::Crypto)
|
||||
|
||||
|
||||
# ms_grpc_generate(<c_var> <h_var> <proto>...)
#
# For every .proto file given, registers a custom command that runs protobuf::protoc with
# both the C++ generator and the gRPC C++ plugin. All outputs go to
# ${CMAKE_BINARY_DIR}/proto and are named after the proto file's base name:
#   <name>.pb.cc / <name>.pb.h / <name>.grpc.pb.cc / <name>.grpc.pb.h
#
# Arguments:
#   c_var - name of the variable that receives the generated .cc paths (set in caller scope)
#   h_var - name of the variable that receives the generated .h paths (set in caller scope)
#   ARGN  - one or more .proto files; calling without any raises SEND_ERROR and returns
#
# Example:
#   ms_grpc_generate(GRPC_SRCS GRPC_HDRS debug_grpc.proto)
function(ms_grpc_generate c_var h_var)
    if(NOT ARGN)
        message(SEND_ERROR "Error: ms_grpc_generate() called without any proto files")
        return()
    endif()

    set(${c_var})
    set(${h_var})

    foreach(file ${ARGN})
        get_filename_component(abs_file "${file}" ABSOLUTE)
        # NAME_WE drops everything after the first '.', so "a.b.proto" yields "a".
        get_filename_component(file_name "${file}" NAME_WE)
        get_filename_component(file_dir "${abs_file}" PATH)

        list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc")
        list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h")
        list(APPEND ${c_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc")
        list(APPEND ${h_var} "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h")

        add_custom_command(
                OUTPUT "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.cc"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.pb.h"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.cc"
                "${CMAKE_BINARY_DIR}/proto/${file_name}.grpc.pb.h"
                WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
                COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_BINARY_DIR}/proto"
                COMMAND protobuf::protoc --version
                COMMAND protobuf::protoc "-I${file_dir}" "--cpp_out=${CMAKE_BINARY_DIR}/proto"
                "--grpc_out=${CMAKE_BINARY_DIR}/proto" --plugin=protoc-gen-grpc=$<TARGET_FILE:grpc::grpc_cpp_plugin> "${abs_file}"
                DEPENDS protobuf::protoc grpc::grpc_cpp_plugin "${abs_file}"
                COMMENT "Running C++ gRPC compiler on ${file}" VERBATIM)
    endforeach()

    # Mark the outputs as generated so they may be referenced before they exist on disk.
    set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE)
    set(${c_var} ${${c_var}} PARENT_SCOPE)
    set(${h_var} ${${h_var}} PARENT_SCOPE)

endfunction()
|
@ -0,0 +1,9 @@
|
||||
# zlib third-party package, declared through the project's mindspore_add_pkg() helper.
mindspore_add_pkg(zlib
        VER 1.2.11
        LIBS z
        URL https://github.com/madler/zlib/archive/v1.2.11.tar.gz
        MD5 0095d2d2d1f3442ce1318336637b695f
        CMAKE_OPTION
            -DCMAKE_BUILD_TYPE:STRING=Release)

include_directories(${zlib_INC})

# Re-export the library under the mindspore:: namespace.
add_library(mindspore::z ALIAS zlib::z)
|
@ -0,0 +1,194 @@
|
||||
/**
|
||||
* Copyright 2019-2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "debug/debug_services.h"
|
||||
namespace mindspore {
|
||||
|
||||
// Default constructor: allocates the TensorLoader held by this object and sets its
// iteration number to -1 converted to uint32_t.
DebugServices::DebugServices() : tensor_loader_(new TensorLoader()) {
  uint32_t initial_iter_num = static_cast<uint32_t>(-1);
  tensor_loader_->set_iter_num(initial_iter_num);
}
|
||||
|
||||
// Copy constructor: copies the watchpoint table and stores the same TensorLoader
// pointer as `other` (the loader itself is not duplicated).
// NOTE(review): ~DebugServices deletes tensor_loader_, so two live objects that share
// the pointer would delete the same loader — confirm copies are never destroyed
// independently of their source.
DebugServices::DebugServices(const DebugServices &other)
    : watchpoint_table(other.watchpoint_table), tensor_loader_(other.tensor_loader_) {}
|
||||
|
||||
// Copy assignment: on non-self assignment, takes over `other`'s TensorLoader pointer and
// copies its watchpoint table. Self-assignment is a no-op.
// NOTE(review): the previously held tensor_loader_ is not released here, and afterwards
// both objects hold the same pointer that ~DebugServices deletes — confirm intended.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this == &other) {
    return *this;
  }
  tensor_loader_ = other.tensor_loader_;
  watchpoint_table = other.watchpoint_table;
  return *this;
}
|
||||
|
||||
// Destructor: deletes the TensorLoader pointed to by tensor_loader_.
DebugServices::~DebugServices() {
  delete tensor_loader_;
}
|
||||
|
||||
void DebugServices::add_watchpoint(unsigned int id, unsigned int watch_condition,
|
||||
const std::vector<std::tuple<std::string, bool>> &check_node_list) {
|
||||
std::lock_guard<std::mutex> lg(lock_);
|
||||
|
||||
watchpoint_t watchpoint_item;
|
||||
|
||||
watchpoint_item.id = id;
|
||||
|
||||
if (watch_condition == 0) {
|
||||
watchpoint_item.conditions.nan.enabled = true;
|
||||
} else if (watch_condition == 1) {
|
||||
watchpoint_item.conditions.inf.enabled = true;
|
||||
watchpoint_item.conditions.neg_inf.enabled = true;
|
||||
}
|
||||
|
||||
watchpoint_item.check_node_list = check_node_list;
|
||||
|
||||
watchpoint_table[id] = watchpoint_item;
|
||||
}
|
||||
|
||||
void DebugServices::remove_watchpoint(unsigned int id) {
|
||||
std::lock_guard<std::mutex> lg(lock_);
|
||||
watchpoint_table.erase(id);
|
||||
}
|
||||
|
||||
void DebugServices::check_watchpoints(std::vector<std::string> *name, std::vector<std::string> *slot,
|
||||
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
|
||||
std::vector<int> *condition, std::vector<unsigned int> *wacthpoint_id) {
|
||||
std::lock_guard<std::mutex> lg(lock_);
|
||||
|
||||
std::vector<std::shared_ptr<TensorData>> tensor_list = tensor_loader_->GetTensor();
|
||||
|
||||
std::string current_tensor_name;
|
||||
std::unordered_map<unsigned int, watchpoint_t> watchpoints_to_check_table;
|
||||
|
||||
for (std::size_t i = 0; i < tensor_list.size(); i++) {
|
||||
current_tensor_name = tensor_list[i]->GetName();
|
||||
mindspore::tensor::TensorPtr tensor_ptr = tensor_list[i]->GetTensor();
|
||||
int tensor_data_type = tensor_ptr->data_type_c();
|
||||
|
||||
// check if we need to analyze this node and for which watchpoints we will check
|
||||
// create a list of watchpoints to check
|
||||
watchpoints_to_check_table.clear();
|
||||
for (auto w_table_item : watchpoint_table) {
|
||||
// if the watchpoint is checking for a nan or inf and the current tensor is not of a float type, then
|
||||
// don't check the watchpoint for this tensor
|
||||
if (std::get<1>(w_table_item).conditions.inf.enabled || std::get<1>(w_table_item).conditions.neg_inf.enabled ||
|
||||
std::get<1>(w_table_item).conditions.nan.enabled) {
|
||||
if (tensor_data_type != kNumberTypeFloat16 && tensor_data_type != kNumberTypeFloat &&
|
||||
tensor_data_type != kNumberTypeFloat32 && tensor_data_type != kNumberTypeFloat64) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
auto check_node_list = std::get<1>(w_table_item).check_node_list;
|
||||
|
||||
for (auto check_node : check_node_list) {
|
||||
std::string w_name = std::get<0>(check_node);
|
||||
bool w_type = std::get<1>(check_node);
|
||||
|
||||
// check if the current node tensor name is included the watchpoint
|
||||
std::string current_node_name = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
|
||||
if ((w_type == true && (current_tensor_name.find(w_name) != string::npos || w_name == "*")) ||
|
||||
(w_type == false && current_node_name == w_name)) {
|
||||
watchpoints_to_check_table[w_table_item.second.id] = w_table_item.second;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check if no watchpoints are valid for the current tensor
|
||||
if (watchpoints_to_check_table.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// need to add support for float16 and float64, and other types when we support conditions beyond inf and nan
|
||||
if (tensor_data_type != kNumberTypeFloat && tensor_data_type != kNumberTypeFloat32) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float *start_addr = reinterpret_cast<float *>(tensor_ptr->data_c(false));
|
||||
unsigned int num_elements = (tensor_ptr->data().nbytes()) / sizeof(float);
|
||||
|
||||
std::unordered_map<unsigned int, watchpoint_t>::iterator it_w_table_check;
|
||||
std::vector<unsigned int> hit_encountered;
|
||||
|
||||
for (unsigned int index = 0; index < num_elements; index++) {
|
||||
float x = start_addr[index];
|
||||
it_w_table_check = watchpoints_to_check_table.begin();
|
||||
|
||||
while (it_w_table_check != watchpoints_to_check_table.end()) {
|
||||
if ((it_w_table_check->second.conditions.inf.enabled || it_w_table_check->second.conditions.neg_inf.enabled) &&
|
||||
isinf(x)) {
|
||||
hit_encountered.push_back(it_w_table_check->second.id);
|
||||
} else if (it_w_table_check->second.conditions.nan.enabled && isnan(x)) {
|
||||
hit_encountered.push_back(it_w_table_check->second.id);
|
||||
}
|
||||
|
||||
++it_w_table_check;
|
||||
}
|
||||
|
||||
if (hit_encountered.size()) {
|
||||
for (auto it_hit_id = hit_encountered.begin(); it_hit_id != hit_encountered.end(); ++it_hit_id) {
|
||||
std::string name_no_slot = current_tensor_name.substr(0, current_tensor_name.find_first_of(":"));
|
||||
name->push_back(name_no_slot);
|
||||
|
||||
slot->push_back(std::to_string(tensor_list[i]->GetSlot()));
|
||||
data_ptr->push_back(reinterpret_cast<char *>(tensor_ptr->data_c(false)));
|
||||
data_size->push_back(tensor_ptr->data().nbytes());
|
||||
|
||||
int condition_item = -1;
|
||||
if (watchpoint_table[*it_hit_id].conditions.nan.enabled) {
|
||||
condition_item = 0;
|
||||
} else if (watchpoint_table[*it_hit_id].conditions.inf.enabled ||
|
||||
watchpoint_table[*it_hit_id].conditions.neg_inf.enabled) {
|
||||
condition_item = 1;
|
||||
}
|
||||
condition->push_back(condition_item);
|
||||
|
||||
wacthpoint_id->push_back(*it_hit_id);
|
||||
|
||||
watchpoints_to_check_table.erase(*it_hit_id);
|
||||
}
|
||||
|
||||
hit_encountered.clear();
|
||||
}
|
||||
|
||||
if (watchpoints_to_check_table.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DebugServices::read_nodes_tensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
|
||||
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
|
||||
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape) {
|
||||
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
|
||||
tensor_loader_->SearchTensors(name, &result_list);
|
||||
|
||||
for (auto result : result_list) {
|
||||
if (!std::get<1>(result)) {
|
||||
continue;
|
||||
}
|
||||
ret_name->push_back(std::get<0>(result));
|
||||
data_ptr->push_back(reinterpret_cast<char *>(std::get<1>(result)->GetTensor()->data_c(false)));
|
||||
data_size->push_back(std::get<1>(result)->GetTensor()->data().nbytes());
|
||||
dtype->push_back(std::get<1>(result)->GetTensor()->Dtype());
|
||||
shape->push_back(std::get<1>(result)->GetTensor()->shape());
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the raw TensorLoader pointer held by this object (the same pointer the
// destructor deletes).
TensorLoader *DebugServices::get_tensor_loader() const {
  return tensor_loader_;
}
|
||||
|
||||
} // namespace mindspore
|
@ -0,0 +1,95 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
|
||||
#define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <tuple>
|
||||
#include <unordered_map>
|
||||
#include <mutex>
|
||||
#include "debug/tensor_load.h"
|
||||
#include "debug/tensor_data.h"
|
||||
#include "ir/dtype.h"
|
||||
|
||||
namespace mindspore {
|
||||
class DebugServices {
|
||||
public:
|
||||
DebugServices();
|
||||
|
||||
DebugServices(const DebugServices &other);
|
||||
|
||||
DebugServices &operator=(const DebugServices &other);
|
||||
|
||||
~DebugServices();
|
||||
|
||||
void add_watchpoint(unsigned int id, unsigned int watch_condition,
|
||||
const std::vector<std::tuple<std::string, bool>> &check_node_list);
|
||||
|
||||
void remove_watchpoint(unsigned int id);
|
||||
|
||||
void check_watchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<char *> *data_ptr,
|
||||
std::vector<unsigned int> *data_size, std::vector<int> *condition,
|
||||
std::vector<unsigned int> *wacthpoint_id);
|
||||
|
||||
void read_nodes_tensors(std::vector<std::string> name, std::vector<std::string> *ret_name,
|
||||
std::vector<char *> *data_ptr, std::vector<unsigned int> *data_size,
|
||||
std::vector<TypePtr> *dtype, std::vector<std::vector<int>> *shape);
|
||||
|
||||
TensorLoader *get_tensor_loader() const;
|
||||
|
||||
private:
|
||||
typedef struct condition_no_param {
|
||||
bool enabled = false;
|
||||
} condition_no_param_t;
|
||||
|
||||
typedef struct condition_with_param {
|
||||
bool enabled = false;
|
||||
float parameter = 0;
|
||||
} condition_with_param_t;
|
||||
|
||||
typedef struct conditions {
|
||||
condition_no_param_t inf;
|
||||
condition_no_param_t neg_inf;
|
||||
condition_no_param_t nan;
|
||||
condition_with_param_t max_below;
|
||||
condition_with_param_t max_above;
|
||||
condition_with_param_t min_below;
|
||||
condition_with_param_t min_above;
|
||||
condition_with_param_t max_minus_min_below;
|
||||
condition_with_param_t max_minus_min_above;
|
||||
condition_with_param_t mean_below;
|
||||
condition_with_param_t mean_above;
|
||||
condition_with_param_t std_dev_below;
|
||||
condition_with_param_t std_dev_above;
|
||||
} conditions_t;
|
||||
|
||||
typedef struct watchpoint {
|
||||
unsigned int id;
|
||||
conditions_t conditions;
|
||||
std::vector<std::tuple<std::string, bool>> check_node_list;
|
||||
} watchpoint_t;
|
||||
|
||||
std::mutex lock_;
|
||||
|
||||
std::unordered_map<unsigned int, watchpoint_t> watchpoint_table;
|
||||
|
||||
TensorLoader *tensor_loader_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,81 @@
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package debugger;
|
||||
|
||||
import "debug_graph.proto";
|
||||
|
||||
// RPC service the debugger client talks to. Every call answers with an EventReply.
service EventListener {
  // Unary: sends Metadata, receives an EventReply.
  rpc WaitCMD(Metadata) returns (EventReply) {}
  // Unary: sends Metadata, receives an EventReply.
  rpc SendMetadata(Metadata) returns (EventReply) {}
  // Unary: sends a GraphProto, receives an EventReply.
  rpc SendGraph(GraphProto) returns (EventReply) {}
  // Client-streaming: sends a stream of TensorProto, receives one EventReply.
  rpc SendTensors(stream TensorProto) returns (EventReply) {}
  // Client-streaming: sends a stream of WatchpointHit, receives one EventReply.
  rpc SendWatchpointHits(stream WatchpointHit) returns (EventReply) {}
}
|
||||
|
||||
// Request payload of WaitCMD and SendMetadata.
message Metadata {
  // Name of the device.
  string device_name = 1;
  // Current step number.
  int32 cur_step = 2;
}
|
||||
|
||||
// Reply returned by every EventListener RPC: a status plus at most one command.
message EventReply {
  enum Status {
    OK = 0;
    FAILED = 1;
    PENDING = 2;
  }

  Status status = 1;

  // At most one of the following commands is set.
  // NOTE(review): field numbers 2-5 coincide with the values of the C++ DebuggerCommand
  // enum (kExitCMD..kViewCMD) — keep them in sync when changing either side.
  oneof cmd {
    bool exit = 2;
    int32 run_cmd = 3;
    SetCMD set_cmd = 4;
    ViewCMD view_cmd = 5;
  }
}
|
||||
|
||||
// Command describing one watchpoint: the nodes it covers, its condition, its id and a
// delete flag.
message SetCMD {
  repeated WatchNode watch_nodes = 1;
  WatchCondition watch_condition = 2;
  // NOTE(review): `delete` is a C++ keyword, so the generated C++ accessor name is
  // presumably adjusted by protoc — confirm against the generated header.
  bool delete = 3;
  int32 id = 4;
}
|
||||
|
||||
// Command carrying a list of tensors.
message ViewCMD {
  repeated TensorProto tensors = 1;
}
|
||||
|
||||
// Condition attached to a watchpoint.
message WatchCondition {
  // Available conditions; the numeric values are part of the wire format.
  enum Condition {
    nan = 0;
    inf = 1;
  }

  Condition condition = 1;
}
|
||||
|
||||
// A node referenced by a watchpoint, given as a name and a type string.
message WatchNode {
  string node_name = 1;
  string node_type = 2;
}
|
||||
|
||||
// One watchpoint hit: the tensor, the watch condition and the watchpoint id.
message WatchpointHit {
  TensorProto tensor = 1;
  WatchCondition watch_condition = 2;
  int32 id = 3;
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,159 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
|
||||
#define MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
|
||||
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "session/kernel_graph.h"
|
||||
#include "debug/debugger/grpc_client.h"
|
||||
#include "debug/debug_services.h"
|
||||
|
||||
using debugger::DataType;
|
||||
using debugger::EventReply;
|
||||
using debugger::GraphProto;
|
||||
using debugger::ModelProto;
|
||||
using debugger::TensorProto;
|
||||
using debugger::WatchCondition;
|
||||
using debugger::WatchNode;
|
||||
using debugger::WatchpointHit;
|
||||
|
||||
template <class T>
|
||||
using ProtoVector = google::protobuf::RepeatedPtrField<T>;
|
||||
|
||||
namespace mindspore {
|
||||
// Kinds of command the debugger can receive.
// The numeric values need to stay in sync with the client-side and server-side proto.
enum class DebuggerCommand {
  kExitCMD = 2,
  kRunCMD = 3,
  kSetCMD = 4,
  kViewCMD = 5,
  kUnknownCMD = -1
};
|
||||
|
||||
class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||
public:
|
||||
static std::shared_ptr<Debugger> GetInstance() {
|
||||
std::lock_guard<std::mutex> i_lock(instance_lock_);
|
||||
if (debugger_ == nullptr) {
|
||||
debugger_ = std::shared_ptr<Debugger>(new (std::nothrow) Debugger());
|
||||
}
|
||||
return debugger_;
|
||||
}
|
||||
|
||||
// deconstructor
|
||||
~Debugger() = default;
|
||||
|
||||
// init
|
||||
// only save device_id
|
||||
void Init(const uint32_t device_id);
|
||||
|
||||
// reset debugger
|
||||
void Reset();
|
||||
|
||||
// enable debugger
|
||||
// send graph and wait for command
|
||||
// do nothing if graph is set already
|
||||
void PreExecute(const KernelGraphPtr &graph_ptr);
|
||||
|
||||
// analyze tensors and wait for command
|
||||
// don't need a graph_ptr because it is saved during pre_execute
|
||||
void PostExecute();
|
||||
|
||||
// suspend the execution after a debug_op
|
||||
void PostDebugOp();
|
||||
|
||||
DebugServices *get_debug_services();
|
||||
|
||||
bool debugger_enabled();
|
||||
|
||||
private:
|
||||
// private constructor for singleton
|
||||
Debugger();
|
||||
|
||||
// enable debugger
|
||||
// instantiate class members
|
||||
// read env variable for grpc client
|
||||
void EnableDebugger();
|
||||
|
||||
// check and save graph pointer
|
||||
void CheckGraphPtr(const KernelGraphPtr &graph_ptr);
|
||||
|
||||
// check if the graph is a dataset graph
|
||||
void CheckDatasetGraph();
|
||||
|
||||
// serialize graph and get proto
|
||||
GraphProto GetGraphProto();
|
||||
|
||||
// send graph and enter command wait loop
|
||||
void SendGraphAndSuspend(const GraphProto &graph_proto);
|
||||
|
||||
// wait for command and process command
|
||||
// send command request and process reply in a loop
|
||||
// break if RunCMD
|
||||
void CommandLoop();
|
||||
|
||||
// process reply and command type
|
||||
DebuggerCommand GetCommand(const EventReply &reply);
|
||||
|
||||
// parse other data out of EventReply
|
||||
ProtoVector<WatchNode> GetWatchnodes(const EventReply &reply);
|
||||
WatchCondition GetWatchcondition(const EventReply &reply);
|
||||
int32_t GetWatchpointID(const EventReply &reply);
|
||||
bool GetWatchpointDelete(const EventReply &reply);
|
||||
ProtoVector<TensorProto> GetTensors(const EventReply &reply);
|
||||
|
||||
// set what nodes and conditions to watch
|
||||
void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id);
|
||||
|
||||
// remove watchpoint with id
|
||||
void RemoveWatchpoint(const int32_t id);
|
||||
|
||||
// load tensor for view command
|
||||
std::list<TensorProto> LoadTensors(const ProtoVector<TensorProto> &tensors);
|
||||
|
||||
// terminate training process
|
||||
void Exit();
|
||||
|
||||
// analyze tensors and check watchpoint conditions
|
||||
// return names of tensors and what condition they hit
|
||||
std::list<WatchpointHit> CheckWatchpoints();
|
||||
|
||||
// send watchpoints that hit and enter command wait loop
|
||||
void SendWatchpointsAndSuspend(const std::list<WatchpointHit> &points);
|
||||
|
||||
// class members
|
||||
std::unique_ptr<GrpcClient> grpc_client_;
|
||||
std::unique_ptr<DebugServices> debug_services_;
|
||||
KernelGraphPtr graph_ptr_;
|
||||
uint32_t device_id_;
|
||||
int32_t num_step_;
|
||||
bool debugger_enabled_;
|
||||
bool is_dataset_graph_;
|
||||
std::mutex access_lock_;
|
||||
|
||||
// singleton
|
||||
static std::mutex instance_lock_;
|
||||
static std::shared_ptr<Debugger> debugger_;
|
||||
};
|
||||
|
||||
using DebuggerPtr = std::shared_ptr<Debugger>;
|
||||
|
||||
// get debugger ModelProto
|
||||
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);
|
||||
ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);
|
||||
|
||||
// for getting proto DataType from Type of Tensor
|
||||
DataType GetDebuggerNumberDataType(const TypePtr &type);
|
||||
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_DEBUGGER_H_
|
@ -0,0 +1,124 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <thread>
|
||||
#include "debug/debugger/grpc_client.h"
|
||||
#include "utils/log_adapter.h"
|
||||
|
||||
using debugger::EventListener;
|
||||
using debugger::EventReply;
|
||||
using debugger::EventReply_Status_FAILED;
|
||||
using debugger::GraphProto;
|
||||
using debugger::Metadata;
|
||||
using debugger::TensorProto;
|
||||
using debugger::WatchpointHit;
|
||||
|
||||
namespace mindspore {
|
||||
// Constructs the client with an empty stub and immediately connects it through Init().
GrpcClient::GrpcClient(const std::string &host, const std::string &port) : stub_(nullptr) {
  Init(host, port);
}
|
||||
|
||||
void GrpcClient::Init(const std::string &host, const std::string &port) {
|
||||
std::string target_str = host + ":" + port;
|
||||
MS_LOG(INFO) << "GrpcClient connecting to: " << target_str;
|
||||
|
||||
std::shared_ptr<grpc::Channel> channel = grpc::CreateChannel(target_str, grpc::InsecureChannelCredentials());
|
||||
stub_ = EventListener::NewStub(channel);
|
||||
}
|
||||
|
||||
// Drops the current stub; stub_ is null afterwards until Init() is called again.
void GrpcClient::Reset() {
  stub_.reset();
}
|
||||
|
||||
// Issues the WaitCMD RPC with `metadata` and returns the server's reply.
// When the stub has been dropped by Reset(), or the RPC fails, the error is logged and
// the returned reply has its status set to FAILED.
EventReply GrpcClient::WaitForCommand(const Metadata &metadata) {
  EventReply reply;
  // Reset() leaves stub_ null; report a failed reply instead of dereferencing it.
  if (stub_ == nullptr) {
    MS_LOG(ERROR) << "RPC failed: WaitForCommand called without an initialized stub";
    reply.set_status(EventReply_Status_FAILED);
    return reply;
  }

  grpc::ClientContext context;
  grpc::Status status = stub_->WaitCMD(&context, metadata, &reply);

  if (!status.ok()) {
    MS_LOG(ERROR) << "RPC failed: WaitForCommand";
    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
    reply.set_status(EventReply_Status_FAILED);
  }
  return reply;
}
|
||||
|
||||
// Issues the SendMetadata RPC with `metadata` and returns the server's reply.
// When the stub has been dropped by Reset(), or the RPC fails, the error is logged and
// the returned reply has its status set to FAILED.
EventReply GrpcClient::SendMetadata(const Metadata &metadata) {
  EventReply reply;
  // Reset() leaves stub_ null; report a failed reply instead of dereferencing it.
  if (stub_ == nullptr) {
    MS_LOG(ERROR) << "RPC failed: SendMetadata called without an initialized stub";
    reply.set_status(EventReply_Status_FAILED);
    return reply;
  }

  grpc::ClientContext context;
  grpc::Status status = stub_->SendMetadata(&context, metadata, &reply);

  if (!status.ok()) {
    MS_LOG(ERROR) << "RPC failed: SendMetadata";
    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
    reply.set_status(EventReply_Status_FAILED);
  }
  return reply;
}
|
||||
|
||||
// Issues the SendGraph RPC with `graph` and returns the server's reply.
// When the stub has been dropped by Reset(), or the RPC fails, the error is logged and
// the returned reply has its status set to FAILED.
EventReply GrpcClient::SendGraph(const GraphProto &graph) {
  EventReply reply;
  // Reset() leaves stub_ null; report a failed reply instead of dereferencing it.
  if (stub_ == nullptr) {
    MS_LOG(ERROR) << "RPC failed: SendGraph called without an initialized stub";
    reply.set_status(EventReply_Status_FAILED);
    return reply;
  }

  grpc::ClientContext context;
  grpc::Status status = stub_->SendGraph(&context, graph, &reply);

  if (!status.ok()) {
    MS_LOG(ERROR) << "RPC failed: SendGraph";
    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
    reply.set_status(EventReply_Status_FAILED);
  }
  return reply;
}
|
||||
|
||||
// Streams every tensor in `tensors` through the SendTensors RPC and returns the reply.
// Writing stops at the first failed Write(); the stream is then closed and finished so
// the final status can be read. When the stub has been dropped by Reset(), or the RPC
// finishes with an error, the error is logged and the reply status is set to FAILED.
EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) {
  EventReply reply;
  // Reset() leaves stub_ null; report a failed reply instead of dereferencing it.
  if (stub_ == nullptr) {
    MS_LOG(ERROR) << "RPC failed: SendTensors called without an initialized stub";
    reply.set_status(EventReply_Status_FAILED);
    return reply;
  }

  grpc::ClientContext context;

  std::unique_ptr<grpc::ClientWriter<TensorProto> > writer(stub_->SendTensors(&context, &reply));
  for (const auto &tensor : tensors) {
    if (!writer->Write(tensor)) {
      break;
    }
    // Short pause between consecutive writes, kept from the original implementation.
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }
  writer->WritesDone();
  grpc::Status status = writer->Finish();

  if (!status.ok()) {
    MS_LOG(ERROR) << "RPC failed: SendTensors";
    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
    reply.set_status(EventReply_Status_FAILED);
  }
  return reply;
}
|
||||
|
||||
// Streams every hit in `watchpoints` through the SendWatchpointHits RPC and returns the
// reply. Writing stops at the first failed Write(); the stream is then closed and
// finished so the final status can be read. When the stub has been dropped by Reset(),
// or the RPC finishes with an error, the error is logged and the reply status is set to
// FAILED.
EventReply GrpcClient::SendWatchpointHits(const std::list<WatchpointHit> &watchpoints) {
  EventReply reply;
  // Reset() leaves stub_ null; report a failed reply instead of dereferencing it.
  if (stub_ == nullptr) {
    MS_LOG(ERROR) << "RPC failed: SendWatchpointHits called without an initialized stub";
    reply.set_status(EventReply_Status_FAILED);
    return reply;
  }

  grpc::ClientContext context;

  std::unique_ptr<grpc::ClientWriter<WatchpointHit> > writer(stub_->SendWatchpointHits(&context, &reply));
  for (const auto &watchpoint : watchpoints) {
    if (!writer->Write(watchpoint)) {
      break;
    }
    // Short pause between consecutive writes, kept from the original implementation.
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }
  writer->WritesDone();
  grpc::Status status = writer->Finish();

  if (!status.ok()) {
    MS_LOG(ERROR) << "RPC failed: SendWatchpointHits";
    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
    reply.set_status(EventReply_Status_FAILED);
  }
  return reply;
}
|
||||
} // namespace mindspore
|
@ -0,0 +1,61 @@
|
||||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUGGER_GRPC_CLIENT_H_
|
||||
#define MINDSPORE_CCSRC_DEBUG_DEBUGGER_GRPC_CLIENT_H_
|
||||
|
||||
#include <grpcpp/grpcpp.h>
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include "proto/debug_grpc.grpc.pb.h"
|
||||
|
||||
using debugger::EventListener;
|
||||
using debugger::EventReply;
|
||||
using debugger::GraphProto;
|
||||
using debugger::Metadata;
|
||||
using debugger::TensorProto;
|
||||
using debugger::WatchpointHit;
|
||||
|
||||
namespace mindspore {
|
||||
class GrpcClient {
|
||||
public:
|
||||
// constructor
|
||||
GrpcClient(const std::string &host, const std::string &port);
|
||||
|
||||
// deconstructor
|
||||
~GrpcClient() = default;
|
||||
|
||||
// init
|
||||
void Init(const std::string &host, const std::string &port);
|
||||
|
||||
// reset
|
||||
void Reset();
|
||||
|
||||
EventReply WaitForCommand(const Metadata &metadata);
|
||||
|
||||
EventReply SendMetadata(const Metadata &metadata);
|
||||
|
||||
EventReply SendGraph(const GraphProto &graph);
|
||||
|
||||
EventReply SendTensors(const std::list<TensorProto> &tensors);
|
||||
|
||||
EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);
|
||||
|
||||
private:
|
||||
std::unique_ptr<EventListener::Stub> stub_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_DEBUGGER_GRPC_CLIENT_H_
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,75 @@
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
|
||||
#define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include "ir/tensor.h"
|
||||
|
||||
namespace mindspore {
|
||||
class TensorData {
|
||||
private:
|
||||
mindspore::tensor::TensorPtr tensor_ptr;
|
||||
std::string name;
|
||||
size_t slot;
|
||||
int execution_order;
|
||||
|
||||
public:
|
||||
TensorData() : slot(0), execution_order(-1) {}
|
||||
|
||||
TensorData(const TensorData &obj) {
|
||||
std::cout << "Copy Constructor" << std::endl;
|
||||
this->name = obj.name;
|
||||
this->execution_order = obj.execution_order;
|
||||
this->slot = obj.slot;
|
||||
this->tensor_ptr = obj.tensor_ptr;
|
||||
}
|
||||
|
||||
~TensorData() {}
|
||||
|
||||
std::string GetName() { return this->name; }
|
||||
|
||||
mindspore::tensor::TensorPtr GetTensor() { return this->tensor_ptr; }
|
||||
|
||||
size_t GetSlot() { return this->slot; }
|
||||
|
||||
int GetExecutionOrder() { return this->execution_order; }
|
||||
|
||||
int SetExecutionOrder(int execution_order) {
|
||||
this->execution_order = execution_order;
|
||||
return true;
|
||||
}
|
||||
|
||||
int SetName(const std::string &name) {
|
||||
this->name = name;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SetTensor(mindspore::tensor::TensorPtr out_tensor) {
|
||||
this->tensor_ptr = out_tensor;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SetSlot(size_t slot) {
|
||||
this->slot = slot;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
|
@ -0,0 +1,69 @@
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
|
||||
#define MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <tuple>
|
||||
#include <string>
|
||||
#include "debug/tensor_data.h"
|
||||
namespace mindspore {
|
||||
class TensorLoader {
|
||||
public:
|
||||
TensorLoader() : iter_num(-1) {}
|
||||
|
||||
~TensorLoader() {}
|
||||
|
||||
bool LoadNewTensor(std::shared_ptr<TensorData> tensor) {
|
||||
tensor_list.push_back(tensor);
|
||||
tensor_list_map.insert({tensor->GetName(), tensor});
|
||||
return true;
|
||||
}
|
||||
std::vector<std::shared_ptr<TensorData>> GetTensor() { return tensor_list; }
|
||||
|
||||
uint32_t GetIterNum() { return iter_num; }
|
||||
|
||||
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map; }
|
||||
void SearchTensors(const std::vector<std::string> &search_list,
|
||||
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
|
||||
for (auto i : search_list) {
|
||||
std::map<std::string, std::shared_ptr<TensorData>>::iterator iter;
|
||||
iter = tensor_list_map.find(i);
|
||||
if (iter != tensor_list_map.end()) {
|
||||
result_list->push_back(std::make_tuple(i, iter->second));
|
||||
} else {
|
||||
result_list->push_back(std::make_tuple(i, nullptr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool EmptyTensor() {
|
||||
tensor_list_map.clear();
|
||||
tensor_list.clear();
|
||||
return true;
|
||||
}
|
||||
|
||||
void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<TensorData>> tensor_list;
|
||||
std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
|
||||
uint32_t iter_num;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_DEBUG_TENSOR_LOAD_H_
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue