GPU debugger - milestone 1 and GPU dump

Additional Authors: Adel Shafiei, Harshvardhan Gupta
Branch: pull/3007/head
Author: John Tzanakakis (5 years ago)
Parent: 875bdc2ebc
Commit: b3c0eb61d5

@@ -279,6 +279,9 @@ checkopts()
done
}
checkopts "$@"
if [[ "X$ENABLE_GPU" = "Xon" ]] && [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
ENABLE_DEBUGGER="on"
fi
echo "---------------- MindSpore: build start ----------------"
mkdir -pv "${BUILD_PATH}/package/mindspore/lib"
git submodule update --init graphengine

@@ -37,6 +37,7 @@
#include "common/trans.h"
#include "utils/context/ms_context.h"
#include "utils/base_ref_extends.h"
#include "debug/tensor_load.h"
namespace mindspore {
namespace session {
@@ -164,7 +165,11 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
#ifdef ENABLE_DEBUGGER
if (!runtime_instance->Run(kernel_graph.get(), debugger_.get())) {
#else
if (!runtime_instance->Run(kernel_graph.get())) {
#endif
MS_LOG(EXCEPTION) << "GPU execute graph failed!";
}
}
@@ -229,6 +234,9 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
#ifdef ENABLE_DEBUGGER
PreIterationDbg(kernel_graph);
#endif
// Load input data from user input
LoadInputData(kernel_graph, inputs);
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
@@ -245,6 +253,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
// Run graph on GPU
Execute(kernel_graph);
}
#ifdef ENABLE_DEBUGGER
PostLoadTensor(kernel_graph);
#endif
// Get result from GPU
UpdateOutputs(kernel_graph, outputs, inputs);
// Summary
@@ -253,6 +264,9 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
if (context_ptr->enable_gpu_summary()) {
Summary(kernel_graph.get());
}
#ifdef ENABLE_DEBUGGER
PostIterationDbg(kernel_graph);
#endif
}
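Taken together, the RunGraph hunks above bracket every iteration with debugger hooks: PreIterationDbg before input loading, PostLoadTensor right after Execute, and PostIterationDbg after summaries, each compiled out when ENABLE_DEBUGGER is off. The standalone sketch below restates that ordering; Session, Debugger, and Graph here are illustrative stand-ins, not MindSpore's classes.

```cpp
// Minimal sketch of the per-iteration debug hook ordering added above.
// Session, Debugger and Graph are stand-in types for illustration only.
#include <iostream>
#include <memory>

#define ENABLE_DEBUGGER  // comment out to compile the hooks away

struct Graph {};

struct Debugger {
  void PreExecute(const Graph &) { std::cout << "debugger: pre-execute\n"; }
  void PostExecute() { std::cout << "debugger: post-execute\n"; }
};

class Session {
 public:
  void RunGraph(const Graph &g) {
#ifdef ENABLE_DEBUGGER
    PreIterationDbg(g);   // suspend for the debugger, reset the tensor loader
#endif
    std::cout << "load inputs\n";
    std::cout << "execute graph\n";
#ifdef ENABLE_DEBUGGER
    PostLoadTensor(g);    // drop tensors kept from the previous iteration
#endif
    std::cout << "update outputs / summary\n";
#ifdef ENABLE_DEBUGGER
    PostIterationDbg(g);  // dump (if enabled for this step) and notify the debugger
#endif
  }

 private:
#ifdef ENABLE_DEBUGGER
  void PreIterationDbg(const Graph &g) { if (debugger_) debugger_->PreExecute(g); }
  void PostLoadTensor(const Graph &) { std::cout << "tensor loader: clear prev tensors\n"; }
  void PostIterationDbg(const Graph &) { if (debugger_) debugger_->PostExecute(); }
  std::shared_ptr<Debugger> debugger_ = std::make_shared<Debugger>();
#endif
};

int main() {
  Session s;
  Graph g;
  s.RunGraph(g);
  return 0;
}
```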
void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
@@ -296,6 +310,70 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
RunOpClearMemory(kernel_graph.get());
return tuple_tensors;
}
#ifdef ENABLE_DEBUGGER
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
#ifdef ENABLE_DUMP_E2E
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
(void)runtime_instance->DumpData(kernel_graph.get(), debugger_.get());
#endif
}
bool GPUSession::DumpDataEnabledIteration() const {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
return runtime_instance->DumpDataEnabledIteration();
}
void GPUSession::PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
if (debugger_) {
debugger_->PreExecute(kernel_graph);
}
PreLoadTensor(kernel_graph);
}
void GPUSession::PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
// debug used for dump
if (debugger_ && dump_enabled) {
Dump(kernel_graph);
}
if (debugger_) {
debugger_->PostExecute();
}
}
void GPUSession::PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DebugServices *debug_services = debugger_->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
tensor_loader->EmptyTensor();
uint32_t iter_num = tensor_loader->GetIterNum();
tensor_loader->set_iter_num(++iter_num);
}
void GPUSession::PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
bool dump_enabled = DumpDataEnabledIteration();
if (!(debugger_ && (debugger_->debugger_enabled() || dump_enabled))) {
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
DebugServices *debug_services = debugger_->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
tensor_loader->EmptyPrevTensor();
}
#endif
} // namespace gpu
} // namespace session
} // namespace mindspore
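PreLoadTensor and PostLoadTensor above drive a per-iteration lifecycle on the debugger's TensorLoader: before execution the current tensor list is emptied and the iteration counter advances, after execution the tensors retained from the previous iteration are released. The following mock loader is purely illustrative (it borrows the method names used above but is not the real TensorLoader) and only shows that bookkeeping.

```cpp
// Illustrative mock of the TensorLoader bookkeeping exercised by
// PreLoadTensor/PostLoadTensor above; TensorData is a stand-in type.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct TensorData { std::string name; };

class MockTensorLoader {
 public:
  void EmptyTensor() { current_.clear(); }       // start of iteration
  void EmptyPrevTensor() { previous_.clear(); }  // end of iteration
  std::uint32_t GetIterNum() const { return iter_num_; }
  void set_iter_num(std::uint32_t n) { iter_num_ = n; }
  void LoadNewTensor(const std::shared_ptr<TensorData> &t, bool keep_prev) {
    if (keep_prev) {
      // Illustrative: keep whatever was recorded under this name before.
      auto it = current_.find(t->name);
      if (it != current_.end()) previous_[t->name] = it->second;
    }
    current_[t->name] = t;
  }
  std::size_t current_size() const { return current_.size(); }

 private:
  std::map<std::string, std::shared_ptr<TensorData>> current_;
  std::map<std::string, std::shared_ptr<TensorData>> previous_;
  std::uint32_t iter_num_ = 0;
};

int main() {
  MockTensorLoader loader;
  // PreLoadTensor: clear current tensors and advance the iteration counter.
  loader.EmptyTensor();
  loader.set_iter_num(loader.GetIterNum() + 1);
  // Kernels run and LoadMemToHost-style calls register tensors.
  loader.LoadNewTensor(std::make_shared<TensorData>(TensorData{"conv1.output:0"}), false);
  // PostLoadTensor: drop tensors kept from the previous iteration.
  loader.EmptyPrevTensor();
  std::cout << "iteration " << loader.GetIterNum()
            << ", tensors loaded: " << loader.current_size() << "\n";
  return 0;
}
```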

@@ -67,6 +67,20 @@ class GPUSession : public SessionBasic {
const std::vector<tensor::TensorPtr> &inputs_const) const override;
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const;
#ifdef ENABLE_DEBUGGER
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
bool DumpDataEnabledIteration() const;
void PreIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void PostIterationDbg(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void PreLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
void PostLoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
#endif
};
using GPUSessionPtr = std::shared_ptr<GPUSession>;
MS_REG_SESSION(kGPUDevice, GPUSession);

@@ -24,7 +24,6 @@
#include "backend/kernel_compiler/common_utils.h"
#include "frontend/operator/ops.h"
#include "common/trans.h"
#include "utils/context/ms_context.h"
#include "utils/config_manager.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/oplib/oplib.h"

@@ -32,6 +32,7 @@
#include "utils/contract.h"
#include "pipeline/pynative/pynative_execute.h"
#include "runtime/device/kernel_info.h"
#include "utils/context/ms_context.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
@@ -112,7 +113,9 @@ class SessionBasic {
// set debugger
void SetDebugger() {
debugger_ = Debugger::GetInstance();
debugger_->Init(device_id_);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
debugger_->Init(device_id_, ms_context->device_target());
}
#endif

@@ -16,6 +16,7 @@ if (ENABLE_DEBUGGER)
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/grpc_client.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debugger/proto_exporter.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
endif (ENABLE_DEBUGGER)

@@ -21,6 +21,7 @@
#include "debug/debugger/debugger.h"
#include "pipeline/jit/pipeline.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "runtime/device/kernel_runtime_manager.h"
using debugger::EventReply;
using debugger::GraphProto;
@@ -41,17 +42,20 @@ Debugger::Debugger()
: grpc_client_(nullptr),
debug_services_(nullptr),
device_id_(0),
device_target_(""),
num_step_(0),
debugger_enabled_(false),
is_dataset_graph_(false),
partial_memory_(false) {}
void Debugger::Init(const uint32_t device_id) {
void Debugger::Init(const uint32_t device_id, const std::string device_target) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// save device_id
MS_LOG(INFO) << "Debugger got device_id: " << device_id;
device_id_ = device_id;
MS_LOG(INFO) << "Debugger got device_target: " << device_target;
device_target_ = device_target;
}
void Debugger::EnableDebugger() {
@@ -62,6 +66,14 @@ void Debugger::EnableDebugger() {
grpc_client_ = nullptr;
debug_services_ = nullptr;
// see if dump is enabled
bool dump_enabled = false;
if (device_target_ == kGPUDevice) {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
MS_EXCEPTION_IF_NULL(runtime_instance);
dump_enabled = runtime_instance->DumpDataEnabled();
}
// get env variables to configure debugger
const char *env_enable_str = std::getenv("ENABLE_MS_DEBUGGER");
if (env_enable_str != nullptr) {
@@ -70,7 +82,8 @@ void Debugger::EnableDebugger() {
debugger_enabled_ = true;
}
}
if (!debugger_enabled_) {
if (!debugger_enabled_ && !dump_enabled) {
MS_LOG(WARNING) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
return;
}
@@ -118,7 +131,10 @@ void Debugger::EnableDebugger() {
}
// initialize grpc client
grpc_client_ = std::make_unique<GrpcClient>(host, port);
if (debugger_enabled_) {
grpc_client_ = std::make_unique<GrpcClient>(host, port);
}
debug_services_ = std::make_unique<DebugServices>();
}
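With this change EnableDebugger brings the debug services up in two situations: the ENABLE_MS_DEBUGGER=1 environment variable (interactive debugging, gRPC client created) or, on GPU, an active e2e dump configuration (dump-only, no gRPC client). A compilable sketch of that decision follows; DecideEnable and dump_data_enabled_on_gpu are hypothetical names standing in for the runtime's DumpDataEnabled() check.

```cpp
// Sketch of the enable decision introduced above. dump_data_enabled_on_gpu
// stands in for the GPU KernelRuntime's dump check; it is not a real API here.
#include <cstdlib>
#include <cstring>
#include <iostream>

struct DebuggerState {
  bool debugger_enabled = false;  // interactive debugger: the gRPC client is created
  bool services_enabled = false;  // DebugServices/TensorLoader are created
};

DebuggerState DecideEnable(bool dump_data_enabled_on_gpu) {
  DebuggerState state;
  const char *env = std::getenv("ENABLE_MS_DEBUGGER");
  if (env != nullptr && std::strcmp(env, "1") == 0) {
    state.debugger_enabled = true;
  }
  // Debug services are needed either for the interactive debugger or for GPU dump.
  state.services_enabled = state.debugger_enabled || dump_data_enabled_on_gpu;
  return state;
}

int main() {
  DebuggerState s = DecideEnable(/*dump_data_enabled_on_gpu=*/true);
  std::cout << "debugger: " << s.debugger_enabled
            << ", services: " << s.services_enabled << "\n";
  return 0;
}
```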
@@ -127,6 +143,7 @@ void Debugger::Reset() {
std::lock_guard<std::mutex> a_lock(access_lock_);
// reset components
device_id_ = 0;
device_target_ = "";
num_step_ = 0;
debugger_enabled_ = false;
is_dataset_graph_ = false;

@@ -55,7 +55,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// init
// only save device_id
void Init(const uint32_t device_id);
void Init(const uint32_t device_id, const std::string device_target);
// reset debugger
void Reset();
@@ -128,6 +128,7 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::unique_ptr<DebugServices> debug_services_;
KernelGraphPtr graph_ptr_;
uint32_t device_id_;
std::string device_target_;
int32_t num_step_;
bool debugger_enabled_;
bool is_dataset_graph_;

@@ -24,6 +24,10 @@
#include <string>
#include <utility>
#include "debug/tensor_data.h"
#include "ir/dtype.h"
#ifdef ENABLE_DUMP_E2E
#include "debug/e2e_dump.h"
#endif
namespace mindspore {
class TensorLoader {
public:
@@ -72,8 +76,54 @@ class TensorLoader {
void EmptyPrevTensor() { prev_tensor_list_map.clear(); }
void EmptyCurrentTensor() {
tensor_list_map.clear();
tensor_list.clear();
}
void set_iter_num(uint32_t iter_num) { this->iter_num = iter_num; }
#ifdef ENABLE_DUMP_E2E
bool DumpTensorToFile(std::string tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int> &host_shape, TypeId host_type,
TypeId addr_type_id, std::string addr_format, size_t slot) const {
bool ret = false;
if (filepath.empty()) {
MS_LOG(ERROR) << "Dump file path is null!";
return ret;
}
std::string shape = "shape";
if (host_shape.size()) {
for (auto &value : host_shape) {
shape = shape + '_' + std::to_string(value);
}
} else {
shape = shape + "_0";
}
std::string file_extension = ".bin";
std::string path = "";
if (trans_flag) {
path = filepath + '_' + shape + '_' + TypeIdLabel(host_type) + '_' + host_fmt + file_extension;
} else {
path = filepath + '_' + shape + '_' + TypeIdToType(addr_type_id)->ToString() + '_' + addr_format + file_extension;
}
MS_LOG(INFO) << "Dump path is " << path;
std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
auto iter = tensor_list_map.find(tensor_loader_name);
if (iter != tensor_list_map.end()) {
std::shared_ptr<TensorData> node = iter->second;
mindspore::tensor::TensorPtr out_tensor = node->GetTensor();
size_t host_size = out_tensor->data().nbytes();
ret = mindspore::Dump::DumpToFile(path, out_tensor->data_c(), host_size);
}
return ret;
}
#endif
private:
std::vector<std::shared_ptr<TensorData>> tensor_list;
std::map<std::string, std::shared_ptr<TensorData>> tensor_list_map;
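DumpTensorToFile builds the output path from the caller-supplied prefix plus shape, type, and format: filepath + "_shape_<dims>" + "_<type>" + "_<format>" + ".bin", with "shape_0" used when the shape is empty. The helper below is hypothetical (not part of TensorLoader) and just reproduces that naming rule for the trans_flag branch with an illustrative type label.

```cpp
// Hypothetical helper mirroring the path-building rule of DumpTensorToFile above.
#include <iostream>
#include <string>
#include <vector>

std::string BuildDumpPath(const std::string &filepath, const std::vector<int> &host_shape,
                          const std::string &type_label, const std::string &host_fmt) {
  std::string shape = "shape";
  if (!host_shape.empty()) {
    for (int value : host_shape) {
      shape += '_' + std::to_string(value);
    }
  } else {
    shape += "_0";  // empty shape falls back to "shape_0"
  }
  return filepath + '_' + shape + '_' + type_label + '_' + host_fmt + ".bin";
}

int main() {
  // Prints: /tmp/dump/conv1.output.0_shape_32_3_224_224_Float32_NCHW.bin
  std::cout << BuildDumpPath("/tmp/dump/conv1.output.0", {32, 3, 224, 224}, "Float32", "NCHW")
            << "\n";
  return 0;
}
```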

@@ -275,7 +275,7 @@ void DumpParameters(mindspore::session::KernelGraph *graph, const string &dump_p
} // namespace
#endif
bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
bool AscendKernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
#ifdef ENABLE_DUMP_E2E
MS_LOG(INFO) << "Start dump step";

@@ -38,7 +38,7 @@ class AscendKernelRuntime : public KernelRuntime {
AscendKernelRuntime() = default;
~AscendKernelRuntime() override;
bool Init() override;
bool DumpData(session::KernelGraph *graph) override;
bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
bool LoadData(session::KernelGraph *graph, Debugger *debugger) override;
bool GenTask(const session::KernelGraph *graph) override;
bool RunTask(const session::KernelGraph *graph) override;

@@ -270,7 +270,7 @@ void CPUKernelRuntime::DecreaseSummaryRefCount(const session::NamedSummaryOutput
resource_manager_.DecreaseSummaryRefCount(summary_outputs);
}
bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, Debugger *debugger) {
MS_EXCEPTION_IF_NULL(kernel_graph);
resource_manager_.IncreaseAddressRefCount(kernel_graph);

@@ -36,7 +36,7 @@ class CPUKernelRuntime : public KernelRuntime {
~CPUKernelRuntime() override = default;
bool Init() override { return true; }
bool Run(session::KernelGraph *graph) override;
bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
void AssignKernelAddress(session::KernelGraph *kernel_graph);
void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs, std::vector<tensor::TensorPtr> *need_sync_outputs);

@@ -16,9 +16,16 @@
#include "runtime/device/gpu/gpu_device_address.h"
#include <vector>
#include <memory>
#include "runtime/device/gpu/gpu_device_manager.h"
#include "utils/log_adapter.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "ir/tensor.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debug_services.h"
#include "debug/tensor_load.h"
#include "debug/debugger/debugger.h"
#endif
namespace mindspore {
namespace device {
@@ -59,6 +66,36 @@ GPUDeviceAddress::~GPUDeviceAddress() {
ptr_ = nullptr;
}
}
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const std::vector<int> &host_shape, TypeId host_type, size_t slot,
Debugger *debugger, bool keep_prev) const {
bool ret = false;
if (size_ == 0) {
return true;
}
DebugServices *debug_services = debugger->debug_services();
TensorLoader *tensor_loader = debug_services->tensor_loader();
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(type_id_, host_shape);
size_t host_size = out_tensor->data().nbytes();
auto ret_rt_memcpy = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
if (!ret_rt_memcpy) {
MS_LOG(ERROR) << "Copy device mem to host failed";
return ret;
}
auto tensor_data = std::make_shared<mindspore::TensorData>();
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetTensor(out_tensor);
tensor_data->SetSlot(slot);
ret = tensor_loader->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
return ret;
}
#endif
} // namespace gpu
} // namespace device
} // namespace mindspore
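LoadMemToHost above follows a simple pattern: allocate a host tensor of the recorded shape and type, synchronize device memory into it, wrap it with its name, slot, and execution order, and hand it to the tensor loader. The sketch below restates that flow with stub types; a plain memcpy stands in for SyncDeviceToHost, and none of the types are MindSpore's.

```cpp
// Stub-typed sketch of the LoadMemToHost flow above: device buffer -> host
// tensor -> tensor record -> loader. memcpy stands in for the real
// SyncDeviceToHost / CUDA copy path.
#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct HostTensor {
  std::vector<float> data;
};

struct TensorRecord {
  std::string name;
  int execution_order;
  std::size_t slot;
  std::shared_ptr<HostTensor> tensor;
};

struct Loader {
  std::vector<TensorRecord> records;
  bool LoadNewTensor(const TensorRecord &r, bool /*keep_prev*/) {
    records.push_back(r);
    return true;
  }
};

bool LoadMemToHost(const float *device_ptr, std::size_t elem_count, const std::string &tensor_name,
                   int execution_order, std::size_t slot, Loader *loader, bool keep_prev) {
  if (elem_count == 0) return true;  // nothing to copy, mirrors the size_ == 0 early return
  auto out = std::make_shared<HostTensor>();
  out->data.resize(elem_count);
  // Real code: SyncDeviceToHost(...); here the "device" buffer is ordinary host memory.
  std::memcpy(out->data.data(), device_ptr, elem_count * sizeof(float));
  TensorRecord record{tensor_name, execution_order, slot, out};
  return loader->LoadNewTensor(record, keep_prev);
}

int main() {
  std::vector<float> fake_device(8, 1.5f);
  Loader loader;
  bool ok = LoadMemToHost(fake_device.data(), fake_device.size(), "conv1.output:0",
                          /*execution_order=*/3, /*slot=*/0, &loader, /*keep_prev=*/false);
  std::cout << "loaded=" << ok << " records=" << loader.records.size() << "\n";
  return 0;
}
```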

@@ -22,6 +22,9 @@
#include "runtime/device/device_address.h"
namespace mindspore {
#ifdef ENABLE_DEBUGGER
class Debugger;
#endif
namespace device {
namespace gpu {
class GPUDeviceAddress : public DeviceAddress {
@@ -37,6 +40,11 @@ class GPUDeviceAddress : public DeviceAddress {
DeviceAddressStatus status() const { return status_; }
DeviceAddressType DeviceType() const override { return DeviceAddressType::kGPU; }
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const std::vector<int> &host_shape, TypeId host_type, size_t slot, Debugger *debugger,
bool keep_prev) const;
#endif
private:
DeviceAddressStatus status_{DeviceAddressStatus::kInDevice};
};

File diff suppressed because it is too large.

@@ -38,7 +38,10 @@ class GPUKernelRuntime : public KernelRuntime {
bool Init() override;
void ReleaseDeviceRes() override;
void AssignMemory(session::KernelGraph *graph) override;
bool Run(session::KernelGraph *graph) override;
bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#ifdef ENABLE_DUMP_E2E
bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr) override;
#endif
protected:
DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -61,10 +64,11 @@ class GPUKernelRuntime : public KernelRuntime {
void ClearKernelOutputAddress(const session::KernelGraph *graph);
void ClearKernelWorkspaceAddress(const session::KernelGraph *graph);
void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph);
bool RunOneStep(const session::KernelGraph *graph);
bool SearchMemSwapScheme(const session::KernelGraph *graph);
bool RefineMemSwapScheme(const session::KernelGraph *graph);
bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false);
bool RunOneStep(const session::KernelGraph *graph, Debugger *debugger = nullptr);
bool SearchMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
bool RefineMemSwapScheme(const session::KernelGraph *graph, Debugger *debugger = nullptr);
bool LaunchKernelDynamic(const session::KernelGraph *graph, Debugger *debugger = nullptr, bool mock = false,
bool profiling = false);
void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs,
const AddressPtrList &workspace, const AddressPtrList &outputs);
bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock);

@@ -41,7 +41,7 @@ KernelRuntime::~KernelRuntime() {
#endif
}
bool KernelRuntime::Run(session::KernelGraph *graph) {
bool KernelRuntime::Run(session::KernelGraph *graph, Debugger *debugger) {
bool ret = false;
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
@@ -72,7 +72,7 @@ bool KernelRuntime::Run(session::KernelGraph *graph) {
}
// for D to impl
bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph) {
bool KernelRuntime::DumpData(mindspore::session::KernelGraph *graph, Debugger *debugger) {
if (graph != nullptr) {
return true;
}
@@ -190,6 +190,39 @@ void KernelRuntime::RunOpClearMemory(const session::KernelGraph *graph) {
}
}
bool KernelRuntime::DumpDataEnabled() {
bool ret = false;
#ifdef ENABLE_DUMP_E2E
DumpConfPtr dump_conf = GetDumpConf();
MS_EXCEPTION_IF_NULL(dump_conf);
bool dump_flag = dump_conf->dump_enable();
if (!dump_flag) {
return ret;
}
ret = true;
#endif
return ret;
}
bool KernelRuntime::DumpDataEnabledIteration() {
bool ret = false;
#ifdef ENABLE_DUMP_E2E
if (!DumpDataEnabled()) {
return ret;
}
DumpConfPtr dump_conf = GetDumpConf();
MS_EXCEPTION_IF_NULL(dump_conf);
uint32_t cur_iter = dump_conf->cur_iter() + 1;
if (dump_conf->dump_iter() != 0) {
if (cur_iter != dump_conf->dump_iter()) {
return ret;
}
}
ret = true;
#endif
return ret;
}
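DumpDataEnabledIteration gates dumping per step: with dump enabled, dump_iter == 0 means every iteration dumps, otherwise only the step whose cur_iter() + 1 equals dump_iter. DumpThisIteration below is a hypothetical standalone restatement of that check with a couple of worked cases.

```cpp
// Standalone restatement of the iteration gate in DumpDataEnabledIteration above.
#include <cstdint>
#include <iostream>

bool DumpThisIteration(bool dump_enabled, std::uint32_t cur_iter, std::uint32_t dump_iter) {
  if (!dump_enabled) return false;
  std::uint32_t next_iter = cur_iter + 1;  // mirrors cur_iter() + 1 in the code above
  if (dump_iter != 0 && next_iter != dump_iter) return false;  // only the configured step
  return true;  // dump_iter == 0 dumps every step
}

int main() {
  std::cout << DumpThisIteration(true, 4, 5) << "\n";  // 1: this is the configured iteration
  std::cout << DumpThisIteration(true, 4, 7) << "\n";  // 0: not the configured iteration
  std::cout << DumpThisIteration(true, 4, 0) << "\n";  // 1: dump_iter == 0 -> every iteration
  return 0;
}
```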
void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
AssignStaticMemoryInput(graph);
AssignStaticMemoryValueNode(graph);

@@ -55,8 +55,10 @@ class KernelRuntime {
virtual void AssignMemory(session::KernelGraph *graph);
void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph);
void RunOpClearMemory(const session::KernelGraph *graph);
virtual bool Run(session::KernelGraph *graph);
virtual bool DumpData(session::KernelGraph *graph);
bool DumpDataEnabled();
bool DumpDataEnabledIteration();
virtual bool Run(session::KernelGraph *graph, Debugger *debugger = nullptr);
virtual bool DumpData(session::KernelGraph *graph, Debugger *debugger = nullptr);
virtual bool LoadData(session::KernelGraph *graph, Debugger *debugger);
virtual bool RunTask(const session::KernelGraph *graph);
virtual bool GenTask(const session::KernelGraph *graph);
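Run and DumpData now carry a trailing `Debugger *debugger = nullptr`, so existing call sites that pass only the graph keep compiling while GPUSession can thread its debugger through. One C++ subtlety with this pattern: default arguments are bound by the static type of the expression, not the dynamic type, which is why the diff repeats the same nullptr default on both the base declarations and the overrides. A minimal, self-contained illustration (toy versions of the classes, not MindSpore's):

```cpp
// Default arguments on virtual functions come from the static type of the
// caller's expression; repeating nullptr on base and override, as the diff
// above does, keeps behaviour consistent either way.
#include <iostream>

struct Debugger {};

struct KernelRuntime {
  virtual ~KernelRuntime() = default;
  virtual bool Run(int graph_id, Debugger *debugger = nullptr) {
    std::cout << "base Run, graph " << graph_id
              << ", debugger=" << (debugger ? "set" : "null") << "\n";
    return true;
  }
};

struct GPUKernelRuntime : KernelRuntime {
  bool Run(int graph_id, Debugger *debugger = nullptr) override {
    std::cout << "gpu Run, graph " << graph_id
              << ", debugger=" << (debugger ? "set" : "null") << "\n";
    return true;
  }
};

int main() {
  Debugger dbg;
  GPUKernelRuntime gpu;
  KernelRuntime *base = &gpu;
  base->Run(0);      // virtual dispatch to the GPU override; default taken from the base declaration
  gpu.Run(0, &dbg);  // explicit debugger, as GPUSession::Execute does under ENABLE_DEBUGGER
  return 0;
}
```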
