Data Dump Bug Fix

1. Remove old e2e dump json
2. Remove warning log
3. Add device id to e2e dump path
4. Fix bug where parsing the dump JSON failed after hccl_init
pull/6198/head
caifubi 4 years ago
parent b9c996484e
commit 1480c93d04

@ -4,7 +4,7 @@
"path": "/test",
"net_name": "ResNet50",
"iteration": 0,
"input_output": 0,
"input_output": 2,
"kernels": ["Default/Conv-op12"],
"support_device": [0,1,2,3,4,5,6,7]
},

@ -1,22 +0,0 @@
{
"DumpSettings": {
"enable": false,
"trans_flag": false,
"path": "/tmp/net/",
"net_name": "ResNet50",
"mode": 0,
"iteration": 0,
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
},
"DumpSettingsSpec": {
"enable": "true: dump enable, false: dump disable",
"trans_flag": "true: trans to host format, false: not trans format",
"path": "the dump file folder",
"net_name": "net name eg:ResNet50",
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
"iteration": "0: all iteration, others: specified iteration ",
"kernels": "op's full scope name which need to be dump"
},
"other": {}
}

@ -1,22 +0,0 @@
{
"DumpSettings": {
"enable": false,
"trans_flag": false,
"path": "/tmp/hccllog/0",
"net_name": "ResNet50",
"mode": 0,
"iteration": 0,
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
},
"DumpSettingsSpec": {
"enable": "true: dump enable, false: dump disable",
"trans_flag": "true: trans to host format, false: not trans format",
"path": "the dump file folder",
"net_name": "net name eg:ResNet50",
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
"iteration": "0: all iteration, others: specified iteration ",
"kernels": "op's full scope name which need to be dump"
},
"other": {}
}

@ -1,22 +0,0 @@
{
"DumpSettings": {
"enable": false,
"trans_flag": false,
"path": "/tmp/hccllog/1",
"net_name": "ResNet50",
"mode": 0,
"iteration": 0,
"kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
},
"DumpSettingsSpec": {
"enable": "true: dump enable, false: dump disable",
"trans_flag": "true: trans to host format, false: not trans format",
"path": "the dump file folder",
"net_name": "net name eg:ResNet50",
"mode": "0: dump all kernels, 1: dump kernels in kernels list",
"iteration": "0: all iteration, others: specified iteration ",
"kernels": "op's full scope name which need to be dump"
},
"other": {}
}

@ -442,6 +442,7 @@ void AscendSession::InitRuntimeResource() {
if (!runtime_instance->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
DumpJsonParser::GetInstance().Parse();
MS_LOG(INFO) << "Finish!";
}

@ -74,6 +74,10 @@ bool DumpJsonParser::IsDumpEnabled() {
void DumpJsonParser::Parse() {
std::lock_guard<std::mutex> guard(lock_);
if (already_parsed_) {
return;
}
already_parsed_ = true;
if (!IsDumpEnabled()) {
return;
}
@ -305,6 +309,8 @@ void DumpJsonParser::JudgeDumpEnabled() {
MS_LOG(WARNING) << "Dump not enabled. device_id:" << device_id << " not support";
}
context->set_param<bool>(MS_CTX_ENABLE_MEM_REUSE, !e2e_dump_enabled_);
MS_LOG(INFO) << "Dump status, e2e_dump_enabled:" << e2e_dump_enabled_
<< " async_dump_enabled:" << async_dump_enabled_;
}
bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
@ -325,6 +331,9 @@ void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
}
void DumpJsonParser::PrintUnusedKernel() {
if (!e2e_dump_enabled_ && !async_dump_enabled_) {
return;
}
for (const auto &iter : kernels_) {
if (iter.second == 0) {
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
@ -362,16 +371,6 @@ bool DumpJsonParser::OutputNeedDump() const {
return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
}
// Decides whether `kernel` should be added to the async dump kernel list.
// Returns false for any kernel whose type is not TBE, AICPU or AKG; otherwise defers to
// DumpJsonParser::NeedDump with the kernel's full scope name.
bool NeedAsyncDump(const CNodePtr &kernel) {
  // Validate the pointer before it is handed to any helper or dereferenced.
  MS_EXCEPTION_IF_NULL(kernel);
  // Query the kernel type once and reuse it for all three comparisons.
  const auto kernel_type = AnfAlgo::GetKernelType(kernel);
  if (kernel_type != TBE_KERNEL && kernel_type != AICPU_KERNEL && kernel_type != AKG_KERNEL) {
    return false;
  }
  // dump all kernel if mode is set 0 in data_dump.json
  return DumpJsonParser::GetInstance().NeedDump(kernel->fullname_with_scope());
}
void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *> kernel_graph) {
if (e2e_dump_enabled_) {
MS_LOG(INFO) << "E2e dump no need to update dump kernel list";
@ -391,9 +390,6 @@ void DumpJsonParser::UpdateNeedDumpKernels(NotNull<const session::KernelGraph *>
update_kernels.try_emplace(input->fullname_with_scope(), 0);
}
}
} else if (NeedAsyncDump(kernel)) {
MS_LOG(INFO) << "[AsyncDump] Match Node:" << kernel->fullname_with_scope();
update_kernels.try_emplace(kernel->fullname_with_scope(), 0);
}
}
kernels_.insert(update_kernels.begin(), update_kernels.end());

@ -72,6 +72,7 @@ class DumpJsonParser {
uint32_t op_debug_mode_{0};
bool trans_flag_{false};
uint32_t cur_dump_iter_{0};
bool already_parsed_{false};
void ParseCommonDumpSetting(const nlohmann::json &content);
void ParseAsyncDumpSetting(const nlohmann::json &content);

@ -206,14 +206,17 @@ bool E2eDumpUtil::DumpData(const session::KernelGraph *graph, Debugger *debugger
}
}
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
std::string net_name = dump_json_parser.net_name();
std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
std::string dump_path = dump_json_parser.path();
if (dump_path.back() == '/') {
dump_path = dump_path + net_name + '/' + iterator;
} else {
dump_path = dump_path + '/' + net_name + '/' + iterator;
if (dump_path.back() != '/') {
dump_path += "/";
}
dump_path += (net_name + "/device_" + std::to_string(device_id) + "/iteration_" + iterator);
DumpInput(graph, dump_path, debugger);
DumpOutput(graph, dump_path, debugger);
DumpParameters(graph, dump_path, debugger);

@ -206,11 +206,8 @@ bool AscendKernelRuntime::Init() {
SetContext();
return true;
}
bool ret = false;
DumpJsonParser::GetInstance().Parse();
// Start up profiling before rtSetDevice
ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
if (!ret) {
MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed.";
}

Loading…
Cancel
Save