|
|
@ -309,7 +309,7 @@ bool AscendKernelRuntime::LoadData(mindspore::session::KernelGraph *graph) {
|
|
|
|
MS_EXCEPTION_IF_NULL(graph);
|
|
|
|
MS_EXCEPTION_IF_NULL(graph);
|
|
|
|
#ifdef ENABLE_DEBUGGER
|
|
|
|
#ifdef ENABLE_DEBUGGER
|
|
|
|
MS_LOG(INFO) << "Start load step";
|
|
|
|
MS_LOG(INFO) << "Start load step";
|
|
|
|
for (auto graph_ptr : debugger_->GetGraphPtrList()) {
|
|
|
|
for (const auto &graph_ptr : debugger_->GetGraphPtrList()) {
|
|
|
|
debugger_->SetGraphPtr(graph_ptr);
|
|
|
|
debugger_->SetGraphPtr(graph_ptr);
|
|
|
|
// load output
|
|
|
|
// load output
|
|
|
|
debugger_->LoadGraphOutputs();
|
|
|
|
debugger_->LoadGraphOutputs();
|
|
|
@ -543,17 +543,15 @@ void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) {
|
|
|
|
std::lock_guard<std::mutex> lock(exception_mutex);
|
|
|
|
std::lock_guard<std::mutex> lock(exception_mutex);
|
|
|
|
if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) {
|
|
|
|
if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) {
|
|
|
|
auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid);
|
|
|
|
auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid);
|
|
|
|
auto find_iter = overflow_tasks_.find(key);
|
|
|
|
if (overflow_tasks_.find(key) == overflow_tasks_.end()) {
|
|
|
|
if (find_iter == overflow_tasks_.end()) {
|
|
|
|
|
|
|
|
overflow_tasks_[key] = 1;
|
|
|
|
overflow_tasks_[key] = 1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (overflow_tasks_[key] == 5) {
|
|
|
|
|
|
|
|
auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid);
|
|
|
|
|
|
|
|
MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name;
|
|
|
|
|
|
|
|
overflow_tasks_[key] = 0;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
if (overflow_tasks_[key] == 5) {
|
|
|
|
overflow_tasks_[key]++;
|
|
|
|
auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid);
|
|
|
|
|
|
|
|
MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name;
|
|
|
|
|
|
|
|
overflow_tasks_.erase(find_iter);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
overflow_tasks_[key]++;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid
|
|
|
|
MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid
|
|
|
@ -578,16 +576,20 @@ string AscendKernelRuntime::GetErrorNodeName(uint32_t streamid, uint32_t taskid)
|
|
|
|
|
|
|
|
|
|
|
|
void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) {
|
|
|
|
void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) {
|
|
|
|
MS_EXCEPTION_IF_NULL(graph);
|
|
|
|
MS_EXCEPTION_IF_NULL(graph);
|
|
|
|
auto full_scope_name =
|
|
|
|
|
|
|
|
AscendKernelRuntime::GetErrorNodeName(task_fail_infoes_.at(0).streamid, task_fail_infoes_.at(0).taskid);
|
|
|
|
|
|
|
|
// Dump error data in local path
|
|
|
|
|
|
|
|
const std::string local_path = std::string("./task_error_dump/") + std::to_string(task_fail_infoes_.at(0).deviceid);
|
|
|
|
const std::string local_path = std::string("./task_error_dump/") + std::to_string(task_fail_infoes_.at(0).deviceid);
|
|
|
|
for (const auto &node : graph->execution_order()) {
|
|
|
|
for (const auto &task_fail_info : task_fail_infoes_) {
|
|
|
|
if (node->fullname_with_scope() == full_scope_name) {
|
|
|
|
auto full_scope_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info.streamid, task_fail_info.taskid);
|
|
|
|
MS_LOG(ERROR) << "Begin to dump node (" << full_scope_name << ") task error input/output data in local path."
|
|
|
|
// Dump error data in local path
|
|
|
|
<< " trace: " << trace::DumpSourceLines(node);
|
|
|
|
if (full_scope_name.empty()) {
|
|
|
|
E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr);
|
|
|
|
continue;
|
|
|
|
E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto &node : graph->execution_order()) {
|
|
|
|
|
|
|
|
if (node->fullname_with_scope() == full_scope_name) {
|
|
|
|
|
|
|
|
MS_LOG(ERROR) << "Dump node (" << full_scope_name << ") task error input/output data to: " << local_path
|
|
|
|
|
|
|
|
<< " trace: " << trace::DumpSourceLines(node);
|
|
|
|
|
|
|
|
E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr);
|
|
|
|
|
|
|
|
E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -686,7 +688,8 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) {
|
|
|
|
if (!status) {
|
|
|
|
if (!status) {
|
|
|
|
DumpTaskExceptionInfo(graph);
|
|
|
|
DumpTaskExceptionInfo(graph);
|
|
|
|
std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir";
|
|
|
|
std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir";
|
|
|
|
DumpIR(file_name, std::shared_ptr<session::KernelGraph>(const_cast<session::KernelGraph *>(graph)));
|
|
|
|
auto graph_tmp = std::make_shared<session::KernelGraph>(*graph);
|
|
|
|
|
|
|
|
DumpIR(file_name, graph_tmp);
|
|
|
|
#ifdef ENABLE_TDTQUE
|
|
|
|
#ifdef ENABLE_TDTQUE
|
|
|
|
// Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung
|
|
|
|
// Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung
|
|
|
|
// case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend
|
|
|
|
// case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend
|
|
|
|