pull/1331/head
zhengyuanhua 4 years ago
parent 9394d886f7
commit 6dd23174bd

@ -108,6 +108,7 @@ set(TRAIN_SRC_LIST
"common/helper/model_cache_helper.cc"
"common/profiling/profiling_manager.cc"
"common/dump/dump_manager.cc"
"common/dump/dump_exception.cc"
"common/dump/dump_properties.cc"
"common/dump/opdebug_register.cc"
"common/dump/dump_op.cc"
@ -433,6 +434,7 @@ set(INFER_SRC_LIST
"common/formats/formats.cc"
"common/profiling/profiling_manager.cc"
"common/dump/dump_properties.cc"
"common/dump/dump_exception.cc"
"common/dump/dump_manager.cc"
"common/dump/dump_op.cc"
"common/dump/opdebug_register.cc"

@ -0,0 +1,221 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/dump/dump_exception.h"
#include "common/ge/datatype_util.h"
#include "common/debug/memory_dumper.h"
#include "framework/common/debug/log.h"
#include "graph/manager/util/debug.h"
#include "graph/utils/tensor_utils.h"
#include "graph/load/model_manager/model_utils.h"
#include "proto/dump_task.pb.h"
namespace {
static uint64_t GetNowTime() {
uint64_t ret = 0;
mmTimeval tv;
if (mmGetTimeOfDay(&tv, nullptr) == 0) {
ret = tv.tv_sec * 1000000ULL + tv.tv_usec;
}
return ret;
}
// Rewrites `str` in place: every space, dot, slash and backslash becomes an underscore.
// Used to turn op names/types into strings that are safe to embed in a file name.
static void ReplaceStringElem(std::string &str) {
  for (char &current : str) {
    switch (current) {
      case ' ':
      case '.':
      case '/':
      case '\\':
        current = '_';
        break;
      default:
        break;
    }
  }
}
} // namespace
namespace ge {
// Nothing to release explicitly; the recorded op descriptions are destroyed with the member vector.
DumpException::~DumpException() = default;
void DumpException::SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
vector<void *> &input_addrs, vector<void *> &output_addrs) {
OpDescInfo op_desc_info;
SaveOpDescInfo(op, task_id, stream_id, op_desc_info);
op_desc_info.input_addrs = input_addrs;
op_desc_info.output_addrs = output_addrs;
op_desc_info_.emplace_back(op_desc_info);
}
void DumpException::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op,
uint32_t task_id, uint32_t stream_id) {
OpDescInfo op_desc_info;
SaveOpDescInfo(op, task_id, stream_id, op_desc_info);
op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op);
op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op);
op_desc_info_.emplace_back(op_desc_info);
}
// Copies the identity of `op` (name, type, task id, stream id) and, for each input and output tensor,
// its format, shape, data type and size in bytes into `op_desc_info`.
//
// Tensor descriptors that are null are skipped. If the byte size of a tensor cannot be obtained, a
// warning is logged and collection stops. The size is queried BEFORE anything is recorded for a
// tensor, so the format/shape/data-type/size vectors always have the same length and can safely be
// indexed together by the dump code.
void DumpException::SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
                                   OpDescInfo &op_desc_info) {
  GELOGD("[Save][OpExceptionInfo] Start to save dump op [%s] info of task_id: %u, stream_id: %u",
         op->GetName().c_str(), task_id, stream_id);
  op_desc_info.op_name = op->GetName();
  op_desc_info.op_type = op->GetType();
  op_desc_info.task_id = task_id;
  op_desc_info.stream_id = stream_id;
  for (size_t i = 0; i < op->GetAllInputsSize(); ++i) {
    GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i);
    if (input_tensor_desc == nullptr) {
      continue;
    }
    // Query the size first: on failure nothing has been pushed for this tensor yet.
    int64_t input_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) {
      GELOGW("[Save][OpExceptionInfo] Op [%s] get input size failed.", op->GetName().c_str());
      return;
    }
    GELOGD("[Save][OpExceptionInfo] Save dump op info, the input size is %ld", input_size);
    op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat());
    op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims());
    op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType());
    op_desc_info.input_size.emplace_back(input_size);
  }
  for (size_t j = 0; j < op->GetOutputsSize(); ++j) {
    GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j);
    if (output_tensor_desc == nullptr) {
      continue;
    }
    // Same ordering as for inputs: size first, then record all four attributes together.
    int64_t output_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) {
      GELOGW("[Save][OpExceptionInfo] Op [%s] get output size failed.", op->GetName().c_str());
      return;
    }
    GELOGD("[Save][OpExceptionInfo] Save dump op info, the output size is %ld.", output_size);
    op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat());
    op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims());
    op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType());
    op_desc_info.output_size.emplace_back(output_size);
  }
}
// For every entry of `exception_infos` whose (stream id, task id) matches an op recorded in this
// object, writes a dump file "./<op_type>.<op_name>.<task_id>.<timestamp>" consisting of:
//   1. the size of the serialized DumpData proto (uint64_t),
//   2. the serialized DumpData proto describing the op's inputs and outputs,
//   3. the input buffers, then the output buffers (via DumpExceptionInput / DumpExceptionOutput).
//
// Entries that do not match any recorded op are logged and skipped, so the remaining entries are
// still processed. Returns PARAM_INVALID if serialization or dumping the buffers fails, otherwise SUCCESS.
Status DumpException::DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const {
  GELOGI("[Dump][Exception] Start to dump exception info.");
  for (const rtExceptionInfo &iter : exception_infos) {
    OpDescInfo op_desc_info;
    if (!GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) {
      // The task may not belong to this dumper; keep looking at the remaining entries
      // instead of abandoning the whole list on the first miss.
      GELOGI("[Dump][Exception] Can't find op desc info ,task id:%u,stream id:%u", iter.taskid, iter.streamid);
      continue;
    }

    toolkit::dumpdata::DumpData dump_data;
    dump_data.set_version("2.0");
    dump_data.set_dump_time(GetNowTime());
    dump_data.set_op_name(op_desc_info.op_name);
    for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) {
      toolkit::dumpdata::OpInput input;
      input.set_data_type(toolkit::dumpdata::OutputDataType(
          DataTypeUtil::GetIrDataType(op_desc_info.input_data_type[i])));
      input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i]));
      for (auto dim : op_desc_info.input_shape[i]) {
        input.mutable_shape()->add_dim(dim);
      }
      input.set_size(op_desc_info.input_size[i]);
      GELOGI("[Dump][Exception] The input size int exception is %ld.", op_desc_info.input_size[i]);
      dump_data.mutable_input()->Add(std::move(input));
    }
    for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) {
      toolkit::dumpdata::OpOutput output;
      output.set_data_type(toolkit::dumpdata::OutputDataType(
          DataTypeUtil::GetIrDataType(op_desc_info.output_data_type[j])));
      output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j]));
      for (auto dim : op_desc_info.output_shape[j]) {
        output.mutable_shape()->add_dim(dim);
      }
      output.set_size(op_desc_info.output_size[j]);
      GELOGI("[Dump][Exception] The output size int exception is %ld.", op_desc_info.output_size[j]);
      dump_data.mutable_output()->Add(std::move(output));
    }

    // Build a file name from sanitized op type/name so it cannot contain path separators or dots.
    uint64_t now_time = GetNowTime();
    std::string op_name = op_desc_info.op_name;
    std::string op_type = op_desc_info.op_type;
    ReplaceStringElem(op_name);
    ReplaceStringElem(op_type);
    string dump_file_path =
        "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time);
    GELOGI("[Dump][Exception] The exception dump file path is %s.", dump_file_path.c_str());

    uint64_t proto_size = dump_data.ByteSizeLong();
    std::unique_ptr<char[]> proto_msg;
    if (proto_size != 0) {
      proto_msg.reset(new (std::nothrow) char[proto_size]);
    }
    // proto_msg stays null for an empty message or a failed allocation; never serialize into null.
    if ((proto_msg == nullptr) || !dump_data.SerializeToArray(proto_msg.get(), proto_size)) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed.");
      return PARAM_INVALID;
    }

    GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)),
                      "Failed to dump proto size");
    GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size),
                      "Failed to dump proto msg");
    if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump op [%s] exception input failed.", op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump op [%s] exception output failed.", op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    GELOGI("[Dump][Exception] Dump op [%s] exception info SUCCESS.", op_desc_info.op_name.c_str());
  }
  return SUCCESS;
}
bool DumpException::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
GELOGI("[Get][OpDescInfo] There are %zu op need to dump.", op_desc_info_.size());
for (size_t index = 0; index < op_desc_info_.size(); ++index) {
OpDescInfo dump_op_info = op_desc_info_.at(index);
if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) {
GELOGI("[Get][OpDescInfo] Find exception op [%s] of task_id: %u, stream_id: %u.",
dump_op_info.op_name.c_str(), task_id, stream_id);
op_desc_info = dump_op_info;
return true;
}
}
return false;
}
// Hands every recorded input address of `op_desc_info`, with its recorded size, to
// Debug::DumpDevMem for `dump_file`.
// Returns PARAM_INVALID if an address has no recorded size or if Debug::DumpDevMem fails;
// SUCCESS otherwise.
Status DumpException::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const {
  GELOGI("[Dump][ExceptionInput] Start to dump exception input");
  for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) {
    // Addresses and sizes are collected separately, so report a mismatch instead of letting
    // vector::at() throw.
    if (i >= op_desc_info.input_size.size()) {
      GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] The %zu input of op [%s] has no recorded size",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs[i], op_desc_info.input_size[i]) != SUCCESS) {
      GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
// Hands every recorded output address of `op_desc_info`, with its recorded size, to
// Debug::DumpDevMem for `dump_file`.
// Returns PARAM_INVALID if an address has no recorded size or if Debug::DumpDevMem fails;
// SUCCESS otherwise.
Status DumpException::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const {
  GELOGI("[Dump][ExceptionOutput] Start to dump exception output");
  for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) {
    // Addresses and sizes are collected separately, so report a mismatch instead of letting
    // vector::at() throw.
    if (i >= op_desc_info.output_size.size()) {
      GELOGE(PARAM_INVALID, "[Dump][ExceptionOutput] The %zu output of op [%s] has no recorded size",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs[i], op_desc_info.output_size[i]) !=
        SUCCESS) {
      // The log names the output path (the original text was copied from the input variant).
      GELOGE(PARAM_INVALID, "[Dump][ExceptionOutput] Dump the %zu output data of op [%s] failed",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
} // namespace ge

@ -0,0 +1,47 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GE_COMMON_DUMP_DUMP_EXCEPTION_H_
#define GE_COMMON_DUMP_DUMP_EXCEPTION_H_
#include <vector>
#include "graph/op_desc.h"
#include "framework/common/ge_types.h"
#include "graph/load/model_manager/task_info/task_info.h"
namespace ge {
// Keeps a list of op descriptions (identity, tensor metadata and data addresses) and, given a list
// of runtime exception records, dumps the matching ops' descriptions and buffers to files.
class DumpException {
 public:
  DumpException() = default;
  ~DumpException();

  // Records `op` together with input/output addresses supplied by the caller.
  void SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
                      std::vector<void *> &input_addrs, std::vector<void *> &output_addrs);

  // Records `op`; its input/output addresses are derived from `model_param`.
  void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id);

  // Dumps every recorded op that matches an entry of `exception_infos`.
  Status DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const;

  // Copies the record matching (stream_id, task_id) into `op_desc_info`; returns whether one was found.
  bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const;

 private:
  // Fills the descriptive (non-address) fields of `op_desc_info` from `op`.
  void SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, OpDescInfo &op_desc_info);

  // Dump the recorded input / output buffers of one op for `dump_file`.
  Status DumpExceptionInput(const OpDescInfo &op_desc_info, const std::string &dump_file) const;
  Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const std::string &dump_file) const;

  // All ops recorded so far, in insertion order.
  std::vector<OpDescInfo> op_desc_info_;
};
} // namespace ge
#endif // GE_COMMON_DUMP_DUMP_EXCEPTION_H_

@ -16,6 +16,7 @@ set(SRC_LIST
"../common/ge/plugin_manager.cc"
"../common/ge/op_tiling_manager.cc"
"../common/dump/dump_properties.cc"
"../common/dump/dump_exception.cc"
"../common/dump/dump_manager.cc"
"../common/dump/dump_op.cc"
"../common/dump/opdebug_register.cc"

@ -72,24 +72,6 @@ static bool ParseNameIndex(const std::string &node_name_index, std::string &node
static bool IsTensorDescWithSkipDumpAddrType(bool has_mem_type_attr, vector<int64_t> v_memory_type, size_t i) {
return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1);
}
static uint64_t GetNowTime() {
uint64_t ret = 0;
mmTimeval tv;
if (mmGetTimeOfDay(&tv, nullptr) == 0) {
ret = tv.tv_sec * 1000000ULL + tv.tv_usec;
}
return ret;
}
// Replaces, in place, each ' ', '.', '/' and '\\' character of `str` with '_'.
static void ReplaceStringElem(std::string &str) {
  for (auto it = str.begin(); it != str.end(); ++it) {
    const bool needs_replace = (*it == ' ') || (*it == '.') || (*it == '/') || (*it == '\\');
    if (needs_replace) {
      *it = '_';
    }
  }
}
} // namespace
static int32_t GetIrDataType(ge::DataType data_type) {
@ -194,66 +176,6 @@ void DataDumper::SaveOpDebugId(uint32_t task_id, uint32_t stream_id, void *op_de
is_op_debug_ = is_op_debug;
}
// Records `op` (name, type, task/stream id, per-tensor format, shape, data type and byte size, and
// the input/output data addresses derived from `model_param`) in op_desc_info_.
//
// Null tensor descriptors are skipped. If a tensor's byte size cannot be obtained, a warning is
// logged and the op is NOT recorded. The size is queried before anything is stored for a tensor so
// the per-tensor vectors never get out of step with each other.
void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id,
                                uint32_t stream_id) {
  GELOGD("Start SaveDumpOpInfo of task_id: %u, stream_id: %u", task_id, stream_id);
  OpDescInfo op_desc_info;
  op_desc_info.op_name = op->GetName();
  op_desc_info.op_type = op->GetType();
  op_desc_info.task_id = task_id;
  op_desc_info.stream_id = stream_id;
  for (size_t i = 0; i < op->GetAllInputsSize(); ++i) {
    GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i);
    if (input_tensor_desc == nullptr) {
      continue;
    }
    int64_t input_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) {
      GELOGW("Get input size failed");
      return;
    }
    GELOGD("Save dump op info, the input size is %ld", input_size);
    op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat());
    op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims());
    op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType());
    op_desc_info.input_size.emplace_back(input_size);
  }
  for (size_t j = 0; j < op->GetOutputsSize(); ++j) {
    GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j);
    if (output_tensor_desc == nullptr) {
      continue;
    }
    int64_t output_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) {
      // This is the output path; the message previously said "input".
      GELOGW("Get output size failed");
      return;
    }
    GELOGD("Save dump op info, the output size is %ld", output_size);
    op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat());
    op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims());
    op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType());
    op_desc_info.output_size.emplace_back(output_size);
  }
  op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op);
  op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op);
  op_desc_info_.emplace_back(op_desc_info);
}
bool DataDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
GELOGI("There are %zu op need to dump.", op_desc_info_.size());
for (size_t index = 0; index < op_desc_info_.size(); ++index) {
OpDescInfo dump_op_info = op_desc_info_.at(index);
if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) {
GELOGI("find exception op of task_id: %u, stream_id: %u.", task_id, stream_id);
op_desc_info = dump_op_info;
return true;
}
}
return false;
}
void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr<OpDesc> &op_desc,
uintptr_t args) {
if (op_desc == nullptr) {
@ -873,97 +795,4 @@ void DataDumper::PrintCheckLog(string &dump_list_key) {
}
}
}
// Passes each recorded input address of `op_desc_info`, with its recorded size, to
// Debug::DumpDevMem for `dump_file`. Returns PARAM_INVALID when an address has no recorded size or
// when Debug::DumpDevMem fails.
Status DataDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) {
  GELOGI("Start to dump exception input");
  for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) {
    // Guard the parallel lookup explicitly rather than relying on vector::at() throwing.
    if (i >= op_desc_info.input_size.size()) {
      GELOGE(PARAM_INVALID, "The %zu input has no recorded size", i);
      return PARAM_INVALID;
    }
    if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs[i], op_desc_info.input_size[i]) != SUCCESS) {
      GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i);
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
// Passes each recorded output address of `op_desc_info`, with its recorded size, to
// Debug::DumpDevMem for `dump_file`. Returns PARAM_INVALID when an address has no recorded size or
// when Debug::DumpDevMem fails.
Status DataDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) {
  GELOGI("Start to dump exception output");
  for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) {
    // Guard the parallel lookup explicitly rather than relying on vector::at() throwing.
    if (i >= op_desc_info.output_size.size()) {
      GELOGE(PARAM_INVALID, "The %zu output has no recorded size", i);
      return PARAM_INVALID;
    }
    if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs[i], op_desc_info.output_size[i]) !=
        SUCCESS) {
      // Report the failure as an output failure (the text used to say "input").
      GELOGE(PARAM_INVALID, "Dump the %zu output data failed", i);
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
// For each entry of `exception_infos`, looks up the recorded op with the same stream/task id and
// writes "./<op_type>.<op_name>.<task_id>.<timestamp>" containing the serialized-proto size, the
// serialized DumpData proto, and then the op's input and output buffers.
// Returns PARAM_INVALID as soon as an entry has no recorded op, serialization fails, or a buffer
// cannot be dumped; SUCCESS when every entry was dumped.
Status DataDumper::DumpExceptionInfo(const std::vector<rtExceptionInfo> exception_infos) {
  GELOGI("Start to dump exception info");
  for (const rtExceptionInfo &iter : exception_infos) {
    OpDescInfo op_desc_info;
    if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) {
      toolkit::dumpdata::DumpData dump_data;
      dump_data.set_version("2.0");
      dump_data.set_dump_time(GetNowTime());
      dump_data.set_op_name(op_desc_info.op_name);
      for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) {
        toolkit::dumpdata::OpInput input;
        input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i])));
        input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i]));
        for (auto dim : op_desc_info.input_shape[i]) {
          input.mutable_shape()->add_dim(dim);
        }
        input.set_size(op_desc_info.input_size[i]);
        GELOGI("The input size int exception is %ld", op_desc_info.input_size[i]);
        dump_data.mutable_input()->Add(std::move(input));
      }
      for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) {
        toolkit::dumpdata::OpOutput output;
        output.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.output_data_type[j])));
        output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j]));
        for (auto dim : op_desc_info.output_shape[j]) {
          output.mutable_shape()->add_dim(dim);
        }
        output.set_size(op_desc_info.output_size[j]);
        GELOGI("The output size int exception is %ld", op_desc_info.output_size[j]);
        dump_data.mutable_output()->Add(std::move(output));
      }
      // Sanitize op type/name so the generated file name has no separators or extra dots.
      uint64_t now_time = GetNowTime();
      std::string op_name = op_desc_info.op_name;
      std::string op_type = op_desc_info.op_type;
      ReplaceStringElem(op_name);
      ReplaceStringElem(op_type);
      string dump_file_path =
          "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time);
      GELOGI("The exception dump file path is %s", dump_file_path.c_str());
      uint64_t proto_size = dump_data.ByteSizeLong();
      std::unique_ptr<char[]> proto_msg;
      if (proto_size != 0) {
        proto_msg.reset(new (std::nothrow) char[proto_size]);
      }
      // proto_msg is null for an empty message or a failed allocation; never serialize into null.
      if ((proto_msg == nullptr) || !dump_data.SerializeToArray(proto_msg.get(), proto_size)) {
        GELOGE(PARAM_INVALID, "Dump data proto serialize failed");
        return PARAM_INVALID;
      }
      GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)),
                        "Failed to dump proto size");
      GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size),
                        "Failed to dump proto msg");
      if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) {
        GELOGE(PARAM_INVALID, "Dump exception input failed");
        return PARAM_INVALID;
      }
      if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) {
        GELOGE(PARAM_INVALID, "Dump exception output failed");
        return PARAM_INVALID;
      }
      GELOGI("Dump exception info SUCCESS");
    } else {
      GELOGE(PARAM_INVALID, "Get op desc info failed,task id:%u,stream id:%u", iter.taskid, iter.streamid);
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
} // namespace ge

@ -70,8 +70,6 @@ class DataDumper {
void SaveDumpInput(const std::shared_ptr<Node> &node);
void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id);
// args is device memory stored first output addr
void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr<OpDesc> &op_desc, uintptr_t args);
void SaveEndGraphId(uint32_t task_id, uint32_t stream_id);
@ -87,14 +85,8 @@ class DataDumper {
void SetDumpProperties(const DumpProperties &dump_properties) { dump_properties_ = dump_properties; }
const DumpProperties &GetDumpProperties() const { return dump_properties_; }
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const;
const std::vector<OpDescInfo> &GetAllOpDescInfo() const { return op_desc_info_; }
// Dump exception info
Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file);
Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file);
Status DumpExceptionInfo(const std::vector<rtExceptionInfo> exception_infos);
private:
void ReleaseDevMem(void **ptr) noexcept;

@ -2506,9 +2506,9 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b
GE_CHECK_NOTNULL(model_manager);
auto exception_infos = model_manager->GetExceptionInfos();
if (exception_infos.size() > 0) {
GE_CHK_STATUS_RET(data_dumper_.DumpExceptionInfo(exception_infos), "Dump exception info failed");
GE_CHK_STATUS_RET(DumpExceptionInfo(exception_infos), "[Dump][Exception] Dump exception info failed.");
} else {
GELOGI("Exception info is null");
GELOGI("[Dump][Exception] Exception info is null");
}
GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed.");
return INTERNAL_ERROR;

@ -29,6 +29,7 @@
#include "common/helper/om_file_helper.h"
#include "common/opskernel/ge_task_info.h"
#include "common/properties_manager.h"
#include "common/dump/dump_exception.h"
#include "common/dump/opdebug_register.h"
#include "common/types.h"
#include "framework/common/util.h"
@ -468,13 +469,17 @@ class DavinciModel {
Status ReportProfilingData();
// Records `op` with its task id and stream id so it can be dumped if the runtime later reports an
// exception for that task.
// NOTE(review): as shown, the op is registered with both data_dumper_ and dump_exception_; this
// looks like the pre- and post-change lines of a diff rendered together — confirm that only the
// dump_exception_ call is meant to remain.
void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) {
data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id);
dump_exception_.SaveDumpOpInfo(model_param, op, task_id, stream_id);
}
// Forwards the task (ids, op description and args) unchanged to the data dumper.
void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr<OpDesc> &op_desc,
                  uintptr_t args) {
  data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args);
}
// Asks this model's exception dumper to dump the ops matching `exception_infos` and returns its status.
Status DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const {
  const Status dump_status = dump_exception_.DumpExceptionInfo(exception_infos);
  return dump_status;
}
void SetKnownShapeGlobalStep(void *global_step) {
known_shape_global_step_ = global_step;
}
@ -547,7 +552,7 @@ class DavinciModel {
// Returns the dump properties held by the data dumper.
const DumpProperties &GetDumpProperties() const {
  return data_dumper_.GetDumpProperties();
}
// Looks up the recorded op for (stream_id, task_id) and copies it into `op_desc_info`;
// returns whether a record was found.
// NOTE(review): two return statements appear here, so the second one is unreachable as written.
// This looks like the old (data_dumper_) and new (dump_exception_) lines of a diff rendered
// together — confirm which lookup is intended to remain.
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info);
return dump_exception_.GetOpDescInfo(stream_id, task_id, op_desc_info);
}
private:
@ -990,6 +995,7 @@ class DavinciModel {
int64_t maxDumpOpNum_;
// for data dump
DataDumper data_dumper_;
DumpException dump_exception_;
OpdebugRegister opdebug_register_;
uint64_t iterator_count_;
bool is_l1_fusion_enable_;

@ -1492,9 +1492,21 @@ Status ModelManager::GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint3
for (const auto &model : model_map_) {
auto davinci_model = model.second;
if (davinci_model->GetDeviceId() == device_id) {
GELOGI("Start to GetOpDescInfo of device_id: %u.", device_id);
GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in davinci model.", device_id);
if (davinci_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) {
GELOGI("Find specific node of stream_id: %u, task_id: %u.", stream_id, task_id);
GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in davinci model.",
stream_id, task_id);
return SUCCESS;
}
}
}
for (const auto &model : hybrid_model_map_) {
auto hybrid_model = model.second;
if (hybrid_model->GetDeviceId() == device_id) {
GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in hybrid model.", device_id);
if (hybrid_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) {
GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in hybrid model.",
stream_id, task_id);
return SUCCESS;
}
}

@ -17,6 +17,7 @@
#include "hybrid_model_executor.h"
#include "graph/ge_context.h"
#include "graph/runtime_inference_context.h"
#include "graph/load/model_manager/model_manager.h"
#include "common/dump/dump_manager.h"
namespace ge {
@ -80,8 +81,18 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor,
HYBRID_CHK_STATUS_RET(executor.ExecuteAsync(args.inputs, args.input_desc, args.outputs),
"Failed to execute partitioned call.");
RECORD_MODEL_EXECUTION_EVENT(&context_, "[ExecuteAsync] End");
Status ret = executor.Synchronize();
if (ret != ge::SUCCESS) {
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
auto exception_infos = model_manager->GetExceptionInfos();
if (model_ != nullptr) {
HYBRID_CHK_STATUS_RET(model_->DumpExceptionInfo(exception_infos),
"[Execute][GraphInternal] Dump exception info failed.");
}
GELOGE(ret, "[Execute][GraphInternal] Synchronize failed.");
}
HYBRID_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph.");
RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End");
args.outputs.clear();

@ -4,6 +4,7 @@
#include "common/dump/dump_manager.h"
#include "graph/ge_context.h"
#include "graph/runtime_inference_context.h"
#include "graph/load/model_manager/model_manager.h"
namespace ge {
namespace hybrid {
@ -250,6 +251,13 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar
ret = stage_executors_[i]->Synchronize();
if (ret != SUCCESS) {
auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
auto exception_infos = model_manager->GetExceptionInfos();
if (model_ != nullptr) {
HYBRID_CHK_STATUS_RET(model_->DumpExceptionInfo(exception_infos),
"[Execute][pipeline] Dump exception info failed.");
}
GELOGE(ret, "[Executor: %zu] Failed to synchronize result.", i);
has_error = true;
continue;

@ -69,6 +69,7 @@ class NodeDoneCallback {
private:
Status PrepareConstInputs(const NodeItem &node_item);
Status DumpDynamicNode();
Status SaveDumpOpInfo();
Status ProfilingReport();
Status GetTaskDescInfo(const NodePtr node, const HybridModel *model,
std::vector<TaskDescInfo> &task_desc_info);
@ -255,6 +256,42 @@ Status NodeDoneCallback::DumpDynamicNode() {
return SUCCESS;
}
// Gathers the data pointer of every input and output tensor of the node that just finished and
// registers them, together with the node's op desc, task id and stream id, with the model held by
// the graph execution context.
// Returns PARAM_INVALID if the node or any tensor value is null; the GE_CHECK_NOTNULL guards handle
// a null graph context, model or op desc.
Status NodeDoneCallback::SaveDumpOpInfo() {
  GE_CHECK_NOTNULL(graph_context_);
  GE_CHECK_NOTNULL(graph_context_->model);

  auto node = context_->GetNodeItem().node;
  if (node == nullptr) {
    GELOGE(PARAM_INVALID, "[Save][DumpOpInfo] Get node is nullptr.");
    return PARAM_INVALID;
  }
  auto op_desc = node->GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);

  vector<void *> input_addrs;
  vector<void *> output_addrs;
  for (int i = 0; i < context_->NumInputs(); ++i) {
    auto tensor_value = context_->GetInput(i);
    GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr.");
    // The recorded address list is non-const, hence the cast.
    input_addrs.emplace_back(const_cast<void *>(tensor_value->GetData()));
  }
  for (int j = 0; j < context_->NumOutputs(); ++j) {
    auto tensor_value = context_->GetOutput(j);
    GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr.");
    output_addrs.emplace_back(const_cast<void *>(tensor_value->GetData()));
  }

  uint32_t stream_id = context_->GetStreamId();
  uint32_t task_id = context_->GetTaskId();

  // The context holds the model as const; recording dump info needs a mutable model.
  auto model = const_cast<HybridModel *>(graph_context_->model);
  GE_CHECK_NOTNULL(model);
  model->SaveDumpOpInfo(op_desc, task_id, stream_id, input_addrs, output_addrs);
  return SUCCESS;
}
Status NodeDoneCallback::OnNodeDone() {
auto &node_item = context_->GetNodeItem();
GELOGI("[%s] Start callback process.", node_item.NodeName().c_str());
@ -267,6 +304,8 @@ Status NodeDoneCallback::OnNodeDone() {
GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node");
}
GE_CHK_STATUS_RET(SaveDumpOpInfo(), "Failed to dump op info.");
if (ProfilingManager::Instance().ProfilingModelExecuteOn()) {
GE_CHK_STATUS_RET(ProfilingReport(), "Report node[%s] to profiling failed.",
node_item.NodeName().c_str());

@ -76,6 +76,10 @@ class HybridDavinciModel::Impl {
executor_.SetDeviceId(device_id);
}
// Returns the device id stored in the wrapped hybrid model.
uint32_t GetDeviceId() {
  const uint32_t device_id = model_.GetDeviceId();
  return device_id;
}
void SetModelName(const string &model_name) {
model_.SetModelName(model_name);
executor_.SetModelName(model_name);
@ -108,6 +112,10 @@ class HybridDavinciModel::Impl {
model_.SetModelDescVersion(is_new_model_desc);
}
// Delegates the (stream_id, task_id) lookup to the wrapped hybrid model and returns its result.
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
  const bool found = model_.GetOpDescInfo(stream_id, task_id, op_desc_info);
  return found;
}
private:
std::shared_ptr<ModelListener> listener_;
HybridModel model_;
@ -181,6 +189,11 @@ void HybridDavinciModel::SetDeviceId(uint32_t device_id) {
}
}
// Returns the device id reported by the implementation object.
// NOTE(review): GE_CHECK_NOTNULL is used inside a function returning uint32_t. If that macro
// returns a status code when impl_ is null, callers would receive that code as a device id —
// confirm against the macro definition and consider an explicit null check.
uint32_t HybridDavinciModel::GetDeviceId() const {
GE_CHECK_NOTNULL(impl_);
return impl_->GetDeviceId();
}
void HybridDavinciModel::SetModelName(const string &model_name) {
if (impl_ != nullptr) {
impl_->SetModelName(model_name);
@ -222,5 +235,12 @@ uint64_t HybridDavinciModel::GetSessionId() {
GE_CHECK_NOTNULL(impl_);
return impl_->GetSessionId();
}
// Looks up the op recorded for (stream_id, task_id) through the implementation object.
// Returns false when there is no implementation object or no matching record.
bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
  return (impl_ != nullptr) && impl_->GetOpDescInfo(stream_id, task_id, op_desc_info);
}
} // namespace hybrid
} // namespace ge

@ -57,6 +57,8 @@ class HybridDavinciModel {
void SetDeviceId(uint32_t device_id);
uint32_t GetDeviceId() const;
void SetModelName(const string &model_name);
uint64_t GetSessionId();
@ -74,6 +76,8 @@ class HybridDavinciModel {
void SetModelDescVersion(bool is_new_model_desc);
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const;
private:
HybridDavinciModel() = default;
class Impl;

@ -61,6 +61,10 @@ void HybridDavinciModel::SetModelId(uint32_t model_id) {
void HybridDavinciModel::SetDeviceId(uint32_t device_id) {
}
// Stub implementation: always reports device id 0.
uint32_t HybridDavinciModel::GetDeviceId() const {
  constexpr uint32_t kStubDeviceId = 0;
  return kStubDeviceId;
}
void HybridDavinciModel::SetModelName(const string &model_name) {
}
@ -87,5 +91,9 @@ Status HybridDavinciModel::GetInputOutputDescInfo(vector<InputOutputDescInfo> &i
void HybridDavinciModel::SetModelDescVersion(bool is_new_model_desc) {
}
// Stub implementation: no op information is ever recorded, so nothing can be found.
// Returns false and leaves `op_desc_info` untouched, so callers do not treat an unfilled
// descriptor as a successful lookup.
bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
  (void)stream_id;
  (void)task_id;
  (void)op_desc_info;
  return false;
}
} // namespace hybrid
} // namespace ge

@ -18,6 +18,7 @@
#include <vector>
#include "graph/debug/ge_attr_define.h"
#include "graph/load/model_manager/model_utils.h"
#include "graph/load/model_manager/davinci_model.h"
#include "graph/utils/graph_utils.h"
#include "graph/utils/node_utils.h"
#include "graph/utils/tensor_utils.h"
@ -214,6 +215,42 @@ void HybridModel::CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescI
}
}
// Appends `davinci_model` to the list of models consulted by DumpExceptionInfo and GetOpDescInfo.
// The pointer is stored as given (it is not checked for null here).
void HybridModel::SaveDavinciModel(const std::shared_ptr<DavinciModel> &davinci_model) {
  davinci_model_.push_back(davinci_model);
}
// Dumps exception information for this hybrid model: first through its own exception dumper,
// then through every saved, non-null DavinciModel.
// Returns SUCCESS immediately for an empty list, and FAILED as soon as any dumper reports an error.
Status HybridModel::DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const {
  if (exception_infos.empty()) {
    GELOGI("[Dump][ExceptionInfo] Exception info is null");
    return SUCCESS;
  }

  GELOGI("[Dump][ExceptionInfo] Start to search dynamic op info and to dump.");
  const Status dynamic_status = dump_exception_.DumpExceptionInfo(exception_infos);
  if (dynamic_status != SUCCESS) {
    GELOGE(FAILED, "[Dump][Exception] Dump dynamic op exception info failed.");
    return FAILED;
  }

  GELOGI("[Dump][ExceptionInfo] Start to search static op info and to dump.");
  for (const auto &static_model : davinci_model_) {
    if (static_model == nullptr) {
      continue;
    }
    if (static_model->DumpExceptionInfo(exception_infos) != SUCCESS) {
      GELOGE(FAILED, "[Dump][Exception] Dump static op exception info failed.");
      return FAILED;
    }
  }
  return SUCCESS;
}
// Looks up the op recorded for (stream_id, task_id): first in this model's own exception dumper,
// then in each saved DavinciModel, stopping at the first match.
// Null entries in davinci_model_ are skipped, matching the null check in DumpExceptionInfo.
// Returns true when a record was found and copied into `op_desc_info`.
bool HybridModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
  if (dump_exception_.GetOpDescInfo(stream_id, task_id, op_desc_info)) {
    return true;
  }
  for (const auto &iter : davinci_model_) {
    if ((iter != nullptr) && iter->GetOpDescInfo(stream_id, task_id, op_desc_info)) {
      return true;
    }
  }
  return false;
}
Status HybridModel::GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, std::vector<uint32_t> &formats) {
auto node_item_list = root_graph_item_->GetInputNodes();
for (auto &node_item : node_item_list) {

@ -20,6 +20,7 @@
#include <vector>
#include <queue>
#include <memory>
#include "common/dump/dump_exception.h"
#include "framework/common/ge_inner_error_codes.h"
#include "graph/load/model_manager/data_inputer.h"
#include "graph/load/model_manager/task_info/task_info.h"
@ -126,6 +127,17 @@ class HybridModel {
std::vector<std::pair<int64_t, int64_t>> &shape_ranges,
InputOutputDescInfo &input);
// Forwards the op, its ids and the caller-provided input/output addresses to the exception dumper.
void SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
                    vector<void *> &input_addrs, vector<void *> &output_addrs) {
  dump_exception_.SaveDumpOpInfo(op, task_id, stream_id, input_addrs, output_addrs);
}
void SaveDavinciModel(const std::shared_ptr<DavinciModel> &davinci_model);
Status DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const;
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const;
private:
friend class HybridModelBuilder;
friend class HybridModelAsyncExecutor;
@ -146,6 +158,10 @@ class HybridModel {
std::map<std::string, std::unique_ptr<GraphItem>> subgraph_items_;
std::map<NodePtr, std::unique_ptr<NodeItem>> node_items_;
// dump exception
DumpException dump_exception_;
std::vector<std::shared_ptr<ge::DavinciModel>> davinci_model_;
bool is_new_model_desc_ = false; // support aipp
bool is_single_op_ = false;

@ -199,6 +199,8 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()>
GELOGE(RT_FAILED, "Get task_id and stream_id failed, ret: 0x%X.", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
context.SetTaskId(task_id);
context.SetStreamId(stream_id);
GELOGD("Aicore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id);
(void)context.SaveProfilingTaskDescInfo(task_id, stream_id, kTaskTypeAicore, (*it)->GetBlockDim());
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End");

@ -206,6 +206,8 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void(
GELOGE(RT_FAILED, "Get task_id and stream_id failed, ret: 0x%X.", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
context.SetTaskId(task_id);
context.SetStreamId(stream_id);
GELOGD("Aicpu node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id);
(void)context.SaveProfilingTaskDescInfo(task_id, stream_id, kTaskTypeAicpu, 0);
auto callback = [=, &context]() {

@ -29,7 +29,7 @@ namespace ge {
namespace hybrid {
REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::COMPILED_SUBGRAPH, KnownNodeExecutor);
Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
Status KnownNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTaskExecuteAsync] Start");
GELOGD("[%s] KnownNodeTask::ExecuteAsync in.", context.GetNodeName());
if (davinci_model_->GetTaskList().empty()) {
@ -171,6 +171,9 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
std::shared_ptr<DavinciModel> davinci_model = MakeShared<DavinciModel>(0, nullptr);
GE_CHECK_NOTNULL(davinci_model);
auto hybrid_model = &const_cast<HybridModel &>(model);
GE_CHECK_NOTNULL(hybrid_model);
hybrid_model->SaveDavinciModel(davinci_model);
// set known node flag as true
davinci_model->SetKnownNode(true);

@ -166,6 +166,7 @@ set(COMMON_SRC_FILES
"${GE_CODE_DIR}/ge/common/dump/dump_properties.cc"
"${GE_CODE_DIR}/ge/common/helper/model_helper.cc"
"${GE_CODE_DIR}/ge/common/dump/dump_manager.cc"
"${GE_CODE_DIR}/ge/common/dump/dump_exception.cc"
"${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc"
"${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc"
"${GE_CODE_DIR}/ge/model/ge_root_model.cc"

Loading…
Cancel
Save