Pre Merge pull request !1331 from zhengyuanhua/master
commit
97be4a7c52
@ -0,0 +1,221 @@
|
||||
/**
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "common/dump/dump_exception.h"
|
||||
|
||||
#include "common/ge/datatype_util.h"
|
||||
#include "common/debug/memory_dumper.h"
|
||||
#include "framework/common/debug/log.h"
|
||||
#include "graph/manager/util/debug.h"
|
||||
#include "graph/utils/tensor_utils.h"
|
||||
#include "graph/load/model_manager/model_utils.h"
|
||||
#include "proto/dump_task.pb.h"
|
||||
|
||||
namespace {
|
||||
static uint64_t GetNowTime() {
|
||||
uint64_t ret = 0;
|
||||
mmTimeval tv;
|
||||
if (mmGetTimeOfDay(&tv, nullptr) == 0) {
|
||||
ret = tv.tv_sec * 1000000ULL + tv.tv_usec;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Rewrites `str` in place, turning every ' ', '.', '/' and '\' into '_'.
// All other characters are left untouched; an empty string is a no-op.
// A plain range-for is used so the helper does not depend on <algorithm>
// or on argument-dependent lookup finding std::for_each.
static void ReplaceStringElem(std::string &str) {
  for (char &ch : str) {
    switch (ch) {
      case ' ':
      case '.':
      case '/':
      case '\\':
        ch = '_';
        break;
      default:
        break;
    }
  }
}
|
||||
} // namespace
|
||||
|
||||
namespace ge {
|
||||
// Nothing is released by hand here; members clean up through their own destructors.
DumpException::~DumpException() = default;
|
||||
|
||||
void DumpException::SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
|
||||
vector<void *> &input_addrs, vector<void *> &output_addrs) {
|
||||
OpDescInfo op_desc_info;
|
||||
SaveOpDescInfo(op, task_id, stream_id, op_desc_info);
|
||||
op_desc_info.input_addrs = input_addrs;
|
||||
op_desc_info.output_addrs = output_addrs;
|
||||
op_desc_info_.emplace_back(op_desc_info);
|
||||
}
|
||||
|
||||
// Records the dump metadata of `op` for the given task/stream, resolving the
// input/output addresses through ModelUtils with `model_param`.
//
// @param model_param  runtime parameters handed to ModelUtils::Get{Input,Output}DataAddrs
// @param op           op whose name, type and tensor descriptions are captured
// @param task_id      task id stored with the record (used later for lookup)
// @param stream_id    stream id stored with the record (used later for lookup)
void DumpException::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op,
                                   uint32_t task_id, uint32_t stream_id) {
  OpDescInfo op_desc_info;
  SaveOpDescInfo(op, task_id, stream_id, op_desc_info);
  op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op);
  op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op);
  // The local record is not used afterwards, so move it instead of deep-copying
  // its strings and vectors.
  op_desc_info_.emplace_back(std::move(op_desc_info));
}
|
||||
|
||||
void DumpException::SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
|
||||
OpDescInfo &op_desc_info) {
|
||||
GELOGD("[Save][OpExceptionInfo] Start to save dump op [%s] info of task_id: %u, stream_id: %u",
|
||||
op->GetName().c_str(), task_id, stream_id);
|
||||
op_desc_info.op_name = op->GetName();
|
||||
op_desc_info.op_type = op->GetType();
|
||||
op_desc_info.task_id = task_id;
|
||||
op_desc_info.stream_id = stream_id;
|
||||
for (size_t i = 0; i < op->GetAllInputsSize(); ++i) {
|
||||
GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i);
|
||||
if (input_tensor_desc == nullptr) {
|
||||
continue;
|
||||
}
|
||||
op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat());
|
||||
op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims());
|
||||
op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType());
|
||||
int64_t input_size = 0;
|
||||
|
||||
if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) {
|
||||
GELOGW("[Save][OpExceptionInfo] Op [%s] get input size failed.", op->GetName().c_str());
|
||||
return;
|
||||
}
|
||||
GELOGD("[Save][OpExceptionInfo] Save dump op info, the input size is %ld", input_size);
|
||||
op_desc_info.input_size.emplace_back(input_size);
|
||||
}
|
||||
for (size_t j = 0; j < op->GetOutputsSize(); ++j) {
|
||||
GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j);
|
||||
if (output_tensor_desc == nullptr) {
|
||||
continue;
|
||||
}
|
||||
op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat());
|
||||
op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims());
|
||||
op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType());
|
||||
int64_t output_size = 0;
|
||||
if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) {
|
||||
GELOGW("[Save][OpExceptionInfo] Op [%s] get output size failed.", op->GetName().c_str());
|
||||
return;
|
||||
}
|
||||
GELOGD("[Save][OpExceptionInfo] Save dump op info, the output size is %ld.", output_size);
|
||||
op_desc_info.output_size.emplace_back(output_size);
|
||||
}
|
||||
}
|
||||
|
||||
// For each reported runtime exception, looks up the saved op record by
// (stream id, task id) and writes a dump for it to
// "./<op_type>.<op_name>.<task_id>.<timestamp>", where op_type/op_name have
// ' ', '.', '/' and '\' replaced by '_'.
//
// Per matched op, the following are handed to the dump helpers with that same
// path, in this order: the 8-byte serialized proto length, the serialized
// toolkit::dumpdata::DumpData message, the input device memory, the output
// device memory.
//
// @param exception_infos  exceptions reported by the runtime
// @return SUCCESS when every processed entry was dumped (or when an entry has
//         no saved op record, see note below); PARAM_INVALID when the proto
//         cannot be sized/allocated/serialized or a memory dump fails; the
//         status of MemoryDumper::DumpToFile when that call fails.
Status DumpException::DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const {
  GELOGI("[Dump][Exception] Start to dump exception info.");
  for (const rtExceptionInfo &iter : exception_infos) {
    OpDescInfo op_desc_info;
    if (!GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) {
      // NOTE(review): the first unmatched exception ends the whole pass with
      // SUCCESS, so later entries are never examined. Behavior kept as-is;
      // confirm this is intended rather than `continue`.
      GELOGI("[Dump][Exception] Can't find op desc info ,task id:%u,stream id:%u", iter.taskid, iter.streamid);
      return SUCCESS;
    }

    toolkit::dumpdata::DumpData dump_data;
    dump_data.set_version("2.0");
    dump_data.set_dump_time(GetNowTime());
    dump_data.set_op_name(op_desc_info.op_name);

    // Never index the size vector past its end, even if a record was saved
    // with fewer sizes than formats.
    const size_t input_num = (op_desc_info.input_size.size() < op_desc_info.input_format.size())
                                 ? op_desc_info.input_size.size()
                                 : op_desc_info.input_format.size();
    for (size_t i = 0; i < input_num; ++i) {
      toolkit::dumpdata::OpInput input;
      input.set_data_type(toolkit::dumpdata::OutputDataType(
          DataTypeUtil::GetIrDataType(op_desc_info.input_data_type[i])));
      input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i]));
      for (auto dim : op_desc_info.input_shape[i]) {
        input.mutable_shape()->add_dim(dim);
      }
      input.set_size(op_desc_info.input_size[i]);
      GELOGI("[Dump][Exception] The input size int exception is %ld.", op_desc_info.input_size[i]);
      dump_data.mutable_input()->Add(std::move(input));
    }

    const size_t output_num = (op_desc_info.output_size.size() < op_desc_info.output_format.size())
                                  ? op_desc_info.output_size.size()
                                  : op_desc_info.output_format.size();
    for (size_t j = 0; j < output_num; ++j) {
      toolkit::dumpdata::OpOutput output;
      output.set_data_type(toolkit::dumpdata::OutputDataType(
          DataTypeUtil::GetIrDataType(op_desc_info.output_data_type[j])));
      output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j]));
      for (auto dim : op_desc_info.output_shape[j]) {
        output.mutable_shape()->add_dim(dim);
      }
      output.set_size(op_desc_info.output_size[j]);
      GELOGI("[Dump][Exception] The output size int exception is %ld.", op_desc_info.output_size[j]);
      dump_data.mutable_output()->Add(std::move(output));
    }

    // Sanitize name/type so they cannot introduce path separators or extra dots.
    uint64_t now_time = GetNowTime();
    std::string op_name = op_desc_info.op_name;
    std::string op_type = op_desc_info.op_type;
    ReplaceStringElem(op_name);
    ReplaceStringElem(op_type);
    string dump_file_path =
        "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time);
    GELOGI("[Dump][Exception] The exception dump file path is %s.", dump_file_path.c_str());

    // SerializeToArray takes an int length: reject empty messages and sizes
    // that do not survive the narrowing before allocating anything.
    uint64_t proto_size = dump_data.ByteSizeLong();
    const int proto_len = static_cast<int>(proto_size);
    if ((proto_len <= 0) || (static_cast<uint64_t>(proto_len) != proto_size)) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed.");
      return PARAM_INVALID;
    }
    // nothrow new yields nullptr on failure; it must be checked before use.
    std::unique_ptr<char[]> proto_msg(new (std::nothrow) char[proto_size]);
    if (proto_msg == nullptr) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Allocate buffer for dump data proto failed.");
      return PARAM_INVALID;
    }
    if (!dump_data.SerializeToArray(proto_msg.get(), proto_len)) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed.");
      return PARAM_INVALID;
    }

    GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)),
                      "Failed to dump proto size");
    GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size),
                      "Failed to dump proto msg");

    if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump op [%s] exception input failed.", op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }

    if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) {
      GELOGE(PARAM_INVALID, "[Dump][Exception] Dump op [%s] exception output failed.", op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    GELOGI("[Dump][Exception] Dump op [%s] exception info SUCCESS.", op_desc_info.op_name.c_str());
  }
  return SUCCESS;
}
|
||||
|
||||
bool DumpException::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
|
||||
GELOGI("[Get][OpDescInfo] There are %zu op need to dump.", op_desc_info_.size());
|
||||
for (size_t index = 0; index < op_desc_info_.size(); ++index) {
|
||||
OpDescInfo dump_op_info = op_desc_info_.at(index);
|
||||
if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) {
|
||||
GELOGI("[Get][OpDescInfo] Find exception op [%s] of task_id: %u, stream_id: %u.",
|
||||
dump_op_info.op_name.c_str(), task_id, stream_id);
|
||||
op_desc_info = dump_op_info;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Passes every recorded input address of the op, with its recorded size, to
// Debug::DumpDevMem for `dump_file`.
//
// @param op_desc_info  record providing input_addrs and input_size
// @param dump_file     path handed to Debug::DumpDevMem
// @return SUCCESS when all inputs were dumped; PARAM_INVALID when an address
//         has no matching size entry or Debug::DumpDevMem fails
Status DumpException::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const {
  GELOGI("[Dump][ExceptionInput] Start to dump exception input");
  const auto &addrs = op_desc_info.input_addrs;
  const auto &sizes = op_desc_info.input_size;
  for (size_t i = 0; i < addrs.size(); i++) {
    // There can be fewer sizes than addresses (e.g. a tensor description was
    // skipped while saving); report that instead of letting at() throw.
    if (i >= sizes.size()) {
      GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] The %zu input of op [%s] has no recorded size",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    if (Debug::DumpDevMem(dump_file.data(), addrs[i], sizes[i]) != SUCCESS) {
      GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
|
||||
|
||||
// Passes every recorded output address of the op, with its recorded size, to
// Debug::DumpDevMem for `dump_file`.
//
// @param op_desc_info  record providing output_addrs and output_size
// @param dump_file     path handed to Debug::DumpDevMem
// @return SUCCESS when all outputs were dumped; PARAM_INVALID when an address
//         has no matching size entry or Debug::DumpDevMem fails
Status DumpException::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const {
  GELOGI("[Dump][ExceptionOutput] Start to dump exception output");
  const auto &addrs = op_desc_info.output_addrs;
  const auto &sizes = op_desc_info.output_size;
  for (size_t i = 0; i < addrs.size(); i++) {
    // There can be fewer sizes than addresses (e.g. a tensor description was
    // skipped while saving); report that instead of letting at() throw.
    if (i >= sizes.size()) {
      GELOGE(PARAM_INVALID, "[Dump][ExceptionOutput] The %zu output of op [%s] has no recorded size",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
    if (Debug::DumpDevMem(dump_file.data(), addrs[i], sizes[i]) != SUCCESS) {
      // Message previously said "ExceptionInput ... input data" (copy-paste from the input path).
      GELOGE(PARAM_INVALID, "[Dump][ExceptionOutput] Dump the %zu output data of op [%s] failed",
             i, op_desc_info.op_name.c_str());
      return PARAM_INVALID;
    }
  }
  return SUCCESS;
}
|
||||
} // namespace ge
|
@ -0,0 +1,47 @@
|
||||
/**
|
||||
* Copyright 2019-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef GE_COMMON_DUMP_DUMP_EXCEPTION_H_
|
||||
#define GE_COMMON_DUMP_DUMP_EXCEPTION_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "graph/op_desc.h"
|
||||
#include "framework/common/ge_types.h"
|
||||
#include "graph/load/model_manager/task_info/task_info.h"
|
||||
|
||||
namespace ge {
|
||||
class DumpException {
|
||||
public:
|
||||
DumpException() = default;
|
||||
~DumpException();
|
||||
|
||||
void SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
|
||||
vector<void *> &input_addrs, vector<void *> &output_addrs);
|
||||
void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id);
|
||||
Status DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const;
|
||||
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const;
|
||||
|
||||
private:
|
||||
void SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, OpDescInfo &op_desc_info);
|
||||
Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const;
|
||||
Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const;
|
||||
|
||||
std::vector<OpDescInfo> op_desc_info_;
|
||||
};
|
||||
} // namespace ge
|
||||
|
||||
#endif  // GE_COMMON_DUMP_DUMP_EXCEPTION_H_
|
Loading…
Reference in new issue