From 99b60f4f927887f59895ddefbf48addf8cd5e12d Mon Sep 17 00:00:00 2001 From: liudingyan Date: Mon, 29 Mar 2021 14:58:38 +0800 Subject: [PATCH 1/5] add errlog and modify geloge --1 --- ge/hybrid/common/npu_memory_allocator.cc | 16 +++- ge/hybrid/common/tensor_value.cc | 6 +- .../executor/hybrid_execution_context.cc | 5 +- .../executor/hybrid_model_async_executor.cc | 92 ++++++++++++------- ge/hybrid/executor/worker/execution_engine.cc | 62 +++++++------ .../executor/worker/shape_inference_engine.cc | 47 +++++----- .../executor/worker/task_compile_engine.cc | 3 +- 7 files changed, 140 insertions(+), 91 deletions(-) diff --git a/ge/hybrid/common/npu_memory_allocator.cc b/ge/hybrid/common/npu_memory_allocator.cc index ccd6a624..7561d2e1 100644 --- a/ge/hybrid/common/npu_memory_allocator.cc +++ b/ge/hybrid/common/npu_memory_allocator.cc @@ -39,7 +39,8 @@ AllocationAttr::AllocationAttr(void *try_reuse_addr) : AllocationAttr(0, try_reu NpuMemoryAllocator *NpuMemoryAllocator::GetAllocator() { int32_t device_id = 0; if (rtGetDevice(&device_id) != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to get device id"); + GELOGE(RT_FAILED, "[Get][Device] Failed when %s.", __FUNCTION__); + REPORT_INNER_ERROR("E19999", "rtGetDevice failed when %s.", __FUNCTION__); return nullptr; } @@ -57,7 +58,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { } if (allocate_size == 0) { - GELOGE(MEMALLOC_FAILED, "Memory size is 0, device_id = %u, size = %zu", device_id_, allocate_size); + GELOGE(MEMALLOC_FAILED, "[Check][Param:size_t]Memory size is 0, device_id = %u, size = %zu when %s.", + device_id_, allocate_size, __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Memory size is 0, device_id = %u, size = %zu when %s.", + device_id_, allocate_size, __FUNCTION__); return nullptr; } @@ -68,7 +72,8 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { buffer = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(allocate_size); } else { if (allocate_size > kMaxHbmMemorySize) { - GELOGE(PARAM_INVALID, "Invalid HBM memory size: %zu", allocate_size); + GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu when %s.", allocate_size, __FUNCTION__); + REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu when %s.", allocate_size, __FUNCTION__); return nullptr; } void *try_reuse_addr = nullptr; @@ -87,7 +92,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); } if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to malloc memory, device_id = %u, size = %zu", device_id_, allocate_size); + GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu when %s.", + device_id_, allocate_size, __FUNCTION__); + REPORT_CALL_ERROR("E19999", "malloc memory failed, device_id = %u, size = %zu when %s.", + device_id_, allocate_size, __FUNCTION__); return nullptr; } diff --git a/ge/hybrid/common/tensor_value.cc b/ge/hybrid/common/tensor_value.cc index c691c6f3..81670c57 100644 --- a/ge/hybrid/common/tensor_value.cc +++ b/ge/hybrid/common/tensor_value.cc @@ -32,7 +32,8 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator } if (allocator == nullptr) { - GELOGE(INTERNAL_ERROR, "allocator is NULL"); + GELOGE(INTERNAL_ERROR, "[Check][Param:NpuMemoryAllocator] allocator is NULL, when %s.", __FUNCTION__); + REPORT_INNER_ERROR("E19999", "input allocator is NULL, when %s.", __FUNCTION__); return nullptr; } @@ -42,7 +43,8 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator } buffer = allocator->Allocate(size, attr); if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate memory. size = %zu", size); + GELOGE(MEMALLOC_FAILED, "[Allocate][Memory] Failed. size = %zu, when %s.", size, __FUNCTION__); + REPORT_CALL_ERROR("E19999", "allocate failed, size = %zu, when %s.", size, __FUNCTION__); return nullptr; } diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc index 50f6287c..00a203d7 100644 --- a/ge/hybrid/executor/hybrid_execution_context.cc +++ b/ge/hybrid/executor/hybrid_execution_context.cc @@ -59,7 +59,10 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) { return SUCCESS; } - GELOGE(RT_FAILED, "Failed to invoke rtStreamSynchronize, ret = %d", rt_ret); + GELOGE(RT_FAILED, + "[Invoke][rtStreamSynchronize] failed when GraphExecutionContext %s, ret = %d", __FUNCTION__, rt_ret); + REPORT_CALL_ERROR("E19999", + "invoke rtStreamSynchronize failed when GraphExecutionContext %s, ret = %d", __FUNCTION__, rt_ret); return RT_FAILED; } } // namespace hybrid diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index db183f52..ec1080db 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -52,7 +52,7 @@ void HybridModelAsyncExecutor::SetModelName(const string &model_name) { Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr &data) { GE_CHK_STATUS_EXEC(data_inputer_->Push(data), return domi::DATA_QUEUE_ISFULL, - "Data queue is full, please call again later, model_id %u ", model_id_); + "[Push][Data] Data queue is full, please call again later, model_id %u ", model_id_); GELOGD("EnqueueData successfully. model_id = %u, data_index = %u", data->GetInput().model_id, data->GetInput().index); return SUCCESS; } @@ -60,7 +60,8 @@ Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr Status HybridModelAsyncExecutor::Start(const std::shared_ptr &listener) { GELOGD("HybridModelExecutor::Start IN, has listener = %d", listener != nullptr); std::lock_guard lk(mu_); - GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, "Model already started."); + GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, + "[Check][RunState] Model already started when HybridModelAsyncExecutor %s.", __FUNCTION__); run_flag_ = true; listener_ = listener; @@ -71,7 +72,8 @@ Status HybridModelAsyncExecutor::Start(const std::shared_ptr &lis return RunInternal(); }); - GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, "Failed to start."); + GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, + "[Check][RunState] Failed to start when HybridModelAsyncExecutor %s.", __FUNCTION__); GELOGD("HybridModelExecutor::Start successfully"); return SUCCESS; } @@ -105,26 +107,29 @@ Status HybridModelAsyncExecutor::Init() { executor_ = std::unique_ptr(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_)); GE_CHECK_NOTNULL(executor_); - GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine"); - GE_CHK_STATUS_RET(DumpOpDebug(), "Dump op debug failed in hybrid engine"); + GE_CHK_STATUS_RET(executor_->Init(), + "[Init][HybridModelExecutor] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); + GE_CHK_STATUS_RET(DumpOpDebug(), "[Dump][OpDebug] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups()); if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) { pipe_executor_ = std::unique_ptr(new(std::nothrow) HybridModelPipelineExecutor(model_, device_id_)); GE_CHECK_NOTNULL(pipe_executor_); - GE_CHK_STATUS_RET(pipe_executor_->Init(), "Failed to init hybrid engine"); + GE_CHK_STATUS_RET(pipe_executor_->Init(), + "[Init][HybridModelPipelineExecutor] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); } - GE_CHK_STATUS_RET(InitInputDesc(), "Failed to init input tensors"); + GE_CHK_STATUS_RET(InitInputDesc(), "[Init][InputDesc] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); return SUCCESS; } Status HybridModelAsyncExecutor::PreRun(InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { - GE_CHK_STATUS_RET(SyncVarData(), "Failed to sync var data"); + GE_CHK_STATUS_RET(SyncVarData(), "[Invoke][SyncVarData] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End"); - GE_CHK_STATUS_RET(PrepareInputs(current_data, args), "Failed to copy input data to model"); + GE_CHK_STATUS_RET(PrepareInputs(current_data, args), + "[Invoke][PrepareInputs] failed to copy input data to model when HybridModelAsyncExecutor %s.", __FUNCTION__); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End"); return SUCCESS; } @@ -155,7 +160,7 @@ Status HybridModelAsyncExecutor::RunInternal() { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); - continue, "PreRun failed."); // [No need to check value] + continue, "[Invoke][PreRun] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); // [No need to check value] if (pipe_executor_ != nullptr) { GELOGI("HybridModel will execute in pipeline mode"); @@ -199,7 +204,11 @@ Status HybridModelAsyncExecutor::HandleResult(Status exec_ret, } if (exec_ret != SUCCESS) { - GELOGE(exec_ret, "Failed to execute graph. model_id = %u", model_id_); + GELOGE(exec_ret, + "[Check][Param:Status] failed to execute graph when HybridModelAsyncExecutor %s. model_id = %u", + __FUNCTION__, model_id_); + REPORT_INNER_ERROR("E19999", + "failed to execute graph when HybridModelAsyncExecutor %s. model_id = %u", __FUNCTION__, model_id_); return OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list); } @@ -235,8 +244,12 @@ Status HybridModelAsyncExecutor::SyncVarData() { Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { if (current_data.blobs.size() < input_tensor_desc_.size()) { - GELOGE(PARAM_INVALID, "Blob size mismatches, expect at least %zu, but got %zu", - input_tensor_desc_.size(), current_data.blobs.size()); + GELOGE(PARAM_INVALID, + "[Check][Size]Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s.", + input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", + "Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s.", + input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__); return PARAM_INVALID; } @@ -248,8 +261,12 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy auto tensor_size = input_sizes_[input_index]; if (is_input_dynamic_[input_index]) { if (input_index >= current_data.shapes.size()) { - GELOGE(PARAM_INVALID, "Shape index out of range, index = %zu, shape size = %zu", - input_index, current_data.shapes.size()); + GELOGE(PARAM_INVALID, + "[Check][Range]Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s.", + input_index, current_data.shapes.size(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", + "Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s.", + input_index, current_data.shapes.size(), __FUNCTION__); return PARAM_INVALID; } auto &tensor_desc = input_tensor_desc_[input_index]; @@ -257,15 +274,19 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy std::vector> range; auto range_ret = tensor_desc->GetShapeRange(range); GE_CHK_BOOL_RET_STATUS(range_ret == GRAPH_SUCCESS, INTERNAL_ERROR, - "Get shape range failed, ret=%u.", range_ret); + "[Invoke][GetShapeRange] failed, ret=%u.", range_ret); for (size_t k = 0; k < range.size(); ++k) { if (k >= shape.GetDimNum()) { break; } // range[k].second can be -1 if (shape.GetDim(k) < range[k].first || (range[k].second >= 0 && shape.GetDim(k) > range[k].second)) { - GELOGE(PARAM_INVALID, "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]", - input_index, k, shape.GetDim(k), range[k].first, range[k].second); + GELOGE(PARAM_INVALID, + "[Check][Range]Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]", + input_index, k, shape.GetDim(k), range[k].first, range[k].second); + REPORT_INNER_ERROR("E19999", + "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]", + input_index, k, shape.GetDim(k), range[k].first, range[k].second); return PARAM_INVALID; } } @@ -273,9 +294,8 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy args.input_desc[input_index] = tensor_desc; GELOGD("Update shape of input[%zu] to [%s]", input_index, tensor_desc->MutableShape().ToString().c_str()); GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, tensor_size), - "Failed to calc tensor size, index = %zu, shape = [%s]", - input_index, - tensor_desc->GetShape().ToString().c_str()); + "[Invoke][GetTensorMemorySizeInBytes]Failed to calc tensor size, index = %zu, shape = [%s]", + input_index, tensor_desc->GetShape().ToString().c_str()); GELOGD("Input tensor[%zu] size = %zu", input_index, tensor_size); } @@ -293,7 +313,7 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy auto mem_size = static_cast(tensor_size); GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length, PARAM_INVALID, - "input data size(%lu) does not match model required size(%lu), ret failed.", + "[Check][Size]input data size(%lu) does not match model required size(%lu), ret failed.", data_buf.length, mem_size); @@ -351,7 +371,7 @@ Status HybridModelAsyncExecutor::OnComputeDone(uint32_t data_index, uint32_t res GELOGD("OnComputeDone. model id = %u, data index = %u, execution ret = %u", model_id_, data_index, result_code); if (listener_ != nullptr) { GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_index, result_code, outputs), - "OnComputeDone failed"); + "[Invoke][OnComputeDone] failed."); } return result_code; @@ -365,9 +385,12 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a std::vector &output_tensors = args.outputs; if (output_tensor_desc_list.size() != output_tensors.size()) { GELOGE(INTERNAL_ERROR, - "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu", - output_tensor_desc_list.size(), - output_tensors.size()); + "[Check][Size]Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu " + "when HybridModelAsyncExecutor %s.", + output_tensor_desc_list.size(), output_tensors.size(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu " + "when HybridModelAsyncExecutor %s.", + output_tensor_desc_list.size(), output_tensors.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -399,8 +422,10 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a GE_CHECK_LE(output_size, UINT32_MAX); if (output_tensor.GetSize() < static_cast(output_size)) { GELOGE(INTERNAL_ERROR, - "output[%zu] tensor size(%zu) is not enough for output shape [%s]", - i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str()); + "[Check][Size]output[%zu] tensor size(%zu) is not enough for output shape [%s]", + i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str()); + REPORT_INNER_ERROR("E19999", "output[%zu] tensor size(%zu) is not enough for output shape [%s]", + i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str()); return INTERNAL_ERROR; } @@ -456,7 +481,7 @@ Status HybridModelAsyncExecutor::Execute(const std::vector &inputs, args.input_desc.emplace_back(tensor_desc_ptr); } - GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model."); + GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed when HybridModelAsyncExecutor %s.", __FUNCTION__); for (const auto &output_tensor_desc : args.output_desc) { output_desc.emplace_back(*output_tensor_desc); } @@ -477,13 +502,14 @@ Status HybridModelAsyncExecutor::Execute(const vector &inputs, vector< } HybridModelExecutor::ExecuteArgs args; - GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "Failed to copy input data to model"); + GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "[Invoke][PrepareInputs]Failed to copy input data to model"); GELOGD("Done copying input data successfully."); - GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model."); + GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed."); std::vector output_tensor_info_list; OutputData output_data; - GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), "Failed to copy outputs."); + GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), + "[Invoke][CopyOutputs]Failed to copy outputs."); GELOGD("Done copying output data successfully. output count = %zu", output_tensor_info_list.size()); int out_index = 0; @@ -534,7 +560,7 @@ Status HybridModelAsyncExecutor::DumpOpDebug() { loop_cond = const_cast(varible_loop_cond->GetData()); } data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "LoadDumpInfo failed in hybrid engine"); + GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "[Invoke][LoadDumpInfo] failed in hybrid engine"); GELOGD("Dump op debug SUCCESS in hybrid engine"); } return SUCCESS; diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index de3bdc37..3d21fac3 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -102,11 +102,13 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { if (output_tensor->GetSize() < static_cast(tensor_size)) { GELOGE(INTERNAL_ERROR, - "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s", - node_item.NodeName().c_str(), - output_idx, - tensor_size, - output_tensor->DebugString().c_str()); + "[Check][Size][%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s when %s.", + node_item.NodeName().c_str(), output_idx, tensor_size, + output_tensor->DebugString().c_str(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", + "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s when %s.", + node_item.NodeName().c_str(), output_idx, tensor_size, + output_tensor->DebugString().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -128,7 +130,7 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(context_id, &runtime_infer_ctx), "Failed to get RuntimeInferenceContext, context_id = %s", context_id.c_str()); GE_CHK_STATUS_RET(runtime_infer_ctx->SetTensor(node_item.node_id, output_idx, std::move(tensor)), - "Failed to SetTensor, node = %s, output_index = %d", node_item.NodeName().c_str(), output_idx); + "[Set][Tensor] Failed, node = %s, output_index = %d", node_item.NodeName().c_str(), output_idx); GELOGD("[%s] Output[%d] cached successfully in context: %s. node_id = %d, shape = [%s]", node_item.NodeName().c_str(), output_idx, @@ -173,7 +175,8 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel * Status NodeDoneCallback::ProfilingReport() { auto node = context_->GetNodeItem().node; if (node == nullptr) { - GELOGE(PARAM_INVALID, "Get node is nullptr"); + GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr when %s.", __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Get node failed, when %s.", __FUNCTION__); return PARAM_INVALID; } @@ -190,7 +193,8 @@ Status NodeDoneCallback::ProfilingReport() { std::vector task_desc_info; auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info); if (profiling_ret != RT_ERROR_NONE) { - GELOGE(profiling_ret, "Get task info of node[%s] failed.", node->GetName().c_str()); + GELOGE(profiling_ret, "[Get][TaskDescInfo] of node:%s failed, when %s.", node->GetName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "GetTaskDescInfo of node:%s failed, when %s.", node->GetName().c_str(), __FUNCTION__); return profiling_ret; } @@ -202,7 +206,8 @@ Status NodeDoneCallback::ProfilingReport() { Status NodeDoneCallback::DumpDynamicNode() { auto node = context_->GetNodeItem().node; if (node == nullptr) { - GELOGE(PARAM_INVALID, "Get node is nullptr"); + GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr when %s.", __FUNCTION__); + REPORT_INNER_ERROR("E19999", "get node is nullptr when %s.", __FUNCTION__); return PARAM_INVALID; } auto op_desc = node->GetOpDesc(); @@ -211,13 +216,13 @@ Status NodeDoneCallback::DumpDynamicNode() { vector output_addrs; for (int i = 0; i < context_->NumInputs(); i++) { auto tensor_value = context_->GetInput(i); - GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "Tensor value is nullptr"); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Get][Tensor] value is nullptr."); uint64_t input_addr = reinterpret_cast(tensor_value->GetData()); input_addrs.emplace_back(input_addr); } for (int j = 0; j < context_->NumOutputs(); j++) { auto tensor_value = context_->GetOutput(j); - GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "Tensor value is nullptr"); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Get][Tensor] value is nullptr."); uint64_t output_addr = reinterpret_cast(tensor_value->GetData()); output_addrs.emplace_back(output_addr); } @@ -245,11 +250,12 @@ Status NodeDoneCallback::DumpDynamicNode() { void *global_step = context_->GetExecutionContext()->global_step; dump_op_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "Failed to launch dump op in hybird model"); + GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "[Launch][DumpOp] failed in hybird model when %s.", __FUNCTION__); auto rt_ret = rtStreamSynchronize(stream); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtStreamSynchronize failed"); + GELOGE(rt_ret, "[Call][rtStreamSynchronize] failed when %s.", __FUNCTION__); + REPORT_CALL_ERROR("E19999", "call rtStreamSynchronize failed when %s.", __FUNCTION__); return rt_ret; } return SUCCESS; @@ -264,12 +270,12 @@ Status NodeDoneCallback::OnNodeDone() { const DumpProperties &dump_properties = context_->GetDumpProperties(); if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) { GELOGI("Start to dump dynamic shape op"); - GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node"); + GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed when %s.", __FUNCTION__); } if (ProfilingManager::Instance().ProfilingModelExecuteOn()) { - GE_CHK_STATUS_RET(ProfilingReport(), "Report node[%s] to profiling failed.", - node_item.NodeName().c_str()); + GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed when %s.", + node_item.NodeName().c_str(), __FUNCTION__); } // release workspace @@ -292,8 +298,7 @@ Status NodeDoneCallback::OnNodeDone() { } GE_CHK_STATUS_RET(context_->PropagateOutputs(), - "[%s] Failed to propagate outputs failed", - node_item.NodeName().c_str()); + "[Propagate][Outputs] of [%s] failed when %s.", node_item.NodeName().c_str(), __FUNCTION__); RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[PropagateOutputs] End"); } @@ -333,7 +338,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, const std::function &callback) { const auto &task = node_state.GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][KernelTask] of [%s] is null when %s.", node_state.GetName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "GetKernelTask of %s is null when %s.", node_state.GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -348,8 +354,7 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, GE_CHECK_NOTNULL(executor); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] Start"); GE_CHK_STATUS_RET(executor->PrepareTask(*task, task_context), - "[%s] Failed to prepare task", - node_state.GetName().c_str()); + "[Prepare][Task] for [%s] failed when %s", node_state.GetName().c_str(), __FUNCTION__); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] End"); GELOGD("[%s] Done task preparation successfully.", node_state.GetName().c_str()); @@ -360,7 +365,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, } } - GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "Failed to validate input tensors."); + GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "[Validate][InputTensors] for %s failed when %s.", + node_state.GetName().c_str(), __FUNCTION__); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ValidateInputTensors] End"); if (context.profiling_level > 0) { @@ -414,11 +420,10 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const input_tensor->GetSize()); } else { GELOGE(INTERNAL_ERROR, - "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu", - task_context.GetNodeName(), - i, - expected_size, - input_tensor->GetSize()); + "[Check][Size] for [%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu when %s.", + task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu when %s.", + task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__); return INTERNAL_ERROR; } } @@ -432,8 +437,7 @@ Status ExecutionEngine::PropagateOutputs(const NodeItem &node_item, GraphExecutionContext &context) { if (node_item.shape_inference_type != DEPEND_COMPUTE) { GE_CHK_STATUS_RET(task_context.PropagateOutputs(), - "[%s] Failed to propagate outputs.", - node_item.NodeName().c_str()); + "[Propagate][Outputs] for [%s] failed when ExecutionEngine %s.", node_item.NodeName().c_str(), __FUNCTION__); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PropagateOutputs] End"); GELOGD("[%s] Done propagating outputs successfully.", node_item.NodeName().c_str()); } diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index 27919589..97ea77a1 100755 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -70,7 +70,7 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { { RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start"); GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), - "Invoke InferShapeAndType failed."); + "[Invoke][InferShapeAndType] for %s failed when %s.", node_item.NodeName().c_str(), __FUNCTION__); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End"); } @@ -172,8 +172,7 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndType(node)); GELOGD("[%s] Done invoking InferShapeAndType", node->GetName().c_str()); GE_CHK_STATUS_RET(UpdatePeerNodeShape(*node), - "[%s] Failed to update shapes of peer node.", - node->GetName().c_str()); + "[Update][PeerNodeShape] failed for [%s] when %s.", node->GetName().c_str(), __FUNCTION__); } for (auto &it : fused_subgraph.output_mapping) { @@ -205,7 +204,10 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) { GE_CHECK_NOTNULL(peer_op_desc); auto peer_input_desc = peer_op_desc->MutableInputDesc(peer_anchor->GetIdx()); if (peer_input_desc == nullptr) { - GELOGE(GRAPH_FAILED, "peer_input_desc is nullptr"); + GELOGE(GRAPH_FAILED, "[Call][MutableInputDesc] for %s return nullptr when ShapeInferenceEngine %s.", + peer_op_desc->GetName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "%s call MutableInputDesc return nullptr when ShapeInferenceEngine %s.", + peer_op_desc->GetName().c_str(), __FUNCTION__); continue; } @@ -230,8 +232,11 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, const auto &tensor_shape = tensor_desc.MutableShape(); if (tensor_shape.IsUnknownShape()) { if (!fallback_with_range) { - GELOGE(INTERNAL_ERROR, "Output shape is still unknown after shape inference. shape = [%s]", - tensor_shape.ToString().c_str()); + GELOGE(INTERNAL_ERROR, + "[Is][UnknownShape] Output shape is still unknown after shape inference. " + "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Output shape is still unknown after shape inference. " + "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -239,9 +244,10 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, std::vector> shape_range; GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range"); if (shape_range.size() != shape.size()) { - GELOGE(INTERNAL_ERROR, "Number of shape ranges (%zu) mismatches that of dims (%zu)", - shape_range.size(), - shape.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu)" + " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Number of shape ranges (%zu) mismatches that of dims (%zu)" + " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -265,7 +271,10 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str()); uint32_t type_size; if (!TypeUtils::GetDataTypeLength(data_type, type_size)) { - GELOGE(INTERNAL_ERROR, "Failed to get data type size"); + GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s when ShapeInferenceEngine %s.", + TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "GetDataTypeLength failed for type:%s when ShapeInferenceEngine %s.", + TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -273,15 +282,13 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, for (const auto &dim : shape) { GE_CHECK_GE(dim, 0); GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim), - "Shape size overflow, shape = [%s]", - GeShape(shape).ToString().c_str()); + "[Check][Overflow] Shape size overflow, shape = [%s]", GeShape(shape).ToString().c_str()); tensor_size *= dim; } GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1), - "Tensor size is too large: %ld, shape = [%s]", - tensor_size, - GeShape(shape).ToString().c_str()); + "[Check][Overflow]Tensor size is too large: %ld, shape = [%s]", + tensor_size, GeShape(shape).ToString().c_str()); tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment; return SUCCESS; } @@ -295,15 +302,13 @@ Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bo // modify on copy auto dims = shape.GetDims(); GE_CHK_STATUS_RET(CanonicalizeShape(*tensor_desc, dims, fallback_with_range), - "[%s] Failed to canonicalize shape for output %zu", - node_item.NodeName().c_str(), - output_index); + "[Canonicalize][Shape] failed for [%s], output %zu, when ShapeInferenceEngine %s.", + node_item.NodeName().c_str(), output_index, __FUNCTION__); int64_t tensor_size; GE_CHK_STATUS_RET(CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size), - "[%s] Failed to calc tensor size for output %zu", - node_item.NodeName().c_str(), - output_index); + "[Calc][TensorSize] failed for [%s], output %zu when ShapeInferenceEngine %s.", + node_item.NodeName().c_str(), output_index, __FUNCTION__); GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size); (void) TensorUtils::SetSize(*tensor_desc, tensor_size); } diff --git a/ge/hybrid/executor/worker/task_compile_engine.cc b/ge/hybrid/executor/worker/task_compile_engine.cc index f80374c6..b4353400 100755 --- a/ge/hybrid/executor/worker/task_compile_engine.cc +++ b/ge/hybrid/executor/worker/task_compile_engine.cc @@ -32,7 +32,8 @@ Status TaskCompileEngine::Compile(NodeState &node_state, GraphExecutionContext * shared_ptr kernel_task; auto ret = node_item.node_executor->CompileTask(*context->model, node_item.node, kernel_task); RECORD_COMPILE_EVENT(context, node_state.GetName().c_str(), "[Compile] End"); - GE_CHK_STATUS_RET(ret, "Failed to create task for node: %s", node_item.NodeName().c_str()); + GE_CHK_STATUS_RET(ret, "[Compile][Task] failed for node: %s, when TaskCompileEngine %s.", + node_item.NodeName().c_str(), __FUNCTION__); node_state.SetKernelTask(kernel_task); GELOGI("Compiling node %s successfully", node_state.GetName().c_str()); return SUCCESS; From 1bcc0df932cc32f39a74154c54b1946e60e9f80c Mon Sep 17 00:00:00 2001 From: liudingyan Date: Mon, 29 Mar 2021 17:26:40 +0800 Subject: [PATCH 2/5] add errlog and modify geloge --2 --- ge/hybrid/executor/hybrid_model_executor.cc | 2 +- .../hybrid_model_pipeline_executor.cc | 30 ++++++++++++++----- ge/hybrid/executor/hybrid_profiler.cc | 8 +++-- ge/hybrid/executor/node_done_manager.cc | 3 +- ge/hybrid/executor/node_state.cc | 17 ++++++++--- metadef | 2 +- parser | 2 +- 7 files changed, 46 insertions(+), 18 deletions(-) diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc index cd130e95..85abb7bd 100755 --- a/ge/hybrid/executor/hybrid_model_executor.cc +++ b/ge/hybrid/executor/hybrid_model_executor.cc @@ -72,7 +72,7 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) { if (ret == END_OF_SEQUENCE) { args.is_eos = true; } else { - GE_CHK_STATUS_RET(ret, "Failed to execute model"); + GE_CHK_STATUS_RET(ret, "[Invoke][ExecuteGraphInternal]Failed when HybridModelExecutor %s.", __FUNCTION__); } return SUCCESS; } diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc index 4706fa97..46650c02 100644 --- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc +++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc @@ -59,7 +59,10 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v task_queue_.Pop(task_info); GELOGD("[Executor: %d] Got task, stage = %d, iteration = %ld", id_, task_info.stage, task_info.iteration); if (task_info.iteration >= pipe_config_->iteration_end) { - GELOGE(INTERNAL_ERROR, "[Executor: %d] Unexpected iteration: %d", id_, task_info.iteration); + GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %d when StageExecutor %s.", + id_, task_info.iteration, __FUNCTION__); + REPORT_INNER_ERROR("E19999", "[Executor: %d] Unexpected iteration: %d when StageExecutor %s.", + id_, task_info.iteration, __FUNCTION__); return INTERNAL_ERROR; } @@ -75,7 +78,8 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v if (task_info.stage == 0) { GELOGD("[Executor: %d] To ResetExecutionContext", id_); - GE_CHK_STATUS_RET(ResetExecutionContext(context_), "[Executor: %d] Failed to reset context", id_); + GE_CHK_STATUS_RET(ResetExecutionContext(context_), + "[Invoke][ResetExecutionContext][Executor: %d] Failed to reset context", id_); context_.iteration = task_info.iteration; GE_CHK_STATUS_RET_NOLOG(SetInputs(inputs, input_desc)); } @@ -92,8 +96,11 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v auto sync_result = Synchronize(); if (sync_result != SUCCESS) { - GELOGE(sync_result, "[Executor: %d] Failed to sync result. iteration = %d", id_, task_info.iteration); - + GELOGE(sync_result, + "[Invoke][Synchronize][Executor: %d] Failed to sync result when StageExecutor %s. iteration = %d", + id_, __FUNCTION__, task_info.iteration); + REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result when StageExecutor %s. iteration = %d", + id_, __FUNCTION__, task_info.iteration); context_.profiler->Dump(std::cout); context_.callback_manager->Destroy(); RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id)); @@ -242,7 +249,10 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar GELOGD("Start to sync result of executor[%zu]", i); auto ret = futures[i].get(); if (ret != SUCCESS) { - GELOGE(ret, "[Executor: %zu] Failed to schedule tasks.", i); + GELOGE(ret, "[Check][Result][Executor: %zu] Failed to schedule tasks when HybridModelPipelineExecutor %s.", + i, __FUNCTION__); + REPORT_INNER_ERROR("E19999", "[Executor: %zu] Failed to schedule tasks when HybridModelPipelineExecutor %s.", + i, __FUNCTION__); has_error = true; continue; } @@ -250,7 +260,10 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar ret = stage_executors_[i]->Synchronize(); if (ret != SUCCESS) { - GELOGE(ret, "[Executor: %zu] Failed to synchronize result.", i); + GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu] when HybridModelPipelineExecutor %s.", + i, __FUNCTION__); + REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result when HybridModelPipelineExecutor %s.", + i, __FUNCTION__); has_error = true; continue; } @@ -266,13 +279,14 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar iteration_ = config_.iteration_end; if (has_error) { - GELOGE(FAILED, "Error occurred while execution"); + GELOGE(FAILED, "[Check][Error]Error occurred while execution when HybridModelPipelineExecutor %s.", __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Error occurred while execution when HybridModelPipelineExecutor %s.", __FUNCTION__); return FAILED; } auto last_iter_executor_idx = loop_count % stage_executors_.size(); GE_CHK_STATUS_RET(stage_executors_[last_iter_executor_idx]->GetOutputs(args.outputs, args.output_desc), - "Failed to get output from executor[%zu]", last_iter_executor_idx); + "[Get][Outputs]Failed from executor[%zu]", last_iter_executor_idx); return SUCCESS; } diff --git a/ge/hybrid/executor/hybrid_profiler.cc b/ge/hybrid/executor/hybrid_profiler.cc index 3b6865bb..0f074b29 100644 --- a/ge/hybrid/executor/hybrid_profiler.cc +++ b/ge/hybrid/executor/hybrid_profiler.cc @@ -40,7 +40,8 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { char buf[kEventDescMax]; if (vsnprintf_s(buf, kEventDescMax, kEventDescMax - 1, fmt, args) == -1) { - GELOGE(FAILED, "Format %s failed.", fmt); + GELOGE(FAILED, "[Parse][Param:fmt]Format %s failed when HybridProfiler %s.", fmt, __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Parse Format %s failed when HybridProfiler %s.", fmt, __FUNCTION__); va_end(args); return; } @@ -48,7 +49,10 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { va_end(args); auto index = counter_++; if (index >= static_cast(events_.size())) { - GELOGE(INTERNAL_ERROR, "index out of range. index = %d, max event size = %zu", index, events_.size()); + GELOGE(INTERNAL_ERROR, + "[Check][Range]index out of range. index = %d, max event size = %zu", index, events_.size()); + REPORT_INNER_ERROR("E19999", "index out of range when HybridProfiler %s. index = %d, max event size = %zu", + __FUNCTION__, index, events_.size()); return; } auto &evt = events_[index]; diff --git a/ge/hybrid/executor/node_done_manager.cc b/ge/hybrid/executor/node_done_manager.cc index f0d4324a..95dfb67f 100644 --- a/ge/hybrid/executor/node_done_manager.cc +++ b/ge/hybrid/executor/node_done_manager.cc @@ -28,7 +28,8 @@ bool NodeDoneManager::Cond::Await() { if (!cv_.wait_for(lk, std::chrono::seconds(kDefaultWaitTimeoutInSec), [&]() { return is_released_ || is_cancelled_; })) { - GELOGE(INTERNAL_ERROR, "Wait timed out."); + GELOGE(INTERNAL_ERROR, "[Invoke][wait_for]Wait timed out when %s.", __FUNCTION__); + REPORT_INNER_ERROR("E19999", "wait timed out when %s.", __FUNCTION__); return false; } diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc index 13727250..5eb8ee54 100644 --- a/ge/hybrid/executor/node_state.cc +++ b/ge/hybrid/executor/node_state.cc @@ -67,7 +67,10 @@ Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target Format format = input_desc.GetFormat(); DataType data_type = input_desc.GetDataType(); if (TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size) != GRAPH_SUCCESS) { - GELOGE(FAILED, "[%s] Calculate tensor memory size failed.", node_item.NodeName().c_str()); + GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s] when ShapeInferenceState %s.", + node_item.NodeName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for [%s] when ShapeInferenceState %s.", + node_item.NodeName().c_str(), __FUNCTION__); return FAILED; } } @@ -121,13 +124,19 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex } if (context.GetStatus() != SUCCESS) { - GELOGE(FAILED, "[%s] Await pending shape cancelled", node_item.NodeName().c_str()); + GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled when %s.", + node_item.NodeName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "[%s] Await pending shape cancelled when %s.", + node_item.NodeName().c_str(), __FUNCTION__); break; } } if (!wait_success) { - GELOGE(FAILED, "[%s] Wait for shape timeout.", node_item.NodeName().c_str()); + GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout when %s.", + node_item.NodeName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout when %s.", + node_item.NodeName().c_str(), __FUNCTION__); return FAILED; } } @@ -232,7 +241,7 @@ Status NodeState::WaitForPrepareDone() { if (prepare_future_.valid()) { GELOGD("[%s] Start to wait for prepare future.", GetName().c_str()); GE_CHK_STATUS_RET(prepare_future_.get(), - "[%s] PreRun failed.", GetName().c_str()); + "[Check][Status][%s] PreRun failed when NodeState %s.", GetName().c_str(), __FUNCTION__); } return SUCCESS; diff --git a/metadef b/metadef index 620e9b9a..4ff5e398 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 620e9b9ac3210db3e4cf47babfb23d248bb9f17e +Subproject commit 4ff5e3987f2e5d2980019defacaf0891861c84fc diff --git a/parser b/parser index d744541c..51fb6c48 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit d744541c6ca7f6966c1befacc9f83f53b0829e0a +Subproject commit 51fb6c4850906e8342598d47eccfca0b87ffea59 From bab28210623884cd306e0230018ae39b68923067 Mon Sep 17 00:00:00 2001 From: liudingyan Date: Tue, 30 Mar 2021 10:17:01 +0800 Subject: [PATCH 3/5] modify code to add report errmsg --- ge/hybrid/common/npu_memory_allocator.cc | 16 +-- ge/hybrid/common/tensor_value.cc | 4 +- .../executor/hybrid_execution_context.cc | 2 +- .../executor/hybrid_model_async_executor.cc | 117 ++++++++++-------- ge/hybrid/executor/hybrid_model_executor.cc | 2 +- .../hybrid_model_pipeline_executor.cc | 20 ++- ge/hybrid/executor/hybrid_profiler.cc | 4 +- ge/hybrid/executor/node_done_manager.cc | 4 +- ge/hybrid/executor/node_state.cc | 18 +-- ge/hybrid/executor/worker/execution_engine.cc | 40 +++--- .../executor/worker/shape_inference_engine.cc | 44 ++++--- .../executor/worker/task_compile_engine.cc | 3 +- 12 files changed, 147 insertions(+), 127 deletions(-) diff --git a/ge/hybrid/common/npu_memory_allocator.cc b/ge/hybrid/common/npu_memory_allocator.cc index 7561d2e1..08f91f02 100644 --- a/ge/hybrid/common/npu_memory_allocator.cc +++ b/ge/hybrid/common/npu_memory_allocator.cc @@ -39,7 +39,7 @@ AllocationAttr::AllocationAttr(void *try_reuse_addr) : AllocationAttr(0, try_reu NpuMemoryAllocator *NpuMemoryAllocator::GetAllocator() { int32_t device_id = 0; if (rtGetDevice(&device_id) != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[Get][Device] Failed when %s.", __FUNCTION__); + GELOGE(RT_FAILED, "[Get][Device] Failed."); REPORT_INNER_ERROR("E19999", "rtGetDevice failed when %s.", __FUNCTION__); return nullptr; } @@ -58,8 +58,8 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { } if (allocate_size == 0) { - GELOGE(MEMALLOC_FAILED, "[Check][Param:size_t]Memory size is 0, device_id = %u, size = %zu when %s.", - device_id_, allocate_size, __FUNCTION__); + GELOGE(MEMALLOC_FAILED, "[Check][Param:size_t]Memory size is 0, device_id = %u, size = %zu.", + device_id_, allocate_size); REPORT_INNER_ERROR("E19999", "Memory size is 0, device_id = %u, size = %zu when %s.", device_id_, allocate_size, __FUNCTION__); return nullptr; @@ -72,8 +72,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { buffer = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(allocate_size); } else { if (allocate_size > kMaxHbmMemorySize) { - GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu when %s.", allocate_size, __FUNCTION__); - REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu when %s.", allocate_size, __FUNCTION__); + GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu bigger than limit:%lu, check invalid.", + allocate_size, kMaxHbmMemorySize); + REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu bigger than limit:%lu, check invalid when %s.", + allocate_size, kMaxHbmMemorySize, __FUNCTION__); return nullptr; } void *try_reuse_addr = nullptr; @@ -92,8 +94,8 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); } if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu when %s.", - device_id_, allocate_size, __FUNCTION__); + GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu.", + device_id_, allocate_size); REPORT_CALL_ERROR("E19999", "malloc memory failed, device_id = %u, size = %zu when %s.", device_id_, allocate_size, __FUNCTION__); return nullptr; diff --git a/ge/hybrid/common/tensor_value.cc b/ge/hybrid/common/tensor_value.cc index 81670c57..48f9cfc4 100644 --- a/ge/hybrid/common/tensor_value.cc +++ b/ge/hybrid/common/tensor_value.cc @@ -32,7 +32,7 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator } if (allocator == nullptr) { - GELOGE(INTERNAL_ERROR, "[Check][Param:NpuMemoryAllocator] allocator is NULL, when %s.", __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Check][Param:NpuMemoryAllocator] allocator is NULL."); REPORT_INNER_ERROR("E19999", "input allocator is NULL, when %s.", __FUNCTION__); return nullptr; } @@ -43,7 +43,7 @@ std::unique_ptr TensorBuffer::Create(NpuMemoryAllocator *allocator } buffer = allocator->Allocate(size, attr); if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "[Allocate][Memory] Failed. size = %zu, when %s.", size, __FUNCTION__); + GELOGE(MEMALLOC_FAILED, "[Allocate][Memory] Failed. size = %zu.", size); REPORT_CALL_ERROR("E19999", "allocate failed, size = %zu, when %s.", size, __FUNCTION__); return nullptr; } diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc index 00a203d7..dd964448 100644 --- a/ge/hybrid/executor/hybrid_execution_context.cc +++ b/ge/hybrid/executor/hybrid_execution_context.cc @@ -60,7 +60,7 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) { } GELOGE(RT_FAILED, - "[Invoke][rtStreamSynchronize] failed when GraphExecutionContext %s, ret = %d", __FUNCTION__, rt_ret); + "[Invoke][rtStreamSynchronize] failed, ret = %d", rt_ret); REPORT_CALL_ERROR("E19999", "invoke rtStreamSynchronize failed when GraphExecutionContext %s, ret = %d", __FUNCTION__, rt_ret); return RT_FAILED; diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index ec1080db..2720d72d 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -51,8 +51,13 @@ void HybridModelAsyncExecutor::SetModelName(const string &model_name) { } Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr &data) { - GE_CHK_STATUS_EXEC(data_inputer_->Push(data), return domi::DATA_QUEUE_ISFULL, - "[Push][Data] Data queue is full, please call again later, model_id %u ", model_id_); + if(data_inputer_->Push(data) != SUCCESS){ + REPORT_CALL_ERROR("E19999", "Data queue is full, please call again later when %s, model_id %u.", + __FUNCTION__, model_id_); + GELOGE(domi::DATA_QUEUE_ISFULL, + "[Push][Data] Data queue is full, please call again later, model_id %u ", model_id_); + return domi::DATA_QUEUE_ISFULL; + } GELOGD("EnqueueData successfully. model_id = %u, data_index = %u", data->GetInput().model_id, data->GetInput().index); return SUCCESS; } @@ -60,9 +65,12 @@ Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr Status HybridModelAsyncExecutor::Start(const std::shared_ptr &listener) { GELOGD("HybridModelExecutor::Start IN, has listener = %d", listener != nullptr); std::lock_guard lk(mu_); - GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, - "[Check][RunState] Model already started when HybridModelAsyncExecutor %s.", __FUNCTION__); - + if(run_flag_){ + REPORT_INNER_ERROR("E19999", "Model already started when HybridModelAsyncExecutor %s, model_id:%u.", + __FUNCTION__, model_id_); + GELOGE(INTERNAL_ERROR, "[Check][RunState] Model already started, model_id:%u.", model_id_); + return INTERNAL_ERROR; + } run_flag_ = true; listener_ = listener; future_ = std::async(std::launch::async, [&]() -> Status { @@ -73,7 +81,7 @@ Status HybridModelAsyncExecutor::Start(const std::shared_ptr &lis }); GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, - "[Check][RunState] Failed to start when HybridModelAsyncExecutor %s.", __FUNCTION__); + "[Check][RunState] Failed to start, model_id:%u.", model_id_); GELOGD("HybridModelExecutor::Start successfully"); return SUCCESS; } @@ -108,8 +116,8 @@ Status HybridModelAsyncExecutor::Init() { executor_ = std::unique_ptr(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_)); GE_CHECK_NOTNULL(executor_); GE_CHK_STATUS_RET(executor_->Init(), - "[Init][HybridModelExecutor] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); - GE_CHK_STATUS_RET(DumpOpDebug(), "[Dump][OpDebug] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); + "[Init][HybridModelExecutor] failed, model_id:%u.", model_id_); + GE_CHK_STATUS_RET(DumpOpDebug(), "[Dump][OpDebug] failed, model_id:%u.", model_id_); GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups()); if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) { @@ -117,19 +125,19 @@ Status HybridModelAsyncExecutor::Init() { std::unique_ptr(new(std::nothrow) HybridModelPipelineExecutor(model_, device_id_)); GE_CHECK_NOTNULL(pipe_executor_); GE_CHK_STATUS_RET(pipe_executor_->Init(), - "[Init][HybridModelPipelineExecutor] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); + "[Init][HybridModelPipelineExecutor] failed, model_id:%u.", model_id_); } - GE_CHK_STATUS_RET(InitInputDesc(), "[Init][InputDesc] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); + GE_CHK_STATUS_RET(InitInputDesc(), "[Init][InputDesc] failed, model_id:%u.", model_id_); return SUCCESS; } Status HybridModelAsyncExecutor::PreRun(InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { - GE_CHK_STATUS_RET(SyncVarData(), "[Invoke][SyncVarData] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); + GE_CHK_STATUS_RET(SyncVarData(), "[Invoke][SyncVarData] failed, model_id:%u.", model_id_); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End"); GE_CHK_STATUS_RET(PrepareInputs(current_data, args), - "[Invoke][PrepareInputs] failed to copy input data to model when HybridModelAsyncExecutor %s.", __FUNCTION__); + "[Invoke][PrepareInputs] failed to copy input data to model, model_id:%u.", model_id_); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End"); return SUCCESS; } @@ -160,7 +168,7 @@ Status HybridModelAsyncExecutor::RunInternal() { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); - continue, "[Invoke][PreRun] failed when HybridModelAsyncExecutor %s.", __FUNCTION__); // [No need to check value] + continue, "[Invoke][PreRun] failed, model_id:%u.", model_id_); // [No need to check value] if (pipe_executor_ != nullptr) { GELOGI("HybridModel will execute in pipeline mode"); @@ -204,9 +212,7 @@ Status HybridModelAsyncExecutor::HandleResult(Status exec_ret, } if (exec_ret != SUCCESS) { - GELOGE(exec_ret, - "[Check][Param:Status] failed to execute graph when HybridModelAsyncExecutor %s. model_id = %u", - __FUNCTION__, model_id_); + GELOGE(exec_ret, "[Check][Param:Status] failed to execute graph. model_id = %u", model_id_); REPORT_INNER_ERROR("E19999", "failed to execute graph when HybridModelAsyncExecutor %s. model_id = %u", __FUNCTION__, model_id_); return OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list); @@ -245,11 +251,11 @@ Status HybridModelAsyncExecutor::SyncVarData() { Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { if (current_data.blobs.size() < input_tensor_desc_.size()) { GELOGE(PARAM_INVALID, - "[Check][Size]Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s.", - input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__); + "[Check][Size]Blob size mismatches, expect at least %zu, but got %zu, model_id = %u", + input_tensor_desc_.size(), current_data.blobs.size(), model_id_); REPORT_INNER_ERROR("E19999", - "Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s.", - input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__); + "Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s, model_id = %u.", + input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__, model_id_); return PARAM_INVALID; } @@ -262,11 +268,11 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy if (is_input_dynamic_[input_index]) { if (input_index >= current_data.shapes.size()) { GELOGE(PARAM_INVALID, - "[Check][Range]Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s.", - input_index, current_data.shapes.size(), __FUNCTION__); + "[Check][Range]Shape index out of range, index = %zu, shape size = %zu model_id = %u.", + input_index, current_data.shapes.size(), model_id_); REPORT_INNER_ERROR("E19999", - "Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s.", - input_index, current_data.shapes.size(), __FUNCTION__); + "Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s, model_id = %u.", + input_index, current_data.shapes.size(), __FUNCTION__, model_id_); return PARAM_INVALID; } auto &tensor_desc = input_tensor_desc_[input_index]; @@ -274,7 +280,7 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy std::vector> range; auto range_ret = tensor_desc->GetShapeRange(range); GE_CHK_BOOL_RET_STATUS(range_ret == GRAPH_SUCCESS, INTERNAL_ERROR, - "[Invoke][GetShapeRange] failed, ret=%u.", range_ret); + "[Invoke][GetShapeRange] failed, ret=%u, model_id = %u.", range_ret, model_id_); for (size_t k = 0; k < range.size(); ++k) { if (k >= shape.GetDimNum()) { break; @@ -282,11 +288,11 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy // range[k].second can be -1 if (shape.GetDim(k) < range[k].first || (range[k].second >= 0 && shape.GetDim(k) > range[k].second)) { GELOGE(PARAM_INVALID, - "[Check][Range]Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]", - input_index, k, shape.GetDim(k), range[k].first, range[k].second); + "[Check][Range]Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld], model_id = %u.", + input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); REPORT_INNER_ERROR("E19999", - "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]", - input_index, k, shape.GetDim(k), range[k].first, range[k].second); + "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld], model_id = %u.", + input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); return PARAM_INVALID; } } @@ -294,8 +300,8 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy args.input_desc[input_index] = tensor_desc; GELOGD("Update shape of input[%zu] to [%s]", input_index, tensor_desc->MutableShape().ToString().c_str()); GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, tensor_size), - "[Invoke][GetTensorMemorySizeInBytes]Failed to calc tensor size, index = %zu, shape = [%s]", - input_index, tensor_desc->GetShape().ToString().c_str()); + "[Invoke][GetTensorMemorySizeInBytes]Failed to calc tensor size, index = %zu, shape = [%s], model_id = %u.", + input_index, tensor_desc->GetShape().ToString().c_str(), model_id_); GELOGD("Input tensor[%zu] size = %zu", input_index, tensor_size); } @@ -311,12 +317,16 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy GELOGD("To copy input data for input[%zu]", input_index); const DataBuffer &data_buf = blobs[input_index]; auto mem_size = static_cast(tensor_size); - GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length, - PARAM_INVALID, - "[Check][Size]input data size(%lu) does not match model required size(%lu), ret failed.", - data_buf.length, - mem_size); + if(mem_size < data_buf.length){ + REPORT_INNER_ERROR("E19999", + "input data size(%lu) does not match model required size(%lu) when %s, ret failed, model_id = %u.", + data_buf.length, mem_size, __FUNCTION__, model_id_); + GELOGE(PARAM_INVALID, + "[Check][Size]input data size(%lu) does not match model required size(%lu), ret failed, model_id = %u.", + data_buf.length, mem_size, model_id_); + return PARAM_INVALID; + } if (data_buf.length > 0) { GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%zu] memaddr[%p] mem_size[%zu] datasize[%lu]", model_->root_runtime_param_.graph_id, @@ -371,7 +381,7 @@ Status HybridModelAsyncExecutor::OnComputeDone(uint32_t data_index, uint32_t res GELOGD("OnComputeDone. model id = %u, data index = %u, execution ret = %u", model_id_, data_index, result_code); if (listener_ != nullptr) { GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_index, result_code, outputs), - "[Invoke][OnComputeDone] failed."); + "[Invoke][OnComputeDone] failed, model_id = %u.", model_id_); } return result_code; @@ -385,12 +395,11 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a std::vector &output_tensors = args.outputs; if (output_tensor_desc_list.size() != output_tensors.size()) { GELOGE(INTERNAL_ERROR, - "[Check][Size]Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu " - "when HybridModelAsyncExecutor %s.", - output_tensor_desc_list.size(), output_tensors.size(), __FUNCTION__); - REPORT_INNER_ERROR("E19999", "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu " - "when HybridModelAsyncExecutor %s.", - output_tensor_desc_list.size(), output_tensors.size(), __FUNCTION__); + "[Check][Size]Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu, model_id = %u.", + output_tensor_desc_list.size(), output_tensors.size(), model_id_); + REPORT_INNER_ERROR("E19999", "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu, " + "when HybridModelAsyncExecutor %s, model_id = %u.", + output_tensor_desc_list.size(), output_tensors.size(), __FUNCTION__, model_id_); return INTERNAL_ERROR; } @@ -422,10 +431,12 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a GE_CHECK_LE(output_size, UINT32_MAX); if (output_tensor.GetSize() < static_cast(output_size)) { GELOGE(INTERNAL_ERROR, - "[Check][Size]output[%zu] tensor size(%zu) is not enough for output shape [%s]", - i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str()); - REPORT_INNER_ERROR("E19999", "output[%zu] tensor size(%zu) is not enough for output shape [%s]", - i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str()); + "[Check][Size]output[%zu] tensor size(%zu) is not enough for output shape [%s], model_id = %u.", + i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_); + REPORT_INNER_ERROR("E19999", + "output[%zu] tensor size(%zu) is not enough for output shape [%s] model_id = %u," + " when HybridModelAsyncExecutor %s.", + i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_, __FUNCTION__); return INTERNAL_ERROR; } @@ -481,7 +492,7 @@ Status HybridModelAsyncExecutor::Execute(const std::vector &inputs, args.input_desc.emplace_back(tensor_desc_ptr); } - GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed when HybridModelAsyncExecutor %s.", __FUNCTION__); + GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_); for (const auto &output_tensor_desc : args.output_desc) { output_desc.emplace_back(*output_tensor_desc); } @@ -502,14 +513,15 @@ Status HybridModelAsyncExecutor::Execute(const vector &inputs, vector< } HybridModelExecutor::ExecuteArgs args; - GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "[Invoke][PrepareInputs]Failed to copy input data to model"); + GE_CHK_STATUS_RET(PrepareInputs(input_data, args), + "[Invoke][PrepareInputs]Failed to copy input data to model, model_id = %u", model_id_); GELOGD("Done copying input data successfully."); - GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed."); + GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_); std::vector output_tensor_info_list; OutputData output_data; GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), - "[Invoke][CopyOutputs]Failed to copy outputs."); + "[Invoke][CopyOutputs]Failed to copy outputs, model_id = %u.", model_id_); GELOGD("Done copying output data successfully. output count = %zu", output_tensor_info_list.size()); int out_index = 0; @@ -560,7 +572,8 @@ Status HybridModelAsyncExecutor::DumpOpDebug() { loop_cond = const_cast(varible_loop_cond->GetData()); } data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "[Invoke][LoadDumpInfo] failed in hybrid engine"); + GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), + "[Invoke][LoadDumpInfo] failed in hybrid engine, model_id = %u.", model_id_); GELOGD("Dump op debug SUCCESS in hybrid engine"); } return SUCCESS; diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc index 85abb7bd..51233636 100755 --- a/ge/hybrid/executor/hybrid_model_executor.cc +++ b/ge/hybrid/executor/hybrid_model_executor.cc @@ -72,7 +72,7 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) { if (ret == END_OF_SEQUENCE) { args.is_eos = true; } else { - GE_CHK_STATUS_RET(ret, "[Invoke][ExecuteGraphInternal]Failed when HybridModelExecutor %s.", __FUNCTION__); + GE_CHK_STATUS_RET(ret, "[Invoke][ExecuteGraphInternal] Failed, ret:%d.", ret); } return SUCCESS; } diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc index 46650c02..098c194c 100644 --- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc +++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc @@ -59,8 +59,8 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v task_queue_.Pop(task_info); GELOGD("[Executor: %d] Got task, stage = %d, iteration = %ld", id_, task_info.stage, task_info.iteration); if (task_info.iteration >= pipe_config_->iteration_end) { - GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %d when StageExecutor %s.", - id_, task_info.iteration, __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %d.", + id_, task_info.iteration); REPORT_INNER_ERROR("E19999", "[Executor: %d] Unexpected iteration: %d when StageExecutor %s.", id_, task_info.iteration, __FUNCTION__); return INTERNAL_ERROR; @@ -97,10 +97,10 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v auto sync_result = Synchronize(); if (sync_result != SUCCESS) { GELOGE(sync_result, - "[Invoke][Synchronize][Executor: %d] Failed to sync result when StageExecutor %s. iteration = %d", - id_, __FUNCTION__, task_info.iteration); - REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result when StageExecutor %s. iteration = %d", - id_, __FUNCTION__, task_info.iteration); + "[Invoke][Synchronize][Executor: %d] Failed to sync result:%d. iteration = %d", + id_, sync_result, task_info.iteration); + REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result:%d when StageExecutor %s. iteration = %d", + id_, sync_result, __FUNCTION__, task_info.iteration); context_.profiler->Dump(std::cout); context_.callback_manager->Destroy(); RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id)); @@ -249,8 +249,7 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar GELOGD("Start to sync result of executor[%zu]", i); auto ret = futures[i].get(); if (ret != SUCCESS) { - GELOGE(ret, "[Check][Result][Executor: %zu] Failed to schedule tasks when HybridModelPipelineExecutor %s.", - i, __FUNCTION__); + GELOGE(ret, "[Check][Result][Executor: %zu] Failed to schedule tasks.", i); REPORT_INNER_ERROR("E19999", "[Executor: %zu] Failed to schedule tasks when HybridModelPipelineExecutor %s.", i, __FUNCTION__); has_error = true; @@ -260,8 +259,7 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar ret = stage_executors_[i]->Synchronize(); if (ret != SUCCESS) { - GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu] when HybridModelPipelineExecutor %s.", - i, __FUNCTION__); + GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu].", i); REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result when HybridModelPipelineExecutor %s.", i, __FUNCTION__); has_error = true; @@ -279,7 +277,7 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar iteration_ = config_.iteration_end; if (has_error) { - GELOGE(FAILED, "[Check][Error]Error occurred while execution when HybridModelPipelineExecutor %s.", __FUNCTION__); + GELOGE(FAILED, "[Check][Error]Error occurred while execution."); REPORT_INNER_ERROR("E19999", "Error occurred while execution when HybridModelPipelineExecutor %s.", __FUNCTION__); return FAILED; } diff --git a/ge/hybrid/executor/hybrid_profiler.cc b/ge/hybrid/executor/hybrid_profiler.cc index 0f074b29..bbe82dad 100644 --- a/ge/hybrid/executor/hybrid_profiler.cc +++ b/ge/hybrid/executor/hybrid_profiler.cc @@ -40,8 +40,8 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { char buf[kEventDescMax]; if (vsnprintf_s(buf, kEventDescMax, kEventDescMax - 1, fmt, args) == -1) { - GELOGE(FAILED, "[Parse][Param:fmt]Format %s failed when HybridProfiler %s.", fmt, __FUNCTION__); - REPORT_INNER_ERROR("E19999", "Parse Format %s failed when HybridProfiler %s.", fmt, __FUNCTION__); + GELOGE(FAILED, "[Parse][Param:fmt]Format %s failed.", fmt); + REPORT_CALL_ERROR("E19999", "Parse Format %s failed when HybridProfiler %s.", fmt, __FUNCTION__); va_end(args); return; } diff --git a/ge/hybrid/executor/node_done_manager.cc b/ge/hybrid/executor/node_done_manager.cc index 95dfb67f..921ea1e2 100644 --- a/ge/hybrid/executor/node_done_manager.cc +++ b/ge/hybrid/executor/node_done_manager.cc @@ -28,8 +28,8 @@ bool NodeDoneManager::Cond::Await() { if (!cv_.wait_for(lk, std::chrono::seconds(kDefaultWaitTimeoutInSec), [&]() { return is_released_ || is_cancelled_; })) { - GELOGE(INTERNAL_ERROR, "[Invoke][wait_for]Wait timed out when %s.", __FUNCTION__); - REPORT_INNER_ERROR("E19999", "wait timed out when %s.", __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][wait_for]Wait timed out."); + REPORT_INNER_ERROR("E19999", "wait timed out[%d] when %s.", kDefaultWaitTimeoutInSec, __FUNCTION__); return false; } diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc index 5eb8ee54..f30901c9 100644 --- a/ge/hybrid/executor/node_state.cc +++ b/ge/hybrid/executor/node_state.cc @@ -67,8 +67,8 @@ Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target Format format = input_desc.GetFormat(); DataType data_type = input_desc.GetDataType(); if (TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size) != GRAPH_SUCCESS) { - GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s] when ShapeInferenceState %s.", - node_item.NodeName().c_str(), __FUNCTION__); + GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s].", + node_item.NodeName().c_str()); REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for [%s] when ShapeInferenceState %s.", node_item.NodeName().c_str(), __FUNCTION__); return FAILED; @@ -124,8 +124,8 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex } if (context.GetStatus() != SUCCESS) { - GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled when %s.", - node_item.NodeName().c_str(), __FUNCTION__); + GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled.", + node_item.NodeName().c_str()); REPORT_CALL_ERROR("E19999", "[%s] Await pending shape cancelled when %s.", node_item.NodeName().c_str(), __FUNCTION__); break; @@ -133,10 +133,10 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex } if (!wait_success) { - GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout when %s.", - node_item.NodeName().c_str(), __FUNCTION__); - REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout when %s.", - node_item.NodeName().c_str(), __FUNCTION__); + GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout:%d.", + node_item.NodeName().c_str(), kWaitInternal); + REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout:%d when %s.", + node_item.NodeName().c_str(), kWaitInternal, __FUNCTION__); return FAILED; } } @@ -241,7 +241,7 @@ Status NodeState::WaitForPrepareDone() { if (prepare_future_.valid()) { GELOGD("[%s] Start to wait for prepare future.", GetName().c_str()); GE_CHK_STATUS_RET(prepare_future_.get(), - "[Check][Status][%s] PreRun failed when NodeState %s.", GetName().c_str(), __FUNCTION__); + "[Check][Status][%s] PreRun failed.", GetName().c_str()); } return SUCCESS; diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index 3d21fac3..797c0092 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -102,9 +102,9 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { if (output_tensor->GetSize() < static_cast(tensor_size)) { GELOGE(INTERNAL_ERROR, - "[Check][Size][%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s when %s.", + "[Check][Size][%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s.", node_item.NodeName().c_str(), output_idx, tensor_size, - output_tensor->DebugString().c_str(), __FUNCTION__); + output_tensor->DebugString().c_str()); REPORT_INNER_ERROR("E19999", "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s when %s.", node_item.NodeName().c_str(), output_idx, tensor_size, @@ -175,7 +175,7 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel * Status NodeDoneCallback::ProfilingReport() { auto node = context_->GetNodeItem().node; if (node == nullptr) { - GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr when %s.", __FUNCTION__); + GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr."); REPORT_INNER_ERROR("E19999", "Get node failed, when %s.", __FUNCTION__); return PARAM_INVALID; } @@ -193,7 +193,7 @@ Status NodeDoneCallback::ProfilingReport() { std::vector task_desc_info; auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info); if (profiling_ret != RT_ERROR_NONE) { - GELOGE(profiling_ret, "[Get][TaskDescInfo] of node:%s failed, when %s.", node->GetName().c_str(), __FUNCTION__); + GELOGE(profiling_ret, "[Get][TaskDescInfo] of node:%s failed.", node->GetName().c_str()); REPORT_CALL_ERROR("E19999", "GetTaskDescInfo of node:%s failed, when %s.", node->GetName().c_str(), __FUNCTION__); return profiling_ret; } @@ -206,7 +206,7 @@ Status NodeDoneCallback::ProfilingReport() { Status NodeDoneCallback::DumpDynamicNode() { auto node = context_->GetNodeItem().node; if (node == nullptr) { - GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr when %s.", __FUNCTION__); + GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr."); REPORT_INNER_ERROR("E19999", "get node is nullptr when %s.", __FUNCTION__); return PARAM_INVALID; } @@ -250,12 +250,12 @@ Status NodeDoneCallback::DumpDynamicNode() { void *global_step = context_->GetExecutionContext()->global_step; dump_op_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "[Launch][DumpOp] failed in hybird model when %s.", __FUNCTION__); + GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "[Launch][DumpOp] failed in hybird model."); auto rt_ret = rtStreamSynchronize(stream); if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "[Call][rtStreamSynchronize] failed when %s.", __FUNCTION__); - REPORT_CALL_ERROR("E19999", "call rtStreamSynchronize failed when %s.", __FUNCTION__); + GELOGE(rt_ret, "[Call][rtStreamSynchronize] failed, ret = %d.", rt_ret); + REPORT_CALL_ERROR("E19999", "call rtStreamSynchronize failed when %s, ret = %d.", __FUNCTION__, rt_ret); return rt_ret; } return SUCCESS; @@ -270,12 +270,12 @@ Status NodeDoneCallback::OnNodeDone() { const DumpProperties &dump_properties = context_->GetDumpProperties(); if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) { GELOGI("Start to dump dynamic shape op"); - GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed when %s.", __FUNCTION__); + GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed."); } if (ProfilingManager::Instance().ProfilingModelExecuteOn()) { - GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed when %s.", - node_item.NodeName().c_str(), __FUNCTION__); + GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed.", + node_item.NodeName().c_str()); } // release workspace @@ -298,7 +298,7 @@ Status NodeDoneCallback::OnNodeDone() { } GE_CHK_STATUS_RET(context_->PropagateOutputs(), - "[Propagate][Outputs] of [%s] failed when %s.", node_item.NodeName().c_str(), __FUNCTION__); + "[Propagate][Outputs] of [%s] failed.", node_item.NodeName().c_str()); RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[PropagateOutputs] End"); } @@ -338,8 +338,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, const std::function &callback) { const auto &task = node_state.GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[Get][KernelTask] of [%s] is null when %s.", node_state.GetName().c_str(), __FUNCTION__); - REPORT_CALL_ERROR("E19999", "GetKernelTask of %s is null when %s.", node_state.GetName().c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Get][KernelTask] of [%s] is null.", node_state.GetName().c_str()); + REPORT_INNER_ERROR("E19999", "GetKernelTask of %s is null when %s.", node_state.GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -354,7 +354,7 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, GE_CHECK_NOTNULL(executor); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] Start"); GE_CHK_STATUS_RET(executor->PrepareTask(*task, task_context), - "[Prepare][Task] for [%s] failed when %s", node_state.GetName().c_str(), __FUNCTION__); + "[Prepare][Task] for [%s] failed.", node_state.GetName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] End"); GELOGD("[%s] Done task preparation successfully.", node_state.GetName().c_str()); @@ -365,8 +365,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, } } - GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "[Validate][InputTensors] for %s failed when %s.", - node_state.GetName().c_str(), __FUNCTION__); + GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "[Validate][InputTensors] for %s failed.", + node_state.GetName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ValidateInputTensors] End"); if (context.profiling_level > 0) { @@ -420,8 +420,8 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const input_tensor->GetSize()); } else { GELOGE(INTERNAL_ERROR, - "[Check][Size] for [%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu when %s.", - task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__); + "[Check][Size] for [%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu.", + task_context.GetNodeName(), i, expected_size, input_tensor->GetSize()); REPORT_INNER_ERROR("E19999", "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu when %s.", task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__); return INTERNAL_ERROR; @@ -437,7 +437,7 @@ Status ExecutionEngine::PropagateOutputs(const NodeItem &node_item, GraphExecutionContext &context) { if (node_item.shape_inference_type != DEPEND_COMPUTE) { GE_CHK_STATUS_RET(task_context.PropagateOutputs(), - "[Propagate][Outputs] for [%s] failed when ExecutionEngine %s.", node_item.NodeName().c_str(), __FUNCTION__); + "[Propagate][Outputs] for [%s] failed.", node_item.NodeName().c_str()); RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PropagateOutputs] End"); GELOGD("[%s] Done propagating outputs successfully.", node_item.NodeName().c_str()); } diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index 97ea77a1..f8fbbb9c 100755 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -70,7 +70,7 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { { RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start"); GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), - "[Invoke][InferShapeAndType] for %s failed when %s.", node_item.NodeName().c_str(), __FUNCTION__); + "[Invoke][InferShapeAndType] for %s failed.", node_item.NodeName().c_str()); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End"); } @@ -172,7 +172,7 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndType(node)); GELOGD("[%s] Done invoking InferShapeAndType", node->GetName().c_str()); GE_CHK_STATUS_RET(UpdatePeerNodeShape(*node), - "[Update][PeerNodeShape] failed for [%s] when %s.", node->GetName().c_str(), __FUNCTION__); + "[Update][PeerNodeShape] failed for [%s].", node->GetName().c_str()); } for (auto &it : fused_subgraph.output_mapping) { @@ -204,8 +204,7 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) { GE_CHECK_NOTNULL(peer_op_desc); auto peer_input_desc = peer_op_desc->MutableInputDesc(peer_anchor->GetIdx()); if (peer_input_desc == nullptr) { - GELOGE(GRAPH_FAILED, "[Call][MutableInputDesc] for %s return nullptr when ShapeInferenceEngine %s.", - peer_op_desc->GetName().c_str(), __FUNCTION__); + GELOGE(GRAPH_FAILED, "[Call][MutableInputDesc] for %s return nullptr.", peer_op_desc->GetName().c_str()); REPORT_CALL_ERROR("E19999", "%s call MutableInputDesc return nullptr when ShapeInferenceEngine %s.", peer_op_desc->GetName().c_str(), __FUNCTION__); continue; @@ -233,8 +232,8 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, if (tensor_shape.IsUnknownShape()) { if (!fallback_with_range) { GELOGE(INTERNAL_ERROR, - "[Is][UnknownShape] Output shape is still unknown after shape inference. " - "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__); + "[Is][UnknownShape] Output shape is still unknown after shape inference. shape = [%s].", + tensor_shape.ToString().c_str()); REPORT_INNER_ERROR("E19999", "Output shape is still unknown after shape inference. " "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__); return INTERNAL_ERROR; @@ -244,8 +243,8 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, std::vector> shape_range; GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range"); if (shape_range.size() != shape.size()) { - GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu)" - " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu).", + shape_range.size(), shape.size()); REPORT_INNER_ERROR("E19999", "Number of shape ranges (%zu) mismatches that of dims (%zu)" " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__); return INTERNAL_ERROR; @@ -271,8 +270,8 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str()); uint32_t type_size; if (!TypeUtils::GetDataTypeLength(data_type, type_size)) { - GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s when ShapeInferenceEngine %s.", - TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s.", + TypeUtils::DataTypeToSerialString(data_type).c_str()); REPORT_CALL_ERROR("E19999", "GetDataTypeLength failed for type:%s when ShapeInferenceEngine %s.", TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__); return INTERNAL_ERROR; @@ -287,7 +286,7 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, } GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1), - "[Check][Overflow]Tensor size is too large: %ld, shape = [%s]", + "[Check][Overflow]Tensor size is too large: %ld, shape = [%s] Shape size will overflow when add align.", tensor_size, GeShape(shape).ToString().c_str()); tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment; return SUCCESS; @@ -301,14 +300,23 @@ Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bo const auto &shape = tensor_desc->MutableShape(); // modify on copy auto dims = shape.GetDims(); - GE_CHK_STATUS_RET(CanonicalizeShape(*tensor_desc, dims, fallback_with_range), - "[Canonicalize][Shape] failed for [%s], output %zu, when ShapeInferenceEngine %s.", - node_item.NodeName().c_str(), output_index, __FUNCTION__); - + auto _status = CanonicalizeShape(*tensor_desc, dims, fallback_with_range); + if(_status != SUCCESS){ + REPORT_CALL_ERROR("E19999", "Invoke CanonicalizeShape failed when ShapeInferenceEngine %s, node:%s, output:%zu.", + node_item.NodeName().c_str(), __FUNCTION__, output_index); + GELOGE(ge::FAILED, "[Canonicalize][Shape] failed for [%s], output %zu.", + node_item.NodeName().c_str(), output_index); + return _status; + } int64_t tensor_size; - GE_CHK_STATUS_RET(CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size), - "[Calc][TensorSize] failed for [%s], output %zu when ShapeInferenceEngine %s.", - node_item.NodeName().c_str(), output_index, __FUNCTION__); + _status = CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size); + if(_status != SUCCESS){ + REPORT_CALL_ERROR("E19999", "Invoke CalcTensorSize failed when ShapeInferenceEngine %s, node:%s, output:%zu.", + node_item.NodeName().c_str(), __FUNCTION__, output_index); + GELOGE(ge::FAILED, "[Calc][TensorSize] failed for [%s], output %zu.", + node_item.NodeName().c_str(), output_index); + return _status; + } GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size); (void) TensorUtils::SetSize(*tensor_desc, tensor_size); } diff --git a/ge/hybrid/executor/worker/task_compile_engine.cc b/ge/hybrid/executor/worker/task_compile_engine.cc index b4353400..f7da9acd 100755 --- a/ge/hybrid/executor/worker/task_compile_engine.cc +++ b/ge/hybrid/executor/worker/task_compile_engine.cc @@ -32,8 +32,7 @@ Status TaskCompileEngine::Compile(NodeState &node_state, GraphExecutionContext * shared_ptr kernel_task; auto ret = node_item.node_executor->CompileTask(*context->model, node_item.node, kernel_task); RECORD_COMPILE_EVENT(context, node_state.GetName().c_str(), "[Compile] End"); - GE_CHK_STATUS_RET(ret, "[Compile][Task] failed for node: %s, when TaskCompileEngine %s.", - node_item.NodeName().c_str(), __FUNCTION__); + GE_CHK_STATUS_RET(ret, "[Compile][Task] failed for node: %s.", node_item.NodeName().c_str()); node_state.SetKernelTask(kernel_task); GELOGI("Compiling node %s successfully", node_state.GetName().c_str()); return SUCCESS; From d85228d6fb34efa59e5d23a5c4dc940347a529af Mon Sep 17 00:00:00 2001 From: liudingyan Date: Tue, 30 Mar 2021 19:14:32 +0800 Subject: [PATCH 4/5] modify geloge and add err report --- ge/hybrid/executor/rt_callback_manager.cc | 10 +- ge/hybrid/executor/subgraph_context.cc | 21 +- ge/hybrid/executor/subgraph_executor.cc | 108 +++++----- ge/hybrid/model/graph_item.cc | 3 +- ge/hybrid/model/hybrid_model_builder.cc | 232 ++++++++++++---------- 5 files changed, 206 insertions(+), 168 deletions(-) diff --git a/ge/hybrid/executor/rt_callback_manager.cc b/ge/hybrid/executor/rt_callback_manager.cc index d3989f31..9ceba0a1 100644 --- a/ge/hybrid/executor/rt_callback_manager.cc +++ b/ge/hybrid/executor/rt_callback_manager.cc @@ -27,7 +27,8 @@ Status CallbackManager::RegisterCallback(rtStream_t stream, rtCallback_t callbac GE_CHK_RT_RET(rtEventCreate(&event)); auto rt_ret = rtEventRecord(event, stream); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to invoke rtEventRecord, error code = %d", rt_ret); + GELOGE(RT_FAILED, "[Invoke][rtEventRecord] failed, error code = %d", rt_ret); + REPORT_CALL_ERROR("E19999", "Invoke rtEventRecord failed when %s, error code = %d", __FUNCTION__, rt_ret); (void) rtEventDestroy(event); return RT_FAILED; } @@ -50,7 +51,8 @@ Status CallbackManager::Init() { return CallbackProcess(context); }, ctx); if (!ret_future_.valid()) { - GELOGE(INTERNAL_ERROR, "Failed to init callback manager."); + GELOGE(INTERNAL_ERROR, "[Check][ShareState]Failed to init callback manager."); + REPORT_INNER_ERROR("E19999", "Failed to init callback manager."); return INTERNAL_ERROR; } @@ -73,7 +75,9 @@ Status CallbackManager::CallbackProcess(rtContext_t context) { auto rt_err = rtEventSynchronize(event); if (rt_err != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtEventSynchronize failed. ret = %d", rt_err); + GELOGE(RT_FAILED, "[Invoke][rtEventSynchronize] failed. ret = %d", rt_err); + REPORT_CALL_ERROR("E19999", + "Invoke rtEventSynchronize failed when CallbackManager %s, ret = %d.", __FUNCTION__, rt_err); GE_CHK_RT(rtEventDestroy(event)); return RT_FAILED; } diff --git a/ge/hybrid/executor/subgraph_context.cc b/ge/hybrid/executor/subgraph_context.cc index 0fa112a4..34cc2bbf 100644 --- a/ge/hybrid/executor/subgraph_context.cc +++ b/ge/hybrid/executor/subgraph_context.cc @@ -50,9 +50,11 @@ NodeStatePtr SubgraphContext::GetOrCreateNodeState(const NodeItem *node_item) { Status SubgraphContext::SetInput(int index, const TensorValue &tensor) { if (static_cast(index) >= all_inputs_.size()) { GELOGE(INTERNAL_ERROR, - "output index output range. all input num = %zu, input index = %d", - all_inputs_.size(), - index); + "[Check][Param:index]output index output range. all input num = %zu, input index = %d", + all_inputs_.size(), index); + REPORT_INNER_ERROR("E19999", + "input param index out range when SubgraphContext %s, all input num = %zu, input index = %d.", + __FUNCTION__, all_inputs_.size(), index); return INTERNAL_ERROR; } all_inputs_[index] = tensor; @@ -68,10 +70,11 @@ Status SubgraphContext::SetOutput(const NodeItem &node_item, int output_index, c auto index = node_item.output_start + output_index; if ((output_index >= node_item.num_outputs) || (static_cast(index) >= all_outputs_.size())) { GELOGE(INTERNAL_ERROR, - "output index output range. all output num = %zu, node_item = %s, output index = %d", - all_outputs_.size(), - node_item.DebugString().c_str(), - output_index); + "[Check][Param:output_index]output index output range. all output num = %zu, node_item = %s," + "output index = %d.", all_outputs_.size(), node_item.DebugString().c_str(), output_index); + REPORT_INNER_ERROR("E19999", "output index output range when SubgraphContext %s. " + "all output num = %zu, node_item = %s, output index = %d.", + __FUNCTION__, all_outputs_.size(), node_item.DebugString().c_str(), output_index); return INTERNAL_ERROR; } @@ -126,7 +129,9 @@ Status SubgraphContext::Await(const NodePtr &node) { void SubgraphContext::OnError(Status error) { if (error != END_OF_SEQUENCE) { - GELOGE(error, "[%s] Error occurred while executing graph.", graph_item_->GetName().c_str()); + GELOGE(error, "[Check][Param:error][%s] Error occurred while executing graph.", graph_item_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Error occurred while executing graph when SubgraphContext %s.", + graph_item_->GetName().c_str(), __FUNCTION__); } node_done_manager_.Destroy(); } diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc index 57e4052d..3bd78579 100644 --- a/ge/hybrid/executor/subgraph_executor.cc +++ b/ge/hybrid/executor/subgraph_executor.cc @@ -44,7 +44,8 @@ Status SubgraphExecutor::Init(const std::vector &inputs, const std::vector &input_desc) { subgraph_context_.reset(new(std::nothrow)SubgraphContext(graph_item_, context_)); GE_CHECK_NOTNULL(subgraph_context_); - GE_CHK_STATUS_RET(subgraph_context_->Init(), "[%s] Failed to init subgraph context.", graph_item_->GetName().c_str()); + GE_CHK_STATUS_RET(subgraph_context_->Init(), + "[Init][SubgraphContext][%s] Failed to init subgraph context.", graph_item_->GetName().c_str()); shape_inference_engine_.reset(new(std::nothrow) ShapeInferenceEngine(context_, subgraph_context_.get())); GE_CHECK_NOTNULL(shape_inference_engine_); @@ -55,8 +56,8 @@ Status SubgraphExecutor::Init(const std::vector &inputs, graph_item_->GetName().c_str()); } else { GE_CHK_STATUS_RET(InitInputsForKnownShape(inputs), - "[%s] Failed to init subgraph executor for known shape subgraph.", - graph_item_->GetName().c_str()); + "[Invoke][InitInputsForKnownShape][%s] Failed to init subgraph executor for known shape subgraph.", + graph_item_->GetName().c_str()); } return SUCCESS; @@ -67,8 +68,12 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorGetInputNodes(); if (inputs.size() < input_nodes.size()) { - GELOGE(INTERNAL_ERROR, "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.", - graph_item_->GetName().c_str(), inputs.size(), input_nodes.size()); + GELOGE(INTERNAL_ERROR, + "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.", + graph_item_->GetName().c_str(), inputs.size(), input_nodes.size()); + REPORT_INNER_ERROR("E19999", + "Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs when SubgraphExecutor %s.", + graph_item_->GetName().c_str(), inputs.size(), input_nodes.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -87,9 +92,7 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorSetInput(*input_node, kDataInputIndex, input_tensor), - "[%s] Failed to set input tensor[%zu]", - graph_item_->GetName().c_str(), - i); + "[Invoke][SetInput] failed for grap_item[%s] input tensor[%zu]", graph_item_->GetName().c_str(), i); if (force_infer_shape_ || input_node->is_dynamic) { GELOGD("[%s] Start to update input[%zu] for subgraph data node.", graph_item_->GetName().c_str(), i); @@ -112,11 +115,11 @@ Status SubgraphExecutor::InitInputsForKnownShape(const std::vector auto &parent_input_index = input_index_mapping[i]; if (static_cast(parent_input_index) >= inputs.size()) { GELOGE(INTERNAL_ERROR, - "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs", - graph_item_->GetName().c_str(), - inputs.size(), - parent_input_index + 1); - + "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs", + graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1); + REPORT_INNER_ERROR("E19999", + "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs when %s.", + graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1, __FUNCTION__); return INTERNAL_ERROR; } @@ -136,10 +139,10 @@ Status SubgraphExecutor::ExecuteAsync(const std::vector &inputs, const std::vector &input_desc, const std::vector &outputs) { GELOGD("[%s] is dynamic = %s", graph_item_->GetName().c_str(), graph_item_->IsDynamic() ? "true" : "false"); - GE_CHK_STATUS_RET(Init(inputs, input_desc), "[%s] Failed to init executor.", graph_item_->GetName().c_str()); + GE_CHK_STATUS_RET(Init(inputs, input_desc), "[Invoke][Init]failed for [%s].", graph_item_->GetName().c_str()); if (!outputs.empty()) { GE_CHK_STATUS_RET(EnableOutputZeroCopy(outputs), - "Failed to enable output zero copy by user provided outputs."); + "[Invoke][EnableOutputZeroCopy] Failed by user provided outputs."); } if (!graph_item_->IsDynamic()) { return ExecuteAsyncForKnownShape(inputs); @@ -194,12 +197,11 @@ Status SubgraphExecutor::ExecuteAsync(TaskContext &task_context) { } GE_CHK_STATUS_RET(ExecuteAsync(inputs, input_desc), - "[%s] Failed to execute subgraph.", - graph_item_->GetName().c_str()); + "[Invoke][ExecuteAsync] failed for [%s].", graph_item_->GetName().c_str()); GE_CHK_STATUS_RET(SetOutputsToParentNode(task_context), - "[%s] Failed to set output shapes to parent node.", - graph_item_->GetName().c_str()); + "[Invoke][SetOutputsToParentNode][%s] Failed to set output shapes to parent node.", + graph_item_->GetName().c_str()); return SUCCESS; } @@ -239,7 +241,7 @@ Status SubgraphExecutor::PrepareNodes(int group) { if (node_item.kernel_task == nullptr) { GELOGW("[%s] Node of static shape got no task.", node_item.NodeName().c_str()); GE_CHK_STATUS_RET(TaskCompileEngine::Compile(*p_node_state, context_), - "[%s] Failed to create task.", p_node_state->GetName().c_str()); + "[Invoke][Compile] failed for [%s].", p_node_state->GetName().c_str()); } else { node_state->SetKernelTask(node_item.kernel_task); } @@ -248,7 +250,9 @@ Status SubgraphExecutor::PrepareNodes(int group) { GE_CHECK_NOTNULL(unique_task_context); const auto &task = node_state->GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][KernelTask] failed for[%s], NodeTask is null.", node_state->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "invoke GetKernelTask failed for %s when %s, nodetask is null.", + node_state->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } auto shared_task_context = std::shared_ptr(unique_task_context.release()); @@ -261,8 +265,10 @@ Status SubgraphExecutor::PrepareNodes(int group) { GELOGD("Got end of sequence"); return SUCCESS; } - GELOGE(INTERNAL_ERROR, "[%s] Error occurs while launching tasks. quit from preparing nodes.", - graph_item_->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][State][%s] Error occurs while launching tasks. quit from preparing nodes.", + graph_item_->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Error occurs while launching tasks. quit from preparing nodes when %s.", + graph_item_->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -275,9 +281,9 @@ Status SubgraphExecutor::PrepareNodes(int group) { Status SubgraphExecutor::InferShape(ShapeInferenceEngine *shape_inference_engine, NodeState &node_state) const { HYBRID_CHK_STATUS_RET(shape_inference_engine->InferShape(node_state), - "[%s] Failed to InferShape.", node_state.GetName().c_str()); + "[Invoke][InferShape] failed for [%s].", node_state.GetName().c_str()); HYBRID_CHK_STATUS_RET(shape_inference_engine->PropagateOutputShapes(node_state), - "[%s] Failed to PropagateOutputShapes.", node_state.GetName().c_str()); + "[Invoke][PropagateOutputShapes] failed for [%s].", node_state.GetName().c_str()); return SUCCESS; } @@ -285,7 +291,7 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta auto &node_item = *node_state.GetNodeItem(); if (node_item.kernel_task == nullptr) { GE_CHK_STATUS_RET(TaskCompileEngine::Compile(node_state, ctx), - "Failed to create task for node[%s]", node_state.GetName().c_str()); + "[Invoke][Compile] Failed for node[%s]", node_state.GetName().c_str()); } else { node_state.SetKernelTask(node_item.kernel_task); } @@ -293,7 +299,9 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta GE_CHECK_NOTNULL(unique_task_context); const auto &task = node_state.GetKernelTask(); if (task == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetKernelTask] failed for[%s], NodeTask is null.", node_state.GetName().c_str()); + REPORT_CALL_ERROR("E19999", "invoke GetKernelTask failed for %s, NodeTask is null when %s.", + node_state.GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } auto shared_task_context = std::shared_ptr(unique_task_context.release()); @@ -309,7 +317,8 @@ Status SubgraphExecutor::LaunchTasks() { while (true) { NodeState *node_state = nullptr; if (!ready_queue_.Pop(node_state)) { - GELOGE(INTERNAL_ERROR, "[%s] Failed to pop node.", graph_item_->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][Pop] failed for [%s].", graph_item_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "invoke pop failed for %s when %s", graph_item_->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -334,8 +343,7 @@ Status SubgraphExecutor::LaunchTasks() { GE_CHECK_NOTNULL(shared_task_context); shared_task_context->SetForceInferShape(force_infer_shape_); HYBRID_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, shared_task_context, *context_), - "[%s] Execute node failed.", - node_state->GetName().c_str()); + "[Invoke][ExecuteAsync] failed for [%s].", node_state->GetName().c_str()); GELOGD("[%s] Done executing node successfully.", node_state->GetName().c_str()); } } @@ -361,8 +369,7 @@ Status SubgraphExecutor::ScheduleTasks(int group) { } GE_CHK_STATUS_RET(prepare_future.get(), - "[%s] Error occurred in task preparation.", - graph_item_->GetName().c_str()); + "[Invoke][get] [%s] Error occurred in task preparation.", graph_item_->GetName().c_str()); GELOGD("[%s] Done launching all tasks successfully.", graph_item_->GetName().c_str()); return SUCCESS; @@ -373,17 +380,17 @@ Status SubgraphExecutor::GetOutputs(vector &outputs) { } Status SubgraphExecutor::GetOutputs(vector &outputs, std::vector &output_desc) { - GE_CHK_STATUS_RET(GetOutputs(outputs), "[%s] Failed to get output tensors.", graph_item_->GetName().c_str()); + GE_CHK_STATUS_RET(GetOutputs(outputs), "[Invoke][GetOutputs] failed for [%s].", graph_item_->GetName().c_str()); // copy output data from op to designated position GE_CHK_STATUS_RET(graph_item_->GetOutputDescList(output_desc), - "[%s] Failed to get output tensor desc.", - graph_item_->GetName().c_str()); + "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str()); if (outputs.size() != output_desc.size()) { GELOGE(INTERNAL_ERROR, - "Number of output tensors(%zu) mismatch number of output tensor desc(%zu).", - outputs.size(), - output_desc.size()); + "[Check][Size]Number of output tensors(%zu) mismatch number of output tensor desc(%zu).", + outputs.size(), output_desc.size()); + REPORT_INNER_ERROR("E19999", "Number of output tensors(%zu) mismatch number of output tensor desc(%zu) when %s.", + outputs.size(), output_desc.size(), __FUNCTION__); return INTERNAL_ERROR; } return SUCCESS; @@ -401,17 +408,15 @@ Status SubgraphExecutor::SetOutputsToParentNode(TaskContext &task_context) { std::vector outputs; std::vector output_desc_list; GE_CHK_STATUS_RET(subgraph_context_->GetOutputs(outputs), - "[%s] Failed to get output tensors.", - graph_item_->GetName().c_str()); + "[Invoke][GetOutputs][%s] Failed to get output tensors.", graph_item_->GetName().c_str()); GE_CHK_STATUS_RET(graph_item_->GetOutputDescList(output_desc_list), - "[%s] Failed to get output tensor desc.", - graph_item_->GetName().c_str()); + "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str()); if (outputs.size() != output_desc_list.size()) { - GELOGE(INTERNAL_ERROR, "[%s] num output tensors = %zu, num output tensor desc = %zu", - graph_item_->GetName().c_str(), - outputs.size(), - output_desc_list.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] num output tensors = %zu, num output tensor desc = %zu", + graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size()); + REPORT_INNER_ERROR("E19999", "[%s] num output tensors = %zu, num output tensor desc = %zu when %s", + graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -460,9 +465,10 @@ Status SubgraphExecutor::EnableOutputZeroCopy(const vector &outputs const auto &output_edges = graph_item_->GetOutputEdges(); // Op -> MetOutput, set the output tensor of Op that output to the NetOutput node if (outputs.size() != output_edges.size()) { - GELOGE(PARAM_INVALID, "Output number mismatches, expect = %zu, but given = %zu", - output_edges.size(), - outputs.size()); + GELOGE(PARAM_INVALID, "[Check][Size]Output number mismatches, expect = %zu, but given = %zu", + output_edges.size(), outputs.size()); + REPORT_INNER_ERROR("E19999", "Output number mismatches, expect = %zu, but given = %zu when %s", + output_edges.size(), outputs.size(), __FUNCTION__); return PARAM_INVALID; } @@ -478,9 +484,7 @@ Status SubgraphExecutor::EnableOutputZeroCopy(const vector &outputs output_tensor.DebugString().c_str()); GE_CHK_STATUS_RET(subgraph_context_->SetOutput(*output_node, output_idx, output_tensor), - "[%s] Failed to set input tensor[%zu]", - graph_item_->GetName().c_str(), - i); + "[Invoke][SetOutput][%s] Failed to set input tensor[%zu]", graph_item_->GetName().c_str(), i); } GELOGD("Done enabling zero copy for outputs successfully."); diff --git a/ge/hybrid/model/graph_item.cc b/ge/hybrid/model/graph_item.cc index 91f675a4..8204b1c8 100644 --- a/ge/hybrid/model/graph_item.cc +++ b/ge/hybrid/model/graph_item.cc @@ -95,7 +95,8 @@ Status GraphItem::GroupNodes() { int group = node->group; if (group != last_group) { if (seen_groups.find(group) != seen_groups.end()) { - GELOGE(INTERNAL_ERROR, "Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group); + GELOGE(INTERNAL_ERROR, + "[Order][NodeGroup]Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group); return INTERNAL_ERROR; } else { last_group = group; diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 6e43007f..e18ba2ca 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -71,7 +71,7 @@ Status SetOutputNameAttr(ComputeGraph &graph) { } } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&graph, ATTR_MODEL_OUT_NODES_NAME, output_names), - GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed."); + GELOGE(FAILED, "[Invoke][SetListStr] of ATTR_MODEL_OUT_NODES_NAME failed."); return FAILED); return SUCCESS; } @@ -109,10 +109,11 @@ Status CollectDependenciesForFusedGraph(NodeItem &node_item, std::set GE_CHECK_NOTNULL(src_op_desc); if (src_node->GetType() != DATA_TYPE) { GELOGE(UNSUPPORTED, - "[%s::%s] Node in fused subgraph can only depend on Data nodes, but depend on %s", - node_item.NodeName().c_str(), - node->GetName().c_str(), - src_node->GetType().c_str()); + "[Check][NodeType][%s::%s] Node in fused subgraph can only depend on Data nodes, but depend on %s", + node_item.NodeName().c_str(), node->GetName().c_str(), src_node->GetType().c_str()); + REPORT_INNER_ERROR("E19999", + "[%s::%s] Node in fused subgraph can only depend on Data nodes, but depend on %s when %s.", + node_item.NodeName().c_str(), node->GetName().c_str(), src_node->GetType().c_str(), __FUNCTION__); return UNSUPPORTED; } @@ -129,37 +130,38 @@ HybridModelBuilder::HybridModelBuilder(HybridModel &hybrid_model) } Status HybridModelBuilder::Build() { - GE_CHK_STATUS_RET(ValidateParams(), "Failed to validate GeRootModel"); + GE_CHK_STATUS_RET(ValidateParams(), "[Invoke][ValidateParams] failed, model_name_:[%s]", GetGraphName()); hybrid_model_.model_name_ = ge_root_model_->GetRootGraph()->GetName(); GELOGI("[%s] Start to build hybrid model.", GetGraphName()); - GE_CHK_STATUS_RET(InitRuntimeParams(), "[%s] Failed to InitRuntimeParams", GetGraphName()); - GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), "[%s] Failed to RecoverGraphUnknownFlag", GetGraphName()); - GE_CHK_STATUS_RET(IndexSpecialNodes(), "[%s] Failed to index nodes", GetGraphName()); - GE_CHK_STATUS_RET(IndexTaskDefs(), "[%s] Failed to index task defs", GetGraphName()); - GE_CHK_STATUS_RET(InitWeights(), "[%s] Failed to init weights", GetGraphName()); - GE_CHK_STATUS_RET(LoadGraph(), "[%s] Failed to load graph", GetGraphName()); - GE_CHK_STATUS_RET(AssignUninitializedConstantOps(), "[%s] Failed to assign uninitialized constants", GetGraphName()); - GE_CHK_STATUS_RET(TransAllVarData(), "[%s] Failed to trans all var data", GetGraphName()); - GE_CHK_STATUS_RET(CopyVarData(), "[%s] Failed to copy var data", GetGraphName()); - GE_CHK_STATUS_RET(InitModelMem(), "[%s] Failed to init memory", GetGraphName()); - GE_CHK_STATUS_RET(InitConstantOps(), "[%s] Failed to init constant op", GetGraphName()); - GE_CHK_STATUS_RET(InitVariableTensors(), "[%s] Failed to init variables", GetGraphName()); - GE_CHK_STATUS_RET(LoadTasks(), "[%s] Failed to load tasks", GetGraphName()); + GE_CHK_STATUS_RET(InitRuntimeParams(), "[Invoke][InitRuntimeParams] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), "[Invoke][RecoverGraphUnknownFlag] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(IndexSpecialNodes(), "[Invoke][IndexSpecialNodes] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(IndexTaskDefs(), "[Invoke][IndexTaskDefs] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitWeights(), "[Invoke][InitWeights] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadGraph(), "[Invoke][LoadGraph] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(AssignUninitializedConstantOps(), + "[Invoke][AssignUninitializedConstantOps] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(TransAllVarData(), "[Invoke][TransAllVarData] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(CopyVarData(), "[Invoke][CopyVarData] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitModelMem(), "[Invoke][InitModelMem] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitConstantOps(), "[Invoke][InitConstantOps] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitVariableTensors(), "[Invoke][InitVariableTensors], model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadTasks(), "[Invoke][LoadTasks] failed, model_name_:[%s]", GetGraphName()); GELOGI("[%s] Done building hybrid model successfully.", GetGraphName()); return SUCCESS; } Status HybridModelBuilder::BuildForSingleOp() { - GE_CHK_STATUS_RET(ValidateParams(), "Failed to validate GeRootModel"); + GE_CHK_STATUS_RET(ValidateParams(), "[Invoke][ValidateParams] failed, model_name_:[%s]", GetGraphName()); hybrid_model_.model_name_ = ge_root_model_->GetRootGraph()->GetName(); GELOGI("[%s] Start to build hybrid model.", GetGraphName()); auto ret = ge_root_model_->GetSubgraphInstanceNameToModel(); const GeModelPtr ge_model = ret[ge_root_model_->GetRootGraph()->GetName()]; GE_CHK_STATUS_RET(IndexTaskDefs(ge_root_model_->GetRootGraph(), ge_model), - "[%s] Failed to index task defs", GetGraphName()); - GE_CHK_STATUS_RET(LoadGraph(), "[%s] Failed to load graph", GetGraphName()); - GE_CHK_STATUS_RET(InitWeights(), "[%s] Failed to init weights", GetGraphName()); - GE_CHK_STATUS_RET(LoadTasks(), "[%s] Failed to load tasks", GetGraphName()); + "[Invoke][IndexTaskDefs] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadGraph(), "[Invoke][LoadGraph] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(InitWeights(), "[Invoke][InitWeights] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(LoadTasks(), "[Invoke][LoadTasks] failed, model_name_:[%s]", GetGraphName()); GELOGI("[%s] Done building hybrid model for single op successfully.", GetGraphName()); return SUCCESS; } @@ -173,18 +175,20 @@ Status HybridModelBuilder::ValidateParams() { Status HybridModelBuilder::BuildNodeItem(const NodePtr &node, NodeItem &node_item) { auto op_desc = node->GetOpDesc(); GE_CHK_STATUS_RET(ParseForceInfershapeNodes(node, node_item), - "[%s] Failed to parse force_infershape node.", + "[Invoke][ParseForceInfershapeNodes]failed, node:[%s].", node_item.NodeName().c_str()); vector dependencies = node->GetOpDesc()->GetOpInferDepends(); GE_CHK_STATUS_RET(ParseDependentInputNodes(node_item, dependencies), - "[%s] Failed to parse node dependencies.", + "[Invoke][ParseDependentInputNodes]failed, node:[%s].", node_item.NodeName().c_str()); node_item.outputs.resize(node_item.num_outputs); for (int i = 0; i < node_item.num_outputs; ++i) { auto out_data_anchor = node->GetOutDataAnchor(i); if (out_data_anchor == nullptr) { - GELOGE(INTERNAL_ERROR, "out anchor[%d] of node %s is nullptr", i, node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][OutDataAnchor]out anchor[%d] of node %s is nullptr", i, node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "out anchor[%d] of node %s is nullptr when %s", + i, node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -197,12 +201,11 @@ Status HybridModelBuilder::BuildNodeItem(const NodePtr &node, NodeItem &node_ite NodeItem *dst_node_item = nullptr; GE_CHK_STATUS_RET(GetOrCreateNodeItem(dst_node, &dst_node_item), - "[%s] Failed to get or create node item.", + "[GetOrCreate][NodeItem] failed, dst_node:[%s].", dst_node->GetName().c_str()); int canonical_index; GE_CHK_STATUS_RET(dst_node_item->GetCanonicalInputIndex(dst_in_anchor->GetIdx(), canonical_index), - "[%s] Failed to canonical input index", - dst_node->GetName().c_str()); + "[Invoke][GetCanonicalInputIndex] failed, dst_node:[%s].", dst_node->GetName().c_str()); node_item.outputs[i].emplace_back(canonical_index, dst_node_item); } @@ -246,7 +249,7 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n } std::unique_ptr new_node; - GE_CHK_STATUS_RET(NodeItem::Create(node, new_node), "Failed to create node item"); + GE_CHK_STATUS_RET(NodeItem::Create(node, new_node), "[Invoke][Create] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET_NOLOG(NodeExecutorManager::GetInstance().GetExecutor(*node, &new_node->node_executor)); // we do not need L2 Buffer @@ -330,10 +333,8 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s for (const auto &input_name : dependencies) { int input_index = node_item.op_desc->GetInputIndexByName(input_name); if (input_index < 0) { - GELOGE(INTERNAL_ERROR, - "[%s] Failed to get input index by name: %s", - node_item.NodeName().c_str(), - input_name.c_str()); + GELOGE(INTERNAL_ERROR, "[Get][InputIndex]failed, node:[%s] inputname: %s.", + node_item.NodeName().c_str(), input_name.c_str()); return INTERNAL_ERROR; } @@ -380,10 +381,10 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item, s for (auto &op_desc : data_ops) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(*op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(INTERNAL_ERROR, - "[%s] Failed to get attr [%s]", - op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt] failed, node:[%s] attr:[%s]", + op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "invoke GetInt failed when %s, node:[%s] attr:[%s]", + __FUNCTION__, op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; } @@ -413,24 +414,29 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item, s Status HybridModelBuilder::UpdateAnchorStatus(const NodePtr &node) { if (NodeUtils::SetAllAnchorStatus(node) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] NodeUtils::SetAllAnchorStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetAllAnchorStatus] failed, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] NodeUtils::SetAllAnchorStatus failed when %s.", + node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } for (auto &anchor : node->GetAllInDataAnchors()) { auto peer_anchor = anchor->GetPeerOutAnchor(); if (peer_anchor == nullptr) { if (AnchorUtils::SetStatus(anchor, ANCHOR_SUSPEND) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] AnchorUtils::SetStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed, node:[%s] when %s.", node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } } else if (peer_anchor->GetOwnerNode()->GetType() == CONSTANT) { if (AnchorUtils::SetStatus(anchor, ANCHOR_CONST) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] AnchorUtils::SetStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed, node:[%s] when %s.", node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } } else { if (AnchorUtils::SetStatus(anchor, ANCHOR_DATA) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[%s] AnchorUtils::SetStatus failed.", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed, node:[%s] when %s.", node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } } @@ -441,11 +447,9 @@ Status HybridModelBuilder::UpdateAnchorStatus(const NodePtr &node) { Status HybridModelBuilder::DoUnlinkDataAnchors(const OutDataAnchorPtr &out_data_anchor, const InDataAnchorPtr &in_data_anchor) { - GE_CHK_GRAPH_STATUS_RET(out_data_anchor->Unlink(in_data_anchor), "Failed to unlink %s:%d from %s:%d", - out_data_anchor->GetOwnerNode()->GetName().c_str(), - out_data_anchor->GetIdx(), - in_data_anchor->GetOwnerNode()->GetName().c_str(), - in_data_anchor->GetIdx()); + GE_CHK_GRAPH_STATUS_RET(out_data_anchor->Unlink(in_data_anchor), + "[Invoke][Unlink] failed to unlink %s:%d from %s:%d", out_data_anchor->GetOwnerNode()->GetName().c_str(), + out_data_anchor->GetIdx(), in_data_anchor->GetOwnerNode()->GetName().c_str(), in_data_anchor->GetIdx()); GELOGD("Succeeded in unlinking %s:%d from %s:%d", out_data_anchor->GetOwnerNode()->GetName().c_str(), @@ -456,7 +460,7 @@ Status HybridModelBuilder::DoUnlinkDataAnchors(const OutDataAnchorPtr &out_data_ } Status HybridModelBuilder::DoLinkDataAnchors(OutDataAnchorPtr &out_data_anchor, InDataAnchorPtr &in_data_anchor) { - GE_CHK_GRAPH_STATUS_RET(out_data_anchor->LinkTo(in_data_anchor), "Failed to link %s:%d to %s:%d", + GE_CHK_GRAPH_STATUS_RET(out_data_anchor->LinkTo(in_data_anchor), "[Invoke][LinkTo]Failed to link %s:%d to %s:%d", out_data_anchor->GetOwnerNode()->GetName().c_str(), out_data_anchor->GetIdx(), in_data_anchor->GetOwnerNode()->GetName().c_str(), @@ -488,10 +492,10 @@ Status HybridModelBuilder::MergeInputNodes(ComputeGraph &graph) { uint32_t parent_index = 0; if (!AttrUtils::GetInt(data_op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(FAILED, - "[%s] Failed to get attr [%s]", - data_op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(FAILED, "[Invoke][GetInt] failed, node:[%s] attr:[%s]", + data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "GetInt failed when %s, node:[%s] attr:[%s]", + __FUNCTION__, data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; } @@ -557,7 +561,8 @@ Status HybridModelBuilder::MergeNetOutputNode(ComputeGraph &graph) { auto index = in_data_anchor->GetIdx(); auto input_desc = net_output_desc->MutableInputDesc(index); if (input_desc == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] Failed to get input desc[%d]", net_output_desc->GetName().c_str(), index); + GELOGE(INTERNAL_ERROR, "[Invoke][MutableInputDesc][%s] Failed to get input desc[%d]", + net_output_desc->GetName().c_str(), index); return INTERNAL_ERROR; } @@ -633,12 +638,13 @@ Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraphPtr &root_graph, ComputeG } } GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraph(root_graph, merged_graph, *subgraph), - "[%s] Failed to merge subgraph.", + "[Invoke][UnfoldSubgraph][%s] Failed to merge subgraph.", subgraph->GetName().c_str()); } // invoke before adding subgraphs. in case modify node id in known-shaped subgraphs. - GE_CHK_GRAPH_STATUS_RET(merged_graph->TopologicalSorting(), "Failed to invoke TopologicalSorting on merged graph."); + GE_CHK_GRAPH_STATUS_RET(merged_graph->TopologicalSorting(), + "[Invoke][TopologicalSorting]Failed to invoke TopologicalSorting on merged graph."); GE_DUMP(merged_graph, "hybrid_merged_graph_BeforeStageSort"); merged_graph->TopologicalSorting([](const NodePtr &a, const NodePtr &b) -> bool { uint32_t a_level = UINT32_MAX; @@ -651,7 +657,7 @@ Status HybridModelBuilder::UnfoldSubgraphs(ComputeGraphPtr &root_graph, ComputeG for (auto &remained_subgraph : root_graph->GetAllSubgraphs()) { GELOGD("Adding subgraph [%s] to merged-graph.", remained_subgraph->GetName().c_str()); GE_CHK_GRAPH_STATUS_RET(merged_graph->AddSubgraph(remained_subgraph), - "Failed to add subgraph [%s]", + "[Invoke][AddSubgraph]Failed to add subgraph [%s]", remained_subgraph->GetName().c_str()); remained_subgraph->SetParentGraph(merged_graph); } @@ -666,10 +672,10 @@ Status HybridModelBuilder::UnfoldSubgraph(ComputeGraphPtr &root_graph, GE_CHECK_NOTNULL(parent_node); GE_CHK_STATUS_RET(MergeInputNodes(sub_graph), - "[%s] Failed to merge data nodes for subgraph", + "[Invoke][MergeInputNodes][%s] Failed to merge data nodes for subgraph", sub_graph.GetName().c_str()); GE_CHK_STATUS_RET(MergeNetOutputNode(sub_graph), - "[%s] Failed to merge net output nodes for subgraph", + "[Invoke][MergeNetOutputNode][%s] Failed to merge net output nodes for subgraph", sub_graph.GetName().c_str()); GELOGD("[%s] Done merging subgraph inputs and outputs successfully", sub_graph.GetName().c_str()); @@ -683,7 +689,7 @@ Status HybridModelBuilder::UnfoldSubgraph(ComputeGraphPtr &root_graph, GE_CHECK_NOTNULL(sub_sub_graph); if (sub_sub_graph->GetGraphUnknownFlag()) { GE_CHK_STATUS_RET(UnfoldSubgraph(root_graph, parent_graph, *sub_sub_graph), - "[%s] Failed to merge subgraph", + "[Invoke][UnfoldSubgraph][%s] Failed to merge subgraph", sub_sub_graph->GetName().c_str()); continue; } @@ -757,7 +763,8 @@ Status HybridModelBuilder::LoadGraph() { GELOGI("Before merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", root_graph->GetDirectNodesSize(), root_graph->GetAllNodesSize()); - GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraphs(root_graph, merged_graph), "Failed to unfold subgraphs."); + GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraphs(root_graph, merged_graph), + "[Invoke][UnfoldSubgraphs]Failed to unfold subgraphs, model_name_:%s.", GetGraphName()); root_graph = std::move(merged_graph); GELOGI("After merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", root_graph->GetDirectNodesSize(), @@ -779,9 +786,11 @@ Status HybridModelBuilder::LoadGraph() { op_desc->SetId(index++); } GE_DUMP(root_graph, "hybrid_merged_graph"); - GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), "Failed to load root graph."); + GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), + "[Invoke][LoadDynamicSubgraph]Failed to load root graph, model_name_:%s.", GetGraphName()); GELOGD("Done loading root graph successfully."); - GE_CHK_STATUS_RET(hybrid_model_.root_graph_item_->GroupNodes(), "Failed to group nodes for root graph"); + GE_CHK_STATUS_RET(hybrid_model_.root_graph_item_->GroupNodes(), + "[Invoke][GroupNodes]Failed to group nodes for root graph, model_name_:%s.", GetGraphName()); for (auto &sub_graph : root_graph->GetAllSubgraphs()) { GE_CHECK_NOTNULL(sub_graph); @@ -797,26 +806,28 @@ Status HybridModelBuilder::LoadGraph() { if (sub_graph->GetGraphUnknownFlag()) { GE_CHK_STATUS_RET(LoadDynamicSubgraph(*sub_graph, false), - "Failed to load subgraph: [%s]", + "[Invoke][LoadDynamicSubgraph]Failed to load subgraph: [%s]", sub_graph->GetName().c_str()); } else { GE_CHK_STATUS_RET(IdentifyVariableOutputs(*parent_node_item), - "[%s] Failed to identify ref outputs.", + "[Invoke][IdentifyVariableOutputs][%s] Failed to identify ref outputs.", parent_node_item->NodeName().c_str()); GE_CHK_STATUS_RET(IdentifySameInputs(*parent_node_item), - "[%s] Failed to identify same outputs.", + "[Invoke][IdentifySameInputs][%s] Failed to identify same outputs.", parent_node_item->NodeName().c_str()); // if parent is function control op. need add a virtual partitioned call if (parent_node_item->IsControlOp()) { GE_CHK_STATUS_RET(LoadKnownShapedSubgraph(*sub_graph, parent_node_item), - "Failed to load function control op subgraph [%s]", + "[Invoke][LoadKnownShapedSubgraph]Failed to load function control op subgraph [%s]", sub_graph->GetName().c_str()); } } } - GE_CHK_STATUS_RET(ParseDependentByParallelGroup(), "Failed to establish dependencies for hccl ops"); + GE_CHK_STATUS_RET(ParseDependentByParallelGroup(), + "[Invoke][ParseDependentByParallelGroup]Failed to establish dependencies for hccl ops, model_name_:%s.", + GetGraphName()); GELOGI("Done loading all subgraphs successfully."); return SUCCESS; } @@ -834,7 +845,7 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ auto tensor_desc = var_node->GetOpDesc()->MutableOutputDesc(0); uint8_t *var_logic = nullptr; GE_CHK_STATUS_RET(var_manager_->GetVarAddr(var_name, *tensor_desc, &var_logic), - "Failed to get var addr. var_name = %s, session_id = %ld", + "[Invoke][GetVarAddr]Failed to get var addr. var_name = %s, session_id = %ld", var_name.c_str(), hybrid_model_.GetSessionId()); @@ -846,9 +857,11 @@ Status HybridModelBuilder::VarNodeToTensor(const NodePtr &var_node, std::unique_ uint8_t *dev_mem = var_manager_->GetVarMemoryAddr(var_logic, memory_type); if (dev_mem == nullptr) { GELOGE(INTERNAL_ERROR, - "Failed to copy var %s from device, cant not get " - "var addr from logic addr %p", - var_node->GetName().c_str(), var_logic); + "[Invoke][GetVarMemoryAddr]Failed to copy var %s from device, cant not get var addr from logic addr %p", + var_node->GetName().c_str(), var_logic); + REPORT_CALL_ERROR("E19999", + "GetVarMemoryAddr failed when %s, Failed to copy var %s from device, cant not get var addr from logic addr %p", + __FUNCTION__, var_node->GetName().c_str(), var_logic); return INTERNAL_ERROR; } @@ -876,7 +889,7 @@ Status HybridModelBuilder::HandleDtString(const GeTensor &tensor, void *var_addr auto &mutable_tensor = const_cast(tensor); uint64_t *buff = reinterpret_cast(mutable_tensor.MutableData().data()); GE_CHK_BOOL_RET_STATUS(ge::CheckInt64Uint32MulOverflow(elem_num, kBytes * kStringHeadElems) == SUCCESS, FAILED, - "Shape size is invalid"); + "[Invoke][CheckInt64Uint32MulOverflow] failed because Shape size is invalid."); auto offset = static_cast(elem_num * kBytes * kStringHeadElems); auto hbm_raw_data_base_addr = static_cast(reinterpret_cast(var_addr) + offset); @@ -928,7 +941,7 @@ Status HybridModelBuilder::InitConstantOps() { auto op_desc = var_node->GetOpDesc(); auto v_weights = ModelUtils::GetWeights(op_desc); if (v_weights.empty()) { - GELOGE(INTERNAL_ERROR, "[%s] Constant no not have value", var_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Constant no not have value", var_node->GetName().c_str()); return INTERNAL_ERROR; } auto *ge_tensor = const_cast(v_weights[0].get()); @@ -942,7 +955,7 @@ Status HybridModelBuilder::InitConstantOps() { GELOGD("Init tensor with host constant %s size = %zu", var_name.c_str(), aligned_tensor.MutableData().GetSize()); if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(aligned_tensor.GetAlignedPtr(), aligned_tensor.GetData().size()) == nullptr) { - GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed."); + GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed, model_name_:%s.", GetGraphName()); return MEMALLOC_FAILED; } var_tensor.reset(new(std::nothrow)TensorValue(aligned_tensor.MutableData().data(), @@ -993,17 +1006,18 @@ Status HybridModelBuilder::InitVariableTensors() { int64_t tensor_size = 0; if (TensorUtils::CalcTensorMemSize(output_tensor.GetShape(), output_tensor.GetFormat(), output_tensor.GetDataType(), tensor_size) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Calculate variable size failed, node name:%s", it.first.c_str()); + GELOGE(INTERNAL_ERROR, "[Calculate][TensorMemSize] failed, node name:%s", it.first.c_str()); return INTERNAL_ERROR; } SharedMemInfo mem_info(it.first, tensor_size); if (HostMemManager::Instance().MallocSharedMemory(mem_info) != SUCCESS) { - GELOGE(GE_GRAPH_MALLOC_FAILED, "Host variable [%s] malloc failed.", it.first.c_str()); + GELOGE(GE_GRAPH_MALLOC_FAILED, "[Malloc][SharedMemory] failed, Host variable [%s].", it.first.c_str()); return GE_GRAPH_MALLOC_FAILED; } if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(mem_info.host_aligned_ptr, tensor_size) == nullptr) { - GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed."); + GELOGE(MEMALLOC_FAILED, + "[Malloc][HostMem] for an existed GeTensor failed, Host variable [%s].", it.first.c_str()); return MEMALLOC_FAILED; } GELOGD("Host variable [%s] malloc success, size=%ld.", it.first.c_str(), tensor_size); @@ -1054,7 +1068,9 @@ Status HybridModelBuilder::InitWeights() { auto op_desc = node->GetOpDesc(); auto v_weights = ModelUtils::GetWeights(op_desc); if (v_weights.empty()) { - GELOGE(INTERNAL_ERROR, "[%s] Constant has no value", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetWeights][%s] Constant has no value", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Constant has no value when %s.", + node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } auto *ge_tensor = const_cast(v_weights[0].get()); @@ -1062,11 +1078,11 @@ Status HybridModelBuilder::InitWeights() { const GeTensorDesc &tensor_desc = ge_tensor->GetTensorDesc(); int64_t tensor_size = 0; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*op_desc->MutableOutputDesc(0), tensor_size), - "[%s] Failed to get tensor size", + "[Invoke][GetSize][%s] Failed to get tensor size", node->GetName().c_str()); int64_t data_offset = 0; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetDataOffset(tensor_desc, data_offset), - "[%s] Failed to get data offset", + "[Invoke][GetDataOffset][%s] Failed to get data offset", node->GetName().c_str()); GELOGD("[%s] Start to init Constant node [%s], size = %ld, offset = %ld", GetGraphName(), @@ -1093,7 +1109,8 @@ Status HybridModelBuilder::LoadTask(NodeItem &node_item) { node_ptr, node_item.kernel_task); if (load_ret != UNSUPPORTED && load_ret != SUCCESS) { - GELOGE(load_ret, "[%s] Failed to load task", node_ptr->GetName().c_str()); + GELOGE(load_ret, "[Invoke][LoadTask][%s] Failed to load task", node_ptr->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to load task when %s", node_ptr->GetName().c_str(), __FUNCTION__); return load_ret; } @@ -1102,7 +1119,7 @@ Status HybridModelBuilder::LoadTask(NodeItem &node_item) { } Status HybridModelBuilder::LoadTasks() { - GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed."); + GE_CHK_STATUS_RET(CheckAicpuOpList(), "[Check][AicpuOpList] failed."); std::map> ordered_partitioned_calls; for (auto &it : hybrid_model_.node_items_) { auto &node_item = it.second; @@ -1179,7 +1196,8 @@ Status HybridModelBuilder::IndexTaskDefs(const ComputeGraphPtr &sub_graph, const auto iter = node_map.find(op_index); if (iter == node_map.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get node by op_index = %u", op_index); + GELOGE(INTERNAL_ERROR, "[Find][Node]Failed to get node by op_index = %u", op_index); + REPORT_INNER_ERROR("E19999", "Failed to get node by op_index = %u when %s.", op_index, __FUNCTION__); return INTERNAL_ERROR; } @@ -1249,7 +1267,8 @@ Status HybridModelBuilder::IndexTaskDefs() { auto iter = node_map.find(op_index); if (iter == node_map.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get node by index = %u", op_index); + GELOGE(INTERNAL_ERROR, "[Find][Node]Failed to get node by index = %u.", op_index); + REPORT_INNER_ERROR("E19999", "Failed to get node by index = %u when %s.", op_index, __FUNCTION__); return INTERNAL_ERROR; } @@ -1314,14 +1333,14 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, GELOGD("To get peer node of %s::%s", sub_graph->GetName().c_str(), data_node->GetName().c_str()); auto wrapped_node = data_node->GetOwnerComputeGraph()->GetParentNode(); if (wrapped_node == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] Node is in root graph.", data_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetParentNode][%s] Node is in root graph.", data_node->GetName().c_str()); return INTERNAL_ERROR; } auto data_op_desc = data_node->GetOpDesc(); uint32_t parent_index = 0; if (!AttrUtils::GetInt(data_op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { GELOGE(INTERNAL_ERROR, - "[%s] Failed to get attr [%s]", + "[Invoke][GetInt][%s] Failed to get attr [%s]", data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; @@ -1331,7 +1350,8 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, GE_CHECK_NOTNULL(wrapped_node_in_anchor); auto src_out_anchor = wrapped_node_in_anchor->GetPeerOutAnchor(); if (src_out_anchor == nullptr || src_out_anchor->GetOwnerNode() == nullptr) { - GELOGE(INTERNAL_ERROR, "[%s] Parent node do not have peer anchor.", data_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, + "[Check][ParentNode][%s] Parent node do not have peer anchor.", data_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -1356,7 +1376,7 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, auto src_net_output_node = src_graph->FindFirstNodeMatchType(NETOUTPUT); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(src_net_output_node == nullptr, return INTERNAL_ERROR, - "Failed to find NetOutput in subgraph: %s", + "[Invoke][FindFirstNodeMatchType]Failed to find NetOutput in subgraph: %s", src_graph->GetName().c_str()); auto net_output_desc = src_net_output_node->GetOpDesc(); GE_CHECK_NOTNULL(net_output_desc); @@ -1393,17 +1413,18 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, } } - GELOGE(FAILED, - "Failed to find peer node for %s::%s", - sub_graph->GetName().c_str(), - data_node->GetName().c_str()); + GELOGE(FAILED, "[Get][PeerNode]Failed to find peer node for %s::%s", + sub_graph->GetName().c_str(), data_node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Failed to find peer node for %s::%s when %s.", + sub_graph->GetName().c_str(), data_node->GetName().c_str(), __FUNCTION__); return FAILED; } Status HybridModelBuilder::InitRuntimeParams() { int64_t value = 0; bool ret = false; if (ge_root_model_->GetSubgraphInstanceNameToModel().empty()) { - GELOGE(INTERNAL_ERROR, "Root model has no sub model"); + GELOGE(INTERNAL_ERROR, "[Get][SubModel]Root model has no sub model, model:%s.", GetGraphName()); + REPORT_INNER_ERROR("E19999", "Root model has no sub model when %s, model:%s.", __FUNCTION__, GetGraphName()); return INTERNAL_ERROR; } @@ -1546,8 +1567,10 @@ Status HybridModelBuilder::GetParentNodeOutputIndex(const OpDesc &op_desc, int i auto input_desc = op_desc.MutableInputDesc(index); GE_CHECK_NOTNULL(input_desc); if (!AttrUtils::GetInt(input_desc, ATTR_NAME_PARENT_NODE_INDEX, out_index)) { - GELOGE(INTERNAL_ERROR, "NetOutput input tensor %d, attr %s not found.", - index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt]NetOutput input tensor %d, attr %s not found.", + index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "NetOutput input tensor %d, attr %s not found when %s.", + index, ATTR_NAME_PARENT_NODE_INDEX.c_str(), __FUNCTION__); return INTERNAL_ERROR; } return SUCCESS; @@ -1563,7 +1586,7 @@ Status HybridModelBuilder::InitModelMem() { if (total_var_size > 0 && hybrid_model_.var_mem_base_ == nullptr) { GE_CHK_STATUS_RET(var_manager_->MallocVarMemory(total_var_size), - "Malloc Var Memory Fail."); + "[Malloc][VarMemory] failed, size:%zu.", total_var_size); hybrid_model_.var_mem_base_ = var_manager_->GetVarMemoryBase(RT_MEMORY_HBM); } @@ -1580,7 +1603,8 @@ Status HybridModelBuilder::TransAllVarData() { rtContext_t ctx = nullptr; rtError_t rt_ret = rtCtxGetCurrent(&ctx); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to get current context, error_code is: 0x%X.", rt_ret); + GELOGE(RT_FAILED, "[Invoke][rtCtxGetCurrent]Failed to get current context, error_code is: 0x%X.", rt_ret); + REPORT_CALL_ERROR("E19999", "rtCtxGetCurrent failed when %s, error_code: 0x%X.", __FUNCTION__, rt_ret); return RT_FAILED; } @@ -1594,7 +1618,7 @@ Status HybridModelBuilder::TransAllVarData() { runtime_param_.session_id, ctx, runtime_param_.graph_id), - "TransAllVarData failed."); + "[Invoke][TransAllVarData] failed."); GELOGI("TransAllVarData success."); return SUCCESS; @@ -1604,7 +1628,7 @@ Status HybridModelBuilder::CopyVarData() { GE_CHK_STATUS_RET(TransVarDataUtils::CopyVarData(ge_root_model_->GetRootGraph(), runtime_param_.session_id, hybrid_model_.device_id_), - "CopyVarData failed."); + "[Invoke][CopyVarData] failed."); GELOGI("CopyVarData success."); return SUCCESS; } @@ -1628,7 +1652,7 @@ Status HybridModelBuilder::LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem int32_t data_index = 0; if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, data_index)) { GELOGE(FAILED, - "[%s] Failed to get attr [%s]", + "[Invoke][GetInt][%s] Failed to get attr [%s]", node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; @@ -1645,7 +1669,7 @@ Status HybridModelBuilder::LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem } GE_CHK_GRAPH_STATUS_RET(wrapper_op_desc->AddOutputDesc(*output_desc), - "[%s] Failed to add output desc. output index = %d", + "[Invoke][AddOutputDesc][%s] Failed to add output desc. output index = %d", graph.GetName().c_str(), output_index); @@ -2003,7 +2027,7 @@ Status HybridModelBuilder::BuildInputMapping(GraphItem &graph_item, } else { if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, data_index)) { GELOGE(FAILED, - "[%s] Failed to get attr [%s]", + "[Invoke][GetInt][%s] Failed to get attr [%s]", node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; @@ -2040,7 +2064,7 @@ Status HybridModelBuilder::CheckAicpuOpList() { aicpu_optype_list.assign(aicpu_optype_set.begin(), aicpu_optype_set.end()); aicpu_tf_optype_list.assign(aicpu_tf_optype_set.begin(), aicpu_tf_optype_set.end()); GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchKernelCheckAicpuOp(aicpu_optype_list, aicpu_tf_optype_list), - "Launch check aicpu op type failed."); + "[Launch][KernelCheckAicpuOp] failed."); return SUCCESS; } From 9a7cb76cb96b6c3eb2d3a4632039c3ee87dc86b0 Mon Sep 17 00:00:00 2001 From: liudingyan Date: Wed, 31 Mar 2021 10:43:49 +0800 Subject: [PATCH 5/5] modify geloge and add err report --- ge/hybrid/common/npu_memory_allocator.cc | 20 ++-- .../executor/hybrid_execution_context.cc | 5 +- .../executor/hybrid_model_async_executor.cc | 58 +++++------ .../hybrid_model_pipeline_executor.cc | 22 ++--- ge/hybrid/executor/hybrid_profiler.cc | 4 +- ge/hybrid/executor/node_state.cc | 14 +-- ge/hybrid/executor/rt_callback_manager.cc | 2 +- ge/hybrid/executor/subgraph_context.cc | 17 ++-- ge/hybrid/executor/subgraph_executor.cc | 25 +++-- ge/hybrid/executor/worker/execution_engine.cc | 4 +- .../executor/worker/shape_inference_engine.cc | 45 ++++----- ge/hybrid/model/graph_item.cc | 4 +- ge/hybrid/model/hybrid_model_builder.cc | 95 +++++++++++-------- 13 files changed, 172 insertions(+), 143 deletions(-) diff --git a/ge/hybrid/common/npu_memory_allocator.cc b/ge/hybrid/common/npu_memory_allocator.cc index 08f91f02..728a289b 100644 --- a/ge/hybrid/common/npu_memory_allocator.cc +++ b/ge/hybrid/common/npu_memory_allocator.cc @@ -38,9 +38,11 @@ AllocationAttr::AllocationAttr(void *try_reuse_addr) : AllocationAttr(0, try_reu NpuMemoryAllocator *NpuMemoryAllocator::GetAllocator() { int32_t device_id = 0; - if (rtGetDevice(&device_id) != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "[Get][Device] Failed."); - REPORT_INNER_ERROR("E19999", "rtGetDevice failed when %s.", __FUNCTION__); + auto rt_result = rtGetDevice(&device_id); + if (rt_result != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "[Get][Device] Failed, result:%d.", rt_result); + REPORT_INNER_ERROR("E19999", "rtGetDevice failed when NpuMemoryAllocator %s, result:%d.", + __FUNCTION__, rt_result); return nullptr; } @@ -58,9 +60,9 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { } if (allocate_size == 0) { - GELOGE(MEMALLOC_FAILED, "[Check][Param:size_t]Memory size is 0, device_id = %u, size = %zu.", + GELOGE(MEMALLOC_FAILED, "[Check][Param:size_t]Memory size is 0, device_id = %u, size = %zu.", device_id_, allocate_size); - REPORT_INNER_ERROR("E19999", "Memory size is 0, device_id = %u, size = %zu when %s.", + REPORT_INNER_ERROR("E19999", "Memory size is 0, device_id = %u, size = %zu when %s.", device_id_, allocate_size, __FUNCTION__); return nullptr; } @@ -72,9 +74,9 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { buffer = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(allocate_size); } else { if (allocate_size > kMaxHbmMemorySize) { - GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu bigger than limit:%lu, check invalid.", + GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu bigger than limit:%lu, check invalid.", allocate_size, kMaxHbmMemorySize); - REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu bigger than limit:%lu, check invalid when %s.", + REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu bigger than limit:%lu, check invalid when %s.", allocate_size, kMaxHbmMemorySize, __FUNCTION__); return nullptr; } @@ -94,9 +96,9 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) { .Malloc(allocate_size, reinterpret_cast(try_reuse_addr), device_id_); } if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu.", + GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu.", device_id_, allocate_size); - REPORT_CALL_ERROR("E19999", "malloc memory failed, device_id = %u, size = %zu when %s.", + REPORT_CALL_ERROR("E19999", "malloc memory failed, device_id = %u, size = %zu when %s.", device_id_, allocate_size, __FUNCTION__); return nullptr; } diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc index dd964448..7cceadf3 100644 --- a/ge/hybrid/executor/hybrid_execution_context.cc +++ b/ge/hybrid/executor/hybrid_execution_context.cc @@ -59,9 +59,8 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) { return SUCCESS; } - GELOGE(RT_FAILED, - "[Invoke][rtStreamSynchronize] failed, ret = %d", rt_ret); - REPORT_CALL_ERROR("E19999", + GELOGE(RT_FAILED, "[Invoke][rtStreamSynchronize] failed, ret = %d", rt_ret); + REPORT_CALL_ERROR("E19999", "invoke rtStreamSynchronize failed when GraphExecutionContext %s, ret = %d", __FUNCTION__, rt_ret); return RT_FAILED; } diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index 2720d72d..a97336d9 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -51,10 +51,10 @@ void HybridModelAsyncExecutor::SetModelName(const string &model_name) { } Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr &data) { - if(data_inputer_->Push(data) != SUCCESS){ + if (data_inputer_->Push(data) != SUCCESS) { REPORT_CALL_ERROR("E19999", "Data queue is full, please call again later when %s, model_id %u.", __FUNCTION__, model_id_); - GELOGE(domi::DATA_QUEUE_ISFULL, + GELOGE(domi::DATA_QUEUE_ISFULL, "[Push][Data] Data queue is full, please call again later, model_id %u ", model_id_); return domi::DATA_QUEUE_ISFULL; } @@ -65,9 +65,9 @@ Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr Status HybridModelAsyncExecutor::Start(const std::shared_ptr &listener) { GELOGD("HybridModelExecutor::Start IN, has listener = %d", listener != nullptr); std::lock_guard lk(mu_); - if(run_flag_){ - REPORT_INNER_ERROR("E19999", "Model already started when HybridModelAsyncExecutor %s, model_id:%u.", - __FUNCTION__, model_id_); + if (run_flag_) { + REPORT_INNER_ERROR("E19999", + "Model already started when HybridModelAsyncExecutor %s, model_id:%u.", __FUNCTION__, model_id_); GELOGE(INTERNAL_ERROR, "[Check][RunState] Model already started, model_id:%u.", model_id_); return INTERNAL_ERROR; } @@ -80,8 +80,8 @@ Status HybridModelAsyncExecutor::Start(const std::shared_ptr &lis return RunInternal(); }); - GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, - "[Check][RunState] Failed to start, model_id:%u.", model_id_); + GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, + "[Check][RunState] Failed to start, model_id:%u.", model_id_); GELOGD("HybridModelExecutor::Start successfully"); return SUCCESS; } @@ -115,8 +115,8 @@ Status HybridModelAsyncExecutor::Init() { executor_ = std::unique_ptr(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_)); GE_CHECK_NOTNULL(executor_); - GE_CHK_STATUS_RET(executor_->Init(), - "[Init][HybridModelExecutor] failed, model_id:%u.", model_id_); + GE_CHK_STATUS_RET(executor_->Init(), + "[Init][HybridModelExecutor] failed, model_id:%u.", model_id_); GE_CHK_STATUS_RET(DumpOpDebug(), "[Dump][OpDebug] failed, model_id:%u.", model_id_); GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups()); @@ -124,8 +124,8 @@ Status HybridModelAsyncExecutor::Init() { pipe_executor_ = std::unique_ptr(new(std::nothrow) HybridModelPipelineExecutor(model_, device_id_)); GE_CHECK_NOTNULL(pipe_executor_); - GE_CHK_STATUS_RET(pipe_executor_->Init(), - "[Init][HybridModelPipelineExecutor] failed, model_id:%u.", model_id_); + GE_CHK_STATUS_RET(pipe_executor_->Init(), + "[Init][HybridModelPipelineExecutor] failed, model_id:%u.", model_id_); } GE_CHK_STATUS_RET(InitInputDesc(), "[Init][InputDesc] failed, model_id:%u.", model_id_); @@ -136,8 +136,8 @@ Status HybridModelAsyncExecutor::Init() { Status HybridModelAsyncExecutor::PreRun(InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { GE_CHK_STATUS_RET(SyncVarData(), "[Invoke][SyncVarData] failed, model_id:%u.", model_id_); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End"); - GE_CHK_STATUS_RET(PrepareInputs(current_data, args), - "[Invoke][PrepareInputs] failed to copy input data to model, model_id:%u.", model_id_); + GE_CHK_STATUS_RET(PrepareInputs(current_data, args), + "[Invoke][PrepareInputs] failed to copy input data to model, model_id:%u.", model_id_); RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End"); return SUCCESS; } @@ -213,7 +213,7 @@ Status HybridModelAsyncExecutor::HandleResult(Status exec_ret, if (exec_ret != SUCCESS) { GELOGE(exec_ret, "[Check][Param:Status] failed to execute graph. model_id = %u", model_id_); - REPORT_INNER_ERROR("E19999", + REPORT_INNER_ERROR("E19999", "failed to execute graph when HybridModelAsyncExecutor %s. model_id = %u", __FUNCTION__, model_id_); return OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list); } @@ -250,10 +250,10 @@ Status HybridModelAsyncExecutor::SyncVarData() { Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, HybridModelExecutor::ExecuteArgs &args) { if (current_data.blobs.size() < input_tensor_desc_.size()) { - GELOGE(PARAM_INVALID, + GELOGE(PARAM_INVALID, "[Check][Size]Blob size mismatches, expect at least %zu, but got %zu, model_id = %u", input_tensor_desc_.size(), current_data.blobs.size(), model_id_); - REPORT_INNER_ERROR("E19999", + REPORT_INNER_ERROR("E19999", "Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s, model_id = %u.", input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__, model_id_); return PARAM_INVALID; @@ -267,12 +267,12 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy auto tensor_size = input_sizes_[input_index]; if (is_input_dynamic_[input_index]) { if (input_index >= current_data.shapes.size()) { - GELOGE(PARAM_INVALID, + GELOGE(PARAM_INVALID, "[Check][Range]Shape index out of range, index = %zu, shape size = %zu model_id = %u.", input_index, current_data.shapes.size(), model_id_); - REPORT_INNER_ERROR("E19999", + REPORT_INNER_ERROR("E19999", "Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s, model_id = %u.", - input_index, current_data.shapes.size(), __FUNCTION__, model_id_); + input_index, current_data.shapes.size(), __FUNCTION__, model_id_); return PARAM_INVALID; } auto &tensor_desc = input_tensor_desc_[input_index]; @@ -287,12 +287,12 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy } // range[k].second can be -1 if (shape.GetDim(k) < range[k].first || (range[k].second >= 0 && shape.GetDim(k) > range[k].second)) { - GELOGE(PARAM_INVALID, + GELOGE(PARAM_INVALID, "[Check][Range]Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld], model_id = %u.", input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); - REPORT_INNER_ERROR("E19999", + REPORT_INNER_ERROR("E19999", "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld], model_id = %u.", - input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); + input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_); return PARAM_INVALID; } } @@ -318,14 +318,14 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData ¤t_data, Hy const DataBuffer &data_buf = blobs[input_index]; auto mem_size = static_cast(tensor_size); - if(mem_size < data_buf.length){ - REPORT_INNER_ERROR("E19999", + if (mem_size < data_buf.length) { + REPORT_INNER_ERROR("E19999", "input data size(%lu) does not match model required size(%lu) when %s, ret failed, model_id = %u.", data_buf.length, mem_size, __FUNCTION__, model_id_); GELOGE(PARAM_INVALID, "[Check][Size]input data size(%lu) does not match model required size(%lu), ret failed, model_id = %u.", data_buf.length, mem_size, model_id_); - return PARAM_INVALID; + return PARAM_INVALID; } if (data_buf.length > 0) { GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%zu] memaddr[%p] mem_size[%zu] datasize[%lu]", @@ -433,7 +433,7 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a GELOGE(INTERNAL_ERROR, "[Check][Size]output[%zu] tensor size(%zu) is not enough for output shape [%s], model_id = %u.", i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_); - REPORT_INNER_ERROR("E19999", + REPORT_INNER_ERROR("E19999", "output[%zu] tensor size(%zu) is not enough for output shape [%s] model_id = %u," " when HybridModelAsyncExecutor %s.", i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_, __FUNCTION__); @@ -513,14 +513,14 @@ Status HybridModelAsyncExecutor::Execute(const vector &inputs, vector< } HybridModelExecutor::ExecuteArgs args; - GE_CHK_STATUS_RET(PrepareInputs(input_data, args), + GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "[Invoke][PrepareInputs]Failed to copy input data to model, model_id = %u", model_id_); GELOGD("Done copying input data successfully."); GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_); std::vector output_tensor_info_list; OutputData output_data; - GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), + GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), "[Invoke][CopyOutputs]Failed to copy outputs, model_id = %u.", model_id_); GELOGD("Done copying output data successfully. output count = %zu", output_tensor_info_list.size()); @@ -572,7 +572,7 @@ Status HybridModelAsyncExecutor::DumpOpDebug() { loop_cond = const_cast(varible_loop_cond->GetData()); } data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond); - GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), + GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "[Invoke][LoadDumpInfo] failed in hybrid engine, model_id = %u.", model_id_); GELOGD("Dump op debug SUCCESS in hybrid engine"); } diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc index 098c194c..0633eeb5 100644 --- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc +++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc @@ -59,9 +59,9 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v task_queue_.Pop(task_info); GELOGD("[Executor: %d] Got task, stage = %d, iteration = %ld", id_, task_info.stage, task_info.iteration); if (task_info.iteration >= pipe_config_->iteration_end) { - GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %d.", + GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %ld.", id_, task_info.iteration); - REPORT_INNER_ERROR("E19999", "[Executor: %d] Unexpected iteration: %d when StageExecutor %s.", + REPORT_INNER_ERROR("E19999", "[Executor: %d] Unexpected iteration: %ld when StageExecutor %s.", id_, task_info.iteration, __FUNCTION__); return INTERNAL_ERROR; } @@ -69,16 +69,16 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v if (task_info.event != nullptr) { GELOGD("[%d] Add StreamWaitEvent", id_); GE_CHK_RT_RET(rtStreamWaitEvent(stream_, task_info.event)); - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] End", task_info.iteration - 1, + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] End", task_info.iteration - 1, task_info.stage); } - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] Start", task_info.iteration, + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %lld] [Stage = %d] Start", task_info.iteration, task_info.stage); if (task_info.stage == 0) { GELOGD("[Executor: %d] To ResetExecutionContext", id_); - GE_CHK_STATUS_RET(ResetExecutionContext(context_), + GE_CHK_STATUS_RET(ResetExecutionContext(context_), "[Invoke][ResetExecutionContext][Executor: %d] Failed to reset context", id_); context_.iteration = task_info.iteration; GE_CHK_STATUS_RET_NOLOG(SetInputs(inputs, input_desc)); @@ -96,10 +96,10 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v auto sync_result = Synchronize(); if (sync_result != SUCCESS) { - GELOGE(sync_result, - "[Invoke][Synchronize][Executor: %d] Failed to sync result:%d. iteration = %d", + GELOGE(sync_result, + "[Invoke][Synchronize][Executor: %d] Failed to sync result:%d. iteration = %ld", id_, sync_result, task_info.iteration); - REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result:%d when StageExecutor %s. iteration = %d", + REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result:%d when StageExecutor %s. iteration = %ld", id_, sync_result, __FUNCTION__, task_info.iteration); context_.profiler->Dump(std::cout); context_.callback_manager->Destroy(); @@ -107,11 +107,11 @@ Status StageExecutor::Start(const std::vector &inputs, const std::v return sync_result; } - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] End", task_info.iteration, task_info.stage); + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] End", task_info.iteration, task_info.stage); // if not end stage if (task_info.stage >= pipe_config_->num_stages - 1) { - RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] Schedule End", task_info.iteration); + RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] Schedule End", task_info.iteration); GELOGD("[Executor: %d] End of iteration [%ld]", id_, task_info.iteration); context_.callback_manager->Destroy(); RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id)); @@ -261,7 +261,7 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar if (ret != SUCCESS) { GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu].", i); REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result when HybridModelPipelineExecutor %s.", - i, __FUNCTION__); + i, __FUNCTION__); has_error = true; continue; } diff --git a/ge/hybrid/executor/hybrid_profiler.cc b/ge/hybrid/executor/hybrid_profiler.cc index bbe82dad..de2322b9 100644 --- a/ge/hybrid/executor/hybrid_profiler.cc +++ b/ge/hybrid/executor/hybrid_profiler.cc @@ -49,9 +49,9 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) { va_end(args); auto index = counter_++; if (index >= static_cast(events_.size())) { - GELOGE(INTERNAL_ERROR, + GELOGE(INTERNAL_ERROR, "[Check][Range]index out of range. index = %d, max event size = %zu", index, events_.size()); - REPORT_INNER_ERROR("E19999", "index out of range when HybridProfiler %s. index = %d, max event size = %zu", + REPORT_INNER_ERROR("E19999", "index out of range when HybridProfiler %s. index = %d, max event size = %zu", __FUNCTION__, index, events_.size()); return; } diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc index f30901c9..ddeeaae7 100644 --- a/ge/hybrid/executor/node_state.cc +++ b/ge/hybrid/executor/node_state.cc @@ -67,10 +67,10 @@ Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target Format format = input_desc.GetFormat(); DataType data_type = input_desc.GetDataType(); if (TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size) != GRAPH_SUCCESS) { - GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s].", + GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s].", node_item.NodeName().c_str()); - REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for [%s] when ShapeInferenceState %s.", - node_item.NodeName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for [%s] when ShapeInferenceState %s.", + node_item.NodeName().c_str(), __FUNCTION__); return FAILED; } } @@ -124,18 +124,18 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex } if (context.GetStatus() != SUCCESS) { - GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled.", + GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled.", node_item.NodeName().c_str()); - REPORT_CALL_ERROR("E19999", "[%s] Await pending shape cancelled when %s.", + REPORT_CALL_ERROR("E19999", "[%s] Await pending shape cancelled when %s.", node_item.NodeName().c_str(), __FUNCTION__); break; } } if (!wait_success) { - GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout:%d.", + GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout:%d.", node_item.NodeName().c_str(), kWaitInternal); - REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout:%d when %s.", + REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout:%d when %s.", node_item.NodeName().c_str(), kWaitInternal, __FUNCTION__); return FAILED; } diff --git a/ge/hybrid/executor/rt_callback_manager.cc b/ge/hybrid/executor/rt_callback_manager.cc index 9ceba0a1..51a93938 100644 --- a/ge/hybrid/executor/rt_callback_manager.cc +++ b/ge/hybrid/executor/rt_callback_manager.cc @@ -76,7 +76,7 @@ Status CallbackManager::CallbackProcess(rtContext_t context) { auto rt_err = rtEventSynchronize(event); if (rt_err != RT_ERROR_NONE) { GELOGE(RT_FAILED, "[Invoke][rtEventSynchronize] failed. ret = %d", rt_err); - REPORT_CALL_ERROR("E19999", + REPORT_CALL_ERROR("E19999", "Invoke rtEventSynchronize failed when CallbackManager %s, ret = %d.", __FUNCTION__, rt_err); GE_CHK_RT(rtEventDestroy(event)); return RT_FAILED; diff --git a/ge/hybrid/executor/subgraph_context.cc b/ge/hybrid/executor/subgraph_context.cc index 34cc2bbf..f198de8b 100644 --- a/ge/hybrid/executor/subgraph_context.cc +++ b/ge/hybrid/executor/subgraph_context.cc @@ -50,10 +50,10 @@ NodeStatePtr SubgraphContext::GetOrCreateNodeState(const NodeItem *node_item) { Status SubgraphContext::SetInput(int index, const TensorValue &tensor) { if (static_cast(index) >= all_inputs_.size()) { GELOGE(INTERNAL_ERROR, - "[Check][Param:index]output index output range. all input num = %zu, input index = %d", + "[Check][Param:index]input index out of range. all input num = %zu, input index = %d", all_inputs_.size(), index); - REPORT_INNER_ERROR("E19999", - "input param index out range when SubgraphContext %s, all input num = %zu, input index = %d.", + REPORT_INNER_ERROR("E19999", + "input param index out of range when SubgraphContext %s, all input num = %zu, input index = %d.", __FUNCTION__, all_inputs_.size(), index); return INTERNAL_ERROR; } @@ -70,9 +70,9 @@ Status SubgraphContext::SetOutput(const NodeItem &node_item, int output_index, c auto index = node_item.output_start + output_index; if ((output_index >= node_item.num_outputs) || (static_cast(index) >= all_outputs_.size())) { GELOGE(INTERNAL_ERROR, - "[Check][Param:output_index]output index output range. all output num = %zu, node_item = %s," + "[Check][Param:output_index]output index out of range. all output num = %zu, node_item = %s," "output index = %d.", all_outputs_.size(), node_item.DebugString().c_str(), output_index); - REPORT_INNER_ERROR("E19999", "output index output range when SubgraphContext %s. " + REPORT_INNER_ERROR("E19999", "output index out of range when SubgraphContext %s. " "all output num = %zu, node_item = %s, output index = %d.", __FUNCTION__, all_outputs_.size(), node_item.DebugString().c_str(), output_index); return INTERNAL_ERROR; @@ -129,9 +129,10 @@ Status SubgraphContext::Await(const NodePtr &node) { void SubgraphContext::OnError(Status error) { if (error != END_OF_SEQUENCE) { - GELOGE(error, "[Check][Param:error][%s] Error occurred while executing graph.", graph_item_->GetName().c_str()); - REPORT_INNER_ERROR("E19999", "[%s] Error occurred while executing graph when SubgraphContext %s.", - graph_item_->GetName().c_str(), __FUNCTION__); + GELOGE(error, "[Check][Param:error][%s] Error:%d occurred while executing graph.", + graph_item_->GetName().c_str(), error); + REPORT_INNER_ERROR("E19999", "[%s] Error:%d occurred while executing graph when SubgraphContext %s.", + graph_item_->GetName().c_str(), error, __FUNCTION__); } node_done_manager_.Destroy(); } diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc index 3bd78579..4c8b1bc1 100644 --- a/ge/hybrid/executor/subgraph_executor.cc +++ b/ge/hybrid/executor/subgraph_executor.cc @@ -44,7 +44,7 @@ Status SubgraphExecutor::Init(const std::vector &inputs, const std::vector &input_desc) { subgraph_context_.reset(new(std::nothrow)SubgraphContext(graph_item_, context_)); GE_CHECK_NOTNULL(subgraph_context_); - GE_CHK_STATUS_RET(subgraph_context_->Init(), + GE_CHK_STATUS_RET(subgraph_context_->Init(), "[Init][SubgraphContext][%s] Failed to init subgraph context.", graph_item_->GetName().c_str()); shape_inference_engine_.reset(new(std::nothrow) ShapeInferenceEngine(context_, subgraph_context_.get())); @@ -68,11 +68,12 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorGetInputNodes(); if (inputs.size() < input_nodes.size()) { - GELOGE(INTERNAL_ERROR, + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.", graph_item_->GetName().c_str(), inputs.size(), input_nodes.size()); - REPORT_INNER_ERROR("E19999", - "Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs when SubgraphExecutor %s.", + REPORT_INNER_ERROR("E19999", + "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs," + "check invalid when SubgraphExecutor %s.", graph_item_->GetName().c_str(), inputs.size(), input_nodes.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -117,8 +118,9 @@ Status SubgraphExecutor::InitInputsForKnownShape(const std::vector GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs", graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1); - REPORT_INNER_ERROR("E19999", - "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs when %s.", + REPORT_INNER_ERROR("E19999", + "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs," + "check invalid when %s.", graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1, __FUNCTION__); return INTERNAL_ERROR; } @@ -387,9 +389,10 @@ Status SubgraphExecutor::GetOutputs(vector &outputs, std::vectorGetName().c_str()); if (outputs.size() != output_desc.size()) { GELOGE(INTERNAL_ERROR, - "[Check][Size]Number of output tensors(%zu) mismatch number of output tensor desc(%zu).", + "[Check][Size]Number of outputs(%zu) mismatch number of output_desc(%zu).", outputs.size(), output_desc.size()); - REPORT_INNER_ERROR("E19999", "Number of output tensors(%zu) mismatch number of output tensor desc(%zu) when %s.", + REPORT_INNER_ERROR("E19999", "Number of outputs(%zu) mismatch number of output_desc(%zu)," + "check invlid when SubgraphExecutor %s.", outputs.size(), output_desc.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -413,9 +416,11 @@ Status SubgraphExecutor::SetOutputsToParentNode(TaskContext &task_context) { "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str()); if (outputs.size() != output_desc_list.size()) { - GELOGE(INTERNAL_ERROR, "[Check][Size][%s] num output tensors = %zu, num output tensor desc = %zu", + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] num of output tensors = %zu, num of output tensor desc = %zu not equal", graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size()); - REPORT_INNER_ERROR("E19999", "[%s] num output tensors = %zu, num output tensor desc = %zu when %s", + REPORT_INNER_ERROR("E19999", + "%s num of output tensors = %zu, num of output tensor desc = %zu not equal," + "check invalid when SubgraphExecutor %s", graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size(), __FUNCTION__); return INTERNAL_ERROR; } diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index 797c0092..8dfdb476 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -105,7 +105,7 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) { "[Check][Size][%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s.", node_item.NodeName().c_str(), output_idx, tensor_size, output_tensor->DebugString().c_str()); - REPORT_INNER_ERROR("E19999", + REPORT_INNER_ERROR("E19999", "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s when %s.", node_item.NodeName().c_str(), output_idx, tensor_size, output_tensor->DebugString().c_str(), __FUNCTION__); @@ -423,7 +423,7 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const "[Check][Size] for [%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu.", task_context.GetNodeName(), i, expected_size, input_tensor->GetSize()); REPORT_INNER_ERROR("E19999", "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu when %s.", - task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__); + task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__); return INTERNAL_ERROR; } } diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index f8fbbb9c..38c0fdec 100755 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -205,8 +205,8 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) { auto peer_input_desc = peer_op_desc->MutableInputDesc(peer_anchor->GetIdx()); if (peer_input_desc == nullptr) { GELOGE(GRAPH_FAILED, "[Call][MutableInputDesc] for %s return nullptr.", peer_op_desc->GetName().c_str()); - REPORT_CALL_ERROR("E19999", "%s call MutableInputDesc return nullptr when ShapeInferenceEngine %s.", - peer_op_desc->GetName().c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "%s call MutableInputDesc return nullptr when ShapeInferenceEngine %s.", + peer_op_desc->GetName().c_str(), __FUNCTION__); continue; } @@ -231,11 +231,11 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, const auto &tensor_shape = tensor_desc.MutableShape(); if (tensor_shape.IsUnknownShape()) { if (!fallback_with_range) { - GELOGE(INTERNAL_ERROR, - "[Is][UnknownShape] Output shape is still unknown after shape inference. shape = [%s].", + GELOGE(INTERNAL_ERROR, + "[Is][UnknownShape] Output shape is still unknown after shape inference. shape = [%s].", tensor_shape.ToString().c_str()); REPORT_INNER_ERROR("E19999", "Output shape is still unknown after shape inference. " - "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__); + "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -243,10 +243,10 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, std::vector> shape_range; GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range"); if (shape_range.size() != shape.size()) { - GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu).", + GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu).", shape_range.size(), shape.size()); - REPORT_INNER_ERROR("E19999", "Number of shape ranges (%zu) mismatches that of dims (%zu)" - " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__); + REPORT_INNER_ERROR("E19999", "Number of shape ranges (%zu) mismatches that of dims (%zu)" + " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__); return INTERNAL_ERROR; } @@ -270,10 +270,10 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str()); uint32_t type_size; if (!TypeUtils::GetDataTypeLength(data_type, type_size)) { - GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s.", + GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s.", TypeUtils::DataTypeToSerialString(data_type).c_str()); - REPORT_CALL_ERROR("E19999", "GetDataTypeLength failed for type:%s when ShapeInferenceEngine %s.", - TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__); + REPORT_CALL_ERROR("E19999", "GetDataTypeLength failed for type:%s when ShapeInferenceEngine %s.", + TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__); return INTERNAL_ERROR; } @@ -300,23 +300,24 @@ Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bo const auto &shape = tensor_desc->MutableShape(); // modify on copy auto dims = shape.GetDims(); - auto _status = CanonicalizeShape(*tensor_desc, dims, fallback_with_range); - if(_status != SUCCESS){ - REPORT_CALL_ERROR("E19999", "Invoke CanonicalizeShape failed when ShapeInferenceEngine %s, node:%s, output:%zu.", - node_item.NodeName().c_str(), __FUNCTION__, output_index); + auto status_result = CanonicalizeShape(*tensor_desc, dims, fallback_with_range); + if (status_result != SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Invoke CanonicalizeShape failed when ShapeInferenceEngine %s, node:%s, output:%zu.", + node_item.NodeName().c_str(), __FUNCTION__, output_index); GELOGE(ge::FAILED, "[Canonicalize][Shape] failed for [%s], output %zu.", - node_item.NodeName().c_str(), output_index); - return _status; + node_item.NodeName().c_str(), output_index); + return status_result; } int64_t tensor_size; - _status = CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size); - if(_status != SUCCESS){ + status_result = CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size); + if (status_result != SUCCESS) { REPORT_CALL_ERROR("E19999", "Invoke CalcTensorSize failed when ShapeInferenceEngine %s, node:%s, output:%zu.", node_item.NodeName().c_str(), __FUNCTION__, output_index); GELOGE(ge::FAILED, "[Calc][TensorSize] failed for [%s], output %zu.", - node_item.NodeName().c_str(), output_index); - return _status; - } + node_item.NodeName().c_str(), output_index); + return status_result; + } GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size); (void) TensorUtils::SetSize(*tensor_desc, tensor_size); } diff --git a/ge/hybrid/model/graph_item.cc b/ge/hybrid/model/graph_item.cc index 8204b1c8..09e0a117 100644 --- a/ge/hybrid/model/graph_item.cc +++ b/ge/hybrid/model/graph_item.cc @@ -95,8 +95,8 @@ Status GraphItem::GroupNodes() { int group = node->group; if (group != last_group) { if (seen_groups.find(group) != seen_groups.end()) { - GELOGE(INTERNAL_ERROR, - "[Order][NodeGroup]Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group); + GELOGE(INTERNAL_ERROR, + "[Find][Group]Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group); return INTERNAL_ERROR; } else { last_group = group; diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index e18ba2ca..ff107fe6 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -71,8 +71,10 @@ Status SetOutputNameAttr(ComputeGraph &graph) { } } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&graph, ATTR_MODEL_OUT_NODES_NAME, output_names), - GELOGE(FAILED, "[Invoke][SetListStr] of ATTR_MODEL_OUT_NODES_NAME failed."); - return FAILED); + GELOGE(FAILED, "[Invoke][SetListStr] failed, name:%s.", ATTR_MODEL_OUT_NODES_NAME.c_str()); + REPORT_CALL_ERROR("E19999", "SetListStr failed when %s, name:%s.", + __FUNCTION__, ATTR_MODEL_OUT_NODES_NAME.c_str()); + return FAILED); return SUCCESS; } @@ -109,10 +111,11 @@ Status CollectDependenciesForFusedGraph(NodeItem &node_item, std::set GE_CHECK_NOTNULL(src_op_desc); if (src_node->GetType() != DATA_TYPE) { GELOGE(UNSUPPORTED, - "[Check][NodeType][%s::%s] Node in fused subgraph can only depend on Data nodes, but depend on %s", + "[Check][NodeType][%s::%s] Node in fused subgraph can only depend on Data nodes," + "but depend on %s actually", node_item.NodeName().c_str(), node->GetName().c_str(), src_node->GetType().c_str()); - REPORT_INNER_ERROR("E19999", - "[%s::%s] Node in fused subgraph can only depend on Data nodes, but depend on %s when %s.", + REPORT_INNER_ERROR("E19999", "[%s::%s] Node in fused subgraph can only depend on Data nodes," + " but depend on %s actually, check invalid when %s.", node_item.NodeName().c_str(), node->GetName().c_str(), src_node->GetType().c_str(), __FUNCTION__); return UNSUPPORTED; } @@ -134,12 +137,13 @@ Status HybridModelBuilder::Build() { hybrid_model_.model_name_ = ge_root_model_->GetRootGraph()->GetName(); GELOGI("[%s] Start to build hybrid model.", GetGraphName()); GE_CHK_STATUS_RET(InitRuntimeParams(), "[Invoke][InitRuntimeParams] failed, model_name_:[%s]", GetGraphName()); - GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), "[Invoke][RecoverGraphUnknownFlag] failed, model_name_:[%s]", GetGraphName()); + GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), + "[Invoke][RecoverGraphUnknownFlag] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(IndexSpecialNodes(), "[Invoke][IndexSpecialNodes] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(IndexTaskDefs(), "[Invoke][IndexTaskDefs] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(InitWeights(), "[Invoke][InitWeights] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(LoadGraph(), "[Invoke][LoadGraph] failed, model_name_:[%s]", GetGraphName()); - GE_CHK_STATUS_RET(AssignUninitializedConstantOps(), + GE_CHK_STATUS_RET(AssignUninitializedConstantOps(), "[Invoke][AssignUninitializedConstantOps] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(TransAllVarData(), "[Invoke][TransAllVarData] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(CopyVarData(), "[Invoke][CopyVarData] failed, model_name_:[%s]", GetGraphName()); @@ -335,6 +339,8 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s if (input_index < 0) { GELOGE(INTERNAL_ERROR, "[Get][InputIndex]failed, node:[%s] inputname: %s.", node_item.NodeName().c_str(), input_name.c_str()); + REPORT_CALL_ERROR("E19999", "GetInputIndexByName failed when HybridModelBuilder %s, node:[%s] inputname: %s.", + __FUNCTION__, node_item.NodeName().c_str(), input_name.c_str()); return INTERNAL_ERROR; } @@ -423,20 +429,24 @@ Status HybridModelBuilder::UpdateAnchorStatus(const NodePtr &node) { auto peer_anchor = anchor->GetPeerOutAnchor(); if (peer_anchor == nullptr) { if (AnchorUtils::SetStatus(anchor, ANCHOR_SUSPEND) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed, node:[%s].", node->GetName().c_str()); - REPORT_CALL_ERROR("E19999", "SetStatus failed, node:[%s] when %s.", node->GetName().c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed to set ANCHOR_SUSPEND, node:[%s].", + node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed to set ANCHOR_SUSPEND, node:[%s] when HybridModelBuilder %s.", + node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } } else if (peer_anchor->GetOwnerNode()->GetType() == CONSTANT) { if (AnchorUtils::SetStatus(anchor, ANCHOR_CONST) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed, node:[%s].", node->GetName().c_str()); - REPORT_CALL_ERROR("E19999", "SetStatus failed, node:[%s] when %s.", node->GetName().c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed to set ANCHOR_CONST, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed to set ANCHOR_CONST, node:[%s] when HybridModelBuilder %s.", + node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } } else { if (AnchorUtils::SetStatus(anchor, ANCHOR_DATA) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed, node:[%s].", node->GetName().c_str()); - REPORT_CALL_ERROR("E19999", "SetStatus failed, node:[%s] when %s.", node->GetName().c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][SetStatus] failed to set ANCHOR_DATA, node:[%s].", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "SetStatus failed to set ANCHOR_DATA, node:[%s] when HybridModelBuilder %s.", + node->GetName().c_str(), __FUNCTION__); return INTERNAL_ERROR; } } @@ -563,6 +573,8 @@ Status HybridModelBuilder::MergeNetOutputNode(ComputeGraph &graph) { if (input_desc == nullptr) { GELOGE(INTERNAL_ERROR, "[Invoke][MutableInputDesc][%s] Failed to get input desc[%d]", net_output_desc->GetName().c_str(), index); + REPORT_CALL_ERROR("E19999", "[%s] Failed to get input desc[%d] when HybridModelBuilder %s.", + net_output_desc->GetName().c_str(), index, __FUNCTION__); return INTERNAL_ERROR; } @@ -763,7 +775,7 @@ Status HybridModelBuilder::LoadGraph() { GELOGI("Before merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", root_graph->GetDirectNodesSize(), root_graph->GetAllNodesSize()); - GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraphs(root_graph, merged_graph), + GE_CHK_GRAPH_STATUS_RET(UnfoldSubgraphs(root_graph, merged_graph), "[Invoke][UnfoldSubgraphs]Failed to unfold subgraphs, model_name_:%s.", GetGraphName()); root_graph = std::move(merged_graph); GELOGI("After merging subgraphs DirectNodesSize = %zu, GetAllNodesSize = %zu", @@ -786,7 +798,7 @@ Status HybridModelBuilder::LoadGraph() { op_desc->SetId(index++); } GE_DUMP(root_graph, "hybrid_merged_graph"); - GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), + GE_CHK_STATUS_RET(LoadDynamicSubgraph(*root_graph, true), "[Invoke][LoadDynamicSubgraph]Failed to load root graph, model_name_:%s.", GetGraphName()); GELOGD("Done loading root graph successfully."); GE_CHK_STATUS_RET(hybrid_model_.root_graph_item_->GroupNodes(), @@ -825,7 +837,7 @@ Status HybridModelBuilder::LoadGraph() { } } - GE_CHK_STATUS_RET(ParseDependentByParallelGroup(), + GE_CHK_STATUS_RET(ParseDependentByParallelGroup(), "[Invoke][ParseDependentByParallelGroup]Failed to establish dependencies for hccl ops, model_name_:%s.", GetGraphName()); GELOGI("Done loading all subgraphs successfully."); @@ -941,7 +953,7 @@ Status HybridModelBuilder::InitConstantOps() { auto op_desc = var_node->GetOpDesc(); auto v_weights = ModelUtils::GetWeights(op_desc); if (v_weights.empty()) { - GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Constant no not have value", var_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] Constant op has no weight", var_node->GetName().c_str()); return INTERNAL_ERROR; } auto *ge_tensor = const_cast(v_weights[0].get()); @@ -955,7 +967,7 @@ Status HybridModelBuilder::InitConstantOps() { GELOGD("Init tensor with host constant %s size = %zu", var_name.c_str(), aligned_tensor.MutableData().GetSize()); if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(aligned_tensor.GetAlignedPtr(), aligned_tensor.GetData().size()) == nullptr) { - GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed, model_name_:%s.", GetGraphName()); + GELOGE(MEMALLOC_FAILED, "[Malloc][HostMemory] for an existed GeTensor failed, model_name_:%s.", GetGraphName()); return MEMALLOC_FAILED; } var_tensor.reset(new(std::nothrow)TensorValue(aligned_tensor.MutableData().data(), @@ -1006,6 +1018,8 @@ Status HybridModelBuilder::InitVariableTensors() { int64_t tensor_size = 0; if (TensorUtils::CalcTensorMemSize(output_tensor.GetShape(), output_tensor.GetFormat(), output_tensor.GetDataType(), tensor_size) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed when HybridModelBuilder %s, node name:%s", + __FUNCTION__, it.first.c_str()); GELOGE(INTERNAL_ERROR, "[Calculate][TensorMemSize] failed, node name:%s", it.first.c_str()); return INTERNAL_ERROR; } @@ -1016,7 +1030,7 @@ Status HybridModelBuilder::InitVariableTensors() { } if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(mem_info.host_aligned_ptr, tensor_size) == nullptr) { - GELOGE(MEMALLOC_FAILED, + GELOGE(MEMALLOC_FAILED, "[Malloc][HostMem] for an existed GeTensor failed, Host variable [%s].", it.first.c_str()); return MEMALLOC_FAILED; } @@ -1078,7 +1092,7 @@ Status HybridModelBuilder::InitWeights() { const GeTensorDesc &tensor_desc = ge_tensor->GetTensorDesc(); int64_t tensor_size = 0; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*op_desc->MutableOutputDesc(0), tensor_size), - "[Invoke][GetSize][%s] Failed to get tensor size", + "[Invoke][GetSize][%s] Failed to get output tensor size", node->GetName().c_str()); int64_t data_offset = 0; GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetDataOffset(tensor_desc, data_offset), @@ -1333,16 +1347,18 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, GELOGD("To get peer node of %s::%s", sub_graph->GetName().c_str(), data_node->GetName().c_str()); auto wrapped_node = data_node->GetOwnerComputeGraph()->GetParentNode(); if (wrapped_node == nullptr) { + REPORT_INNER_ERROR("E19999", "[%s] Node is in root graph when HybridModelBuilder %s.", + data_node->GetName().c_str(), __FUNCTION__); GELOGE(INTERNAL_ERROR, "[Invoke][GetParentNode][%s] Node is in root graph.", data_node->GetName().c_str()); return INTERNAL_ERROR; } auto data_op_desc = data_node->GetOpDesc(); uint32_t parent_index = 0; if (!AttrUtils::GetInt(data_op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { - GELOGE(INTERNAL_ERROR, - "[Invoke][GetInt][%s] Failed to get attr [%s]", - data_op_desc->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "[%s] Failed to get attr [%s] when HybridModelBuilder %s.", + data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt][%s] Failed to get attr [%s]", + data_op_desc->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return INTERNAL_ERROR; } @@ -1350,7 +1366,9 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, GE_CHECK_NOTNULL(wrapped_node_in_anchor); auto src_out_anchor = wrapped_node_in_anchor->GetPeerOutAnchor(); if (src_out_anchor == nullptr || src_out_anchor->GetOwnerNode() == nullptr) { - GELOGE(INTERNAL_ERROR, + REPORT_INNER_ERROR("E19999", "[%s] Parent node do not have peer anchor when HybridModelBuilder %s.", + data_node->GetName().c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Check][ParentNode][%s] Parent node do not have peer anchor.", data_node->GetName().c_str()); return INTERNAL_ERROR; } @@ -1374,10 +1392,13 @@ Status HybridModelBuilder::GetPeerNodeAcrossSubGraphs(const NodePtr &data_node, auto src_graph = NodeUtils::GetSubgraph(*src_wrapped_node, kSubgraphIndex); GE_CHECK_NOTNULL(src_graph); auto src_net_output_node = src_graph->FindFirstNodeMatchType(NETOUTPUT); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(src_net_output_node == nullptr, - return INTERNAL_ERROR, - "[Invoke][FindFirstNodeMatchType]Failed to find NetOutput in subgraph: %s", - src_graph->GetName().c_str()); + if (src_net_output_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Failed to find NetOutput in subgraph: %s when HybridModelBuilder %s", + src_graph->GetName().c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][FindFirstNodeMatchType]Failed to find NetOutput in subgraph: %s", + src_graph->GetName().c_str()); + return INTERNAL_ERROR; + } auto net_output_desc = src_net_output_node->GetOpDesc(); GE_CHECK_NOTNULL(net_output_desc); @@ -1567,10 +1588,10 @@ Status HybridModelBuilder::GetParentNodeOutputIndex(const OpDesc &op_desc, int i auto input_desc = op_desc.MutableInputDesc(index); GE_CHECK_NOTNULL(input_desc); if (!AttrUtils::GetInt(input_desc, ATTR_NAME_PARENT_NODE_INDEX, out_index)) { - GELOGE(INTERNAL_ERROR, "[Invoke][GetInt]NetOutput input tensor %d, attr %s not found.", - index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); - REPORT_CALL_ERROR("E19999", "NetOutput input tensor %d, attr %s not found when %s.", - index, ATTR_NAME_PARENT_NODE_INDEX.c_str(), __FUNCTION__); + GELOGE(INTERNAL_ERROR, "[Invoke][GetInt]NetOutput %s input tensor %d, attr %s not found.", + op_desc.GetName().c_str(), index, ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "NetOutput %s input tensor %d, attr %s not found when %s.", + op_desc.GetName().c_str(), index, ATTR_NAME_PARENT_NODE_INDEX.c_str(), __FUNCTION__); return INTERNAL_ERROR; } return SUCCESS; @@ -2026,10 +2047,10 @@ Status HybridModelBuilder::BuildInputMapping(GraphItem &graph_item, data_op_index++; } else { if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, data_index)) { - GELOGE(FAILED, - "[Invoke][GetInt][%s] Failed to get attr [%s]", - node->GetName().c_str(), - ATTR_NAME_PARENT_NODE_INDEX.c_str()); + GELOGE(FAILED, "[Invoke][GetInt][%s] Failed to get attr [%s]", + node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + REPORT_CALL_ERROR("E19999", "call GetInt failed when HybridModelBuilder %s, [%s] Failed to get attr [%s]", + __FUNCTION__, node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); return FAILED; } }