!1389 add report errmsg and modify geloge

Merge pull request !1389 from ldy2021/master
pull/1389/MERGE
计晨 authored 4 years ago, committed by Gitee
commit 49b2c8bdba

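The whole PR applies a single error-reporting convention across the hybrid executor: every failure path now pairs the developer log (GELOGE, whose message gains a [Verb][Object] prefix naming the failing step) with a user-visible error report tagged "E19999" and carrying __FUNCTION__ — REPORT_INNER_ERROR for GE-internal parameter/state checks, REPORT_CALL_ERROR for failed calls into other components such as the runtime, though the hunks below are not fully consistent about that split. A minimal sketch of the pattern; the class and function names here are illustrative, not from the repo:

Status DemoExecutor::RecordEvent(rtEvent_t event, rtStream_t stream) {
  auto rt_ret = rtEventRecord(event, stream);  // runtime call, as in the hunks below
  if (rt_ret != RT_ERROR_NONE) {
    // Device-side log: the [Verb][Object] prefix identifies the failing step.
    GELOGE(RT_FAILED, "[Invoke][rtEventRecord] failed, error code = %d", rt_ret);
    // Paired report surfaces the same failure to the user-facing error store;
    // REPORT_CALL_ERROR because an external (runtime) call failed.
    REPORT_CALL_ERROR("E19999", "Invoke rtEventRecord failed when %s, error code = %d",
                      __FUNCTION__, rt_ret);
    return RT_FAILED;
  }
  return SUCCESS;
}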
@@ -38,8 +38,11 @@ AllocationAttr::AllocationAttr(void *try_reuse_addr) : AllocationAttr(0, try_reuse_addr
 NpuMemoryAllocator *NpuMemoryAllocator::GetAllocator() {
   int32_t device_id = 0;
-  if (rtGetDevice(&device_id) != RT_ERROR_NONE) {
-    GELOGE(RT_FAILED, "Failed to get device id");
+  auto rt_result = rtGetDevice(&device_id);
+  if (rt_result != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "[Get][Device] Failed, result:%d.", rt_result);
+    REPORT_INNER_ERROR("E19999", "rtGetDevice failed when NpuMemoryAllocator %s, result:%d.",
+                       __FUNCTION__, rt_result);
     return nullptr;
   }
@@ -57,7 +60,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
   }
   if (allocate_size == 0) {
-    GELOGE(MEMALLOC_FAILED, "Memory size is 0, device_id = %u, size = %zu", device_id_, allocate_size);
+    GELOGE(MEMALLOC_FAILED, "[Check][Param:size_t]Memory size is 0, device_id = %u, size = %zu.",
+           device_id_, allocate_size);
+    REPORT_INNER_ERROR("E19999", "Memory size is 0, device_id = %u, size = %zu when %s.",
+                       device_id_, allocate_size, __FUNCTION__);
     return nullptr;
   }
@@ -68,7 +74,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
     buffer = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(allocate_size);
   } else {
     if (allocate_size > kMaxHbmMemorySize) {
-      GELOGE(PARAM_INVALID, "Invalid HBM memory size: %zu", allocate_size);
+      GELOGE(PARAM_INVALID, "[Check][Param:size_t]Invalid HBM memory size: %zu bigger than limit:%lu, check invalid.",
+             allocate_size, kMaxHbmMemorySize);
+      REPORT_CALL_ERROR("E19999", "Invalid HBM memory size: %zu bigger than limit:%lu, check invalid when %s.",
+                        allocate_size, kMaxHbmMemorySize, __FUNCTION__);
       return nullptr;
     }
     void *try_reuse_addr = nullptr;
@@ -87,7 +96,10 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
                  .Malloc(allocate_size, reinterpret_cast<uint8_t *>(try_reuse_addr), device_id_);
   }
   if (buffer == nullptr) {
-    GELOGE(MEMALLOC_FAILED, "Failed to malloc memory, device_id = %u, size = %zu", device_id_, allocate_size);
+    GELOGE(MEMALLOC_FAILED, "[Malloc][Memory] Failed, device_id = %u, size = %zu.",
+           device_id_, allocate_size);
+    REPORT_CALL_ERROR("E19999", "malloc memory failed, device_id = %u, size = %zu when %s.",
+                      device_id_, allocate_size, __FUNCTION__);
     return nullptr;
   }

@@ -32,7 +32,8 @@ std::unique_ptr<TensorBuffer> TensorBuffer::Create(NpuMemoryAllocator *allocator
   }
   if (allocator == nullptr) {
-    GELOGE(INTERNAL_ERROR, "allocator is NULL");
+    GELOGE(INTERNAL_ERROR, "[Check][Param:NpuMemoryAllocator] allocator is NULL.");
+    REPORT_INNER_ERROR("E19999", "input allocator is NULL, when %s.", __FUNCTION__);
     return nullptr;
   }
@@ -42,7 +43,8 @@ std::unique_ptr<TensorBuffer> TensorBuffer::Create(NpuMemoryAllocator *allocator
   }
   buffer = allocator->Allocate(size, attr);
   if (buffer == nullptr) {
-    GELOGE(MEMALLOC_FAILED, "Failed to allocate memory. size = %zu", size);
+    GELOGE(MEMALLOC_FAILED, "[Allocate][Memory] Failed. size = %zu.", size);
+    REPORT_CALL_ERROR("E19999", "allocate failed, size = %zu, when %s.", size, __FUNCTION__);
     return nullptr;
   }

@@ -59,7 +59,9 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) {
     return SUCCESS;
   }
-  GELOGE(RT_FAILED, "Failed to invoke rtStreamSynchronize, ret = %d", rt_ret);
+  GELOGE(RT_FAILED, "[Invoke][rtStreamSynchronize] failed, ret = %d", rt_ret);
+  REPORT_CALL_ERROR("E19999",
+                    "invoke rtStreamSynchronize failed when GraphExecutionContext %s, ret = %d", __FUNCTION__, rt_ret);
   return RT_FAILED;
 }
 }  // namespace hybrid

@@ -51,8 +51,13 @@ void HybridModelAsyncExecutor::SetModelName(const string &model_name) {
 }
 Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr<InputDataWrapper> &data) {
-  GE_CHK_STATUS_EXEC(data_inputer_->Push(data), return domi::DATA_QUEUE_ISFULL,
-                     "Data queue is full, please call again later, model_id %u ", model_id_);
+  if (data_inputer_->Push(data) != SUCCESS) {
+    REPORT_CALL_ERROR("E19999", "Data queue is full, please call again later when %s, model_id %u.",
+                      __FUNCTION__, model_id_);
+    GELOGE(domi::DATA_QUEUE_ISFULL,
+           "[Push][Data] Data queue is full, please call again later, model_id %u ", model_id_);
+    return domi::DATA_QUEUE_ISFULL;
+  }
   GELOGD("EnqueueData successfully. model_id = %u, data_index = %u", data->GetInput().model_id, data->GetInput().index);
   return SUCCESS;
 }
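The EnqueueData hunk above also shows the other recurring change in this PR: single-statement GE_CHK_* macros are unrolled into explicit if blocks wherever a REPORT_* call has to fire before the early return, because the macros check, log and return in one expression and leave no seam for the extra report. A before/after sketch, abridged from the Start() hunk that follows:

// Before: check, log and return in one macro; nowhere to hook the report.
GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, "Model already started.");

// After: the unrolled form emits the report alongside the log.
if (run_flag_) {
  REPORT_INNER_ERROR("E19999", "Model already started when %s, model_id:%u.",
                     __FUNCTION__, model_id_);
  GELOGE(INTERNAL_ERROR, "[Check][RunState] Model already started, model_id:%u.", model_id_);
  return INTERNAL_ERROR;
}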
@@ -60,8 +65,12 @@ Status HybridModelAsyncExecutor::EnqueueData(const shared_ptr<InputDataWrapper>
 Status HybridModelAsyncExecutor::Start(const std::shared_ptr<ModelListener> &listener) {
   GELOGD("HybridModelExecutor::Start IN, has listener = %d", listener != nullptr);
   std::lock_guard<std::mutex> lk(mu_);
-  GE_CHK_BOOL_RET_STATUS(!run_flag_, INTERNAL_ERROR, "Model already started.");
+  if (run_flag_) {
+    REPORT_INNER_ERROR("E19999",
+                       "Model already started when HybridModelAsyncExecutor %s, model_id:%u.", __FUNCTION__, model_id_);
+    GELOGE(INTERNAL_ERROR, "[Check][RunState] Model already started, model_id:%u.", model_id_);
+    return INTERNAL_ERROR;
+  }
   run_flag_ = true;
   listener_ = listener;
   future_ = std::async(std::launch::async, [&]() -> Status {
@@ -71,7 +80,8 @@ Status HybridModelAsyncExecutor::Start(const std::shared_ptr<ModelListener> &lis
     return RunInternal();
   });
-  GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR, "Failed to start.");
+  GE_CHK_BOOL_RET_STATUS(future_.valid(), INTERNAL_ERROR,
+                         "[Check][RunState] Failed to start, model_id:%u.", model_id_);
   GELOGD("HybridModelExecutor::Start successfully");
   return SUCCESS;
 }
@@ -105,26 +115,29 @@ Status HybridModelAsyncExecutor::Init() {
   executor_ = std::unique_ptr<HybridModelExecutor>(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_));
   GE_CHECK_NOTNULL(executor_);
-  GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine");
-  GE_CHK_STATUS_RET(DumpOpDebug(), "Dump op debug failed in hybrid engine");
+  GE_CHK_STATUS_RET(executor_->Init(),
+                    "[Init][HybridModelExecutor] failed, model_id:%u.", model_id_);
+  GE_CHK_STATUS_RET(DumpOpDebug(), "[Dump][OpDebug] failed, model_id:%u.", model_id_);
   GELOGI("HybridModel stage nums:%zu", model_->GetRootGraphItem()->NumGroups());
   if (model_->GetRootGraphItem()->NumGroups() >= kMinimumPiplineStages) {
     pipe_executor_ =
         std::unique_ptr<HybridModelPipelineExecutor>(new(std::nothrow) HybridModelPipelineExecutor(model_, device_id_));
     GE_CHECK_NOTNULL(pipe_executor_);
-    GE_CHK_STATUS_RET(pipe_executor_->Init(), "Failed to init hybrid engine");
+    GE_CHK_STATUS_RET(pipe_executor_->Init(),
+                      "[Init][HybridModelPipelineExecutor] failed, model_id:%u.", model_id_);
   }
-  GE_CHK_STATUS_RET(InitInputDesc(), "Failed to init input tensors");
+  GE_CHK_STATUS_RET(InitInputDesc(), "[Init][InputDesc] failed, model_id:%u.", model_id_);
   return SUCCESS;
 }
 Status HybridModelAsyncExecutor::PreRun(InputData &current_data, HybridModelExecutor::ExecuteArgs &args) {
-  GE_CHK_STATUS_RET(SyncVarData(), "Failed to sync var data");
+  GE_CHK_STATUS_RET(SyncVarData(), "[Invoke][SyncVarData] failed, model_id:%u.", model_id_);
   RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End");
-  GE_CHK_STATUS_RET(PrepareInputs(current_data, args), "Failed to copy input data to model");
+  GE_CHK_STATUS_RET(PrepareInputs(current_data, args),
+                    "[Invoke][PrepareInputs] failed to copy input data to model, model_id:%u.", model_id_);
   RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End");
   return SUCCESS;
 }
@@ -155,7 +168,7 @@ Status HybridModelAsyncExecutor::RunInternal() {
     GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
         ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput());
         CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
-        continue, "PreRun failed.");  // [No need to check value]
+        continue, "[Invoke][PreRun] failed, model_id:%u.", model_id_);  // [No need to check value]
     if (pipe_executor_ != nullptr) {
       GELOGI("HybridModel will execute in pipeline mode");
@@ -199,7 +212,9 @@ Status HybridModelAsyncExecutor::HandleResult(Status exec_ret,
   }
   if (exec_ret != SUCCESS) {
-    GELOGE(exec_ret, "Failed to execute graph. model_id = %u", model_id_);
+    GELOGE(exec_ret, "[Check][Param:Status] failed to execute graph. model_id = %u", model_id_);
+    REPORT_INNER_ERROR("E19999",
+                       "failed to execute graph when HybridModelAsyncExecutor %s. model_id = %u", __FUNCTION__, model_id_);
     return OnComputeDone(data_id, INTERNAL_ERROR, output_tensor_info_list);
   }
@@ -235,8 +250,12 @@ Status HybridModelAsyncExecutor::SyncVarData() {
 Status HybridModelAsyncExecutor::PrepareInputs(const InputData &current_data, HybridModelExecutor::ExecuteArgs &args) {
   if (current_data.blobs.size() < input_tensor_desc_.size()) {
-    GELOGE(PARAM_INVALID, "Blob size mismatches, expect at least %zu, but got %zu",
-           input_tensor_desc_.size(), current_data.blobs.size());
+    GELOGE(PARAM_INVALID,
+           "[Check][Size]Blob size mismatches, expect at least %zu, but got %zu, model_id = %u",
+           input_tensor_desc_.size(), current_data.blobs.size(), model_id_);
+    REPORT_INNER_ERROR("E19999",
+                       "Blob size mismatches, expect at least %zu, but got %zu when HybridModelAsyncExecutor %s, model_id = %u.",
+                       input_tensor_desc_.size(), current_data.blobs.size(), __FUNCTION__, model_id_);
     return PARAM_INVALID;
   }
@@ -248,8 +267,12 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData &current_data, Hy
     auto tensor_size = input_sizes_[input_index];
     if (is_input_dynamic_[input_index]) {
       if (input_index >= current_data.shapes.size()) {
-        GELOGE(PARAM_INVALID, "Shape index out of range, index = %zu, shape size = %zu",
-               input_index, current_data.shapes.size());
+        GELOGE(PARAM_INVALID,
+               "[Check][Range]Shape index out of range, index = %zu, shape size = %zu, model_id = %u.",
+               input_index, current_data.shapes.size(), model_id_);
+        REPORT_INNER_ERROR("E19999",
+                           "Shape index out of range, index = %zu, shape size = %zu when HybridModelAsyncExecutor %s, model_id = %u.",
+                           input_index, current_data.shapes.size(), __FUNCTION__, model_id_);
         return PARAM_INVALID;
       }
       auto &tensor_desc = input_tensor_desc_[input_index];
@@ -257,15 +280,19 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData &current_data, Hy
       std::vector<std::pair<int64_t, int64_t>> range;
       auto range_ret = tensor_desc->GetShapeRange(range);
       GE_CHK_BOOL_RET_STATUS(range_ret == GRAPH_SUCCESS, INTERNAL_ERROR,
-                             "Get shape range failed, ret=%u.", range_ret);
+                             "[Invoke][GetShapeRange] failed, ret=%u, model_id = %u.", range_ret, model_id_);
       for (size_t k = 0; k < range.size(); ++k) {
         if (k >= shape.GetDimNum()) {
           break;
         }
         // range[k].second can be -1
         if (shape.GetDim(k) < range[k].first || (range[k].second >= 0 && shape.GetDim(k) > range[k].second)) {
-          GELOGE(PARAM_INVALID, "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld]",
-                 input_index, k, shape.GetDim(k), range[k].first, range[k].second);
+          GELOGE(PARAM_INVALID,
+                 "[Check][Range]Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld], model_id = %u.",
+                 input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_);
+          REPORT_INNER_ERROR("E19999",
+                             "Dim out of range, shape idx = %zu, dim idx = %zu, dim = %ld, range = [%ld, %ld], model_id = %u.",
+                             input_index, k, shape.GetDim(k), range[k].first, range[k].second, model_id_);
           return PARAM_INVALID;
         }
       }
@@ -273,9 +300,8 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData &current_data, Hy
       args.input_desc[input_index] = tensor_desc;
       GELOGD("Update shape of input[%zu] to [%s]", input_index, tensor_desc->MutableShape().ToString().c_str());
       GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, tensor_size),
-                              "Failed to calc tensor size, index = %zu, shape = [%s]",
-                              input_index,
-                              tensor_desc->GetShape().ToString().c_str());
+                              "[Invoke][GetTensorMemorySizeInBytes]Failed to calc tensor size, index = %zu, shape = [%s], model_id = %u.",
+                              input_index, tensor_desc->GetShape().ToString().c_str(), model_id_);
       GELOGD("Input tensor[%zu] size = %zu", input_index, tensor_size);
     }
@@ -291,12 +317,16 @@ Status HybridModelAsyncExecutor::PrepareInputs(const InputData &current_data, Hy
     GELOGD("To copy input data for input[%zu]", input_index);
     const DataBuffer &data_buf = blobs[input_index];
     auto mem_size = static_cast<uint64_t>(tensor_size);
-    GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length,
-                           PARAM_INVALID,
-                           "input data size(%lu) does not match model required size(%lu), ret failed.",
-                           data_buf.length,
-                           mem_size);
+    if (mem_size < data_buf.length) {
+      REPORT_INNER_ERROR("E19999",
+                         "input data size(%lu) does not match model required size(%lu) when %s, ret failed, model_id = %u.",
+                         data_buf.length, mem_size, __FUNCTION__, model_id_);
+      GELOGE(PARAM_INVALID,
+             "[Check][Size]input data size(%lu) does not match model required size(%lu), ret failed, model_id = %u.",
+             data_buf.length, mem_size, model_id_);
+      return PARAM_INVALID;
+    }
     if (data_buf.length > 0) {
       GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%zu] memaddr[%p] mem_size[%zu] datasize[%lu]",
              model_->root_runtime_param_.graph_id,
@@ -351,7 +381,7 @@ Status HybridModelAsyncExecutor::OnComputeDone(uint32_t data_index, uint32_t res
   GELOGD("OnComputeDone. model id = %u, data index = %u, execution ret = %u", model_id_, data_index, result_code);
   if (listener_ != nullptr) {
     GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_index, result_code, outputs),
-                  "OnComputeDone failed");
+                  "[Invoke][OnComputeDone] failed, model_id = %u.", model_id_);
   }
   return result_code;
@@ -365,9 +395,11 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a
   std::vector<TensorValue> &output_tensors = args.outputs;
   if (output_tensor_desc_list.size() != output_tensors.size()) {
     GELOGE(INTERNAL_ERROR,
-           "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu",
-           output_tensor_desc_list.size(),
-           output_tensors.size());
+           "[Check][Size]Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu, model_id = %u.",
+           output_tensor_desc_list.size(), output_tensors.size(), model_id_);
+    REPORT_INNER_ERROR("E19999", "Output sizes mismatch. From op_desc = %zu, and from output tensors = %zu, "
+                       "when HybridModelAsyncExecutor %s, model_id = %u.",
+                       output_tensor_desc_list.size(), output_tensors.size(), __FUNCTION__, model_id_);
     return INTERNAL_ERROR;
   }
@@ -399,8 +431,12 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a
     GE_CHECK_LE(output_size, UINT32_MAX);
     if (output_tensor.GetSize() < static_cast<size_t>(output_size)) {
       GELOGE(INTERNAL_ERROR,
-             "output[%zu] tensor size(%zu) is not enough for output shape [%s]",
-             i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str());
+             "[Check][Size]output[%zu] tensor size(%zu) is not enough for output shape [%s], model_id = %u.",
+             i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_);
+      REPORT_INNER_ERROR("E19999",
+                         "output[%zu] tensor size(%zu) is not enough for output shape [%s], model_id = %u,"
+                         " when HybridModelAsyncExecutor %s.",
+                         i, output_tensor.GetSize(), tensor_desc->GetShape().ToString().c_str(), model_id_, __FUNCTION__);
       return INTERNAL_ERROR;
     }
@@ -456,7 +492,7 @@ Status HybridModelAsyncExecutor::Execute(const std::vector<DataBuffer> &inputs,
     args.input_desc.emplace_back(tensor_desc_ptr);
   }
-  GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model.");
+  GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_);
   for (const auto &output_tensor_desc : args.output_desc) {
     output_desc.emplace_back(*output_tensor_desc);
   }
@@ -477,13 +513,15 @@ Status HybridModelAsyncExecutor::Execute(const vector<GeTensor> &inputs, vector<
   }
   HybridModelExecutor::ExecuteArgs args;
-  GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "Failed to copy input data to model");
+  GE_CHK_STATUS_RET(PrepareInputs(input_data, args),
+                    "[Invoke][PrepareInputs]Failed to copy input data to model, model_id = %u", model_id_);
   GELOGD("Done copying input data successfully.");
-  GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model.");
+  GE_CHK_STATUS_RET(executor_->Execute(args), "[Invoke][Execute] Failed, model_id = %u.", model_id_);
   std::vector<ge::OutputTensorInfo> output_tensor_info_list;
   OutputData output_data;
-  GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list), "Failed to copy outputs.");
+  GE_CHK_STATUS_RET(CopyOutputs(args, &output_data, output_tensor_info_list),
+                    "[Invoke][CopyOutputs]Failed to copy outputs, model_id = %u.", model_id_);
   GELOGD("Done copying output data successfully. output count = %zu", output_tensor_info_list.size());
   int out_index = 0;
@@ -534,7 +572,8 @@ Status HybridModelAsyncExecutor::DumpOpDebug() {
       loop_cond = const_cast<void *>(varible_loop_cond->GetData());
     }
    data_dumper_.SetLoopAddr(global_step, loop_per_iter, loop_cond);
-    GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "LoadDumpInfo failed in hybrid engine");
+    GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(),
+                      "[Invoke][LoadDumpInfo] failed in hybrid engine, model_id = %u.", model_id_);
    GELOGD("Dump op debug SUCCESS in hybrid engine");
   }
   return SUCCESS;

@@ -72,7 +72,7 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) {
   if (ret == END_OF_SEQUENCE) {
     args.is_eos = true;
   } else {
-    GE_CHK_STATUS_RET(ret, "Failed to execute model");
+    GE_CHK_STATUS_RET(ret, "[Invoke][ExecuteGraphInternal] Failed, ret:%d.", ret);
   }
   return SUCCESS;
 }

@@ -59,23 +59,27 @@ Status StageExecutor::Start(const std::vector<TensorValue> &inputs, const std::v
     task_queue_.Pop(task_info);
     GELOGD("[Executor: %d] Got task, stage = %d, iteration = %ld", id_, task_info.stage, task_info.iteration);
     if (task_info.iteration >= pipe_config_->iteration_end) {
-      GELOGE(INTERNAL_ERROR, "[Executor: %d] Unexpected iteration: %d", id_, task_info.iteration);
+      GELOGE(INTERNAL_ERROR, "[Check][Range][Executor: %d] Unexpected iteration: %ld.",
+             id_, task_info.iteration);
+      REPORT_INNER_ERROR("E19999", "[Executor: %d] Unexpected iteration: %ld when StageExecutor %s.",
+                         id_, task_info.iteration, __FUNCTION__);
       return INTERNAL_ERROR;
     }
     if (task_info.event != nullptr) {
       GELOGD("[%d] Add StreamWaitEvent", id_);
       GE_CHK_RT_RET(rtStreamWaitEvent(stream_, task_info.event));
-      RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] End", task_info.iteration - 1,
+      RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] End", task_info.iteration - 1,
                                    task_info.stage);
     }
-    RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] Start", task_info.iteration,
+    RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] Start", task_info.iteration,
                                  task_info.stage);
     if (task_info.stage == 0) {
       GELOGD("[Executor: %d] To ResetExecutionContext", id_);
-      GE_CHK_STATUS_RET(ResetExecutionContext(context_), "[Executor: %d] Failed to reset context", id_);
+      GE_CHK_STATUS_RET(ResetExecutionContext(context_),
+                        "[Invoke][ResetExecutionContext][Executor: %d] Failed to reset context", id_);
      context_.iteration = task_info.iteration;
      GE_CHK_STATUS_RET_NOLOG(SetInputs(inputs, input_desc));
    }
@@ -92,19 +96,22 @@ Status StageExecutor::Start(const std::vector<TensorValue> &inputs, const std::v
     auto sync_result = Synchronize();
     if (sync_result != SUCCESS) {
-      GELOGE(sync_result, "[Executor: %d] Failed to sync result. iteration = %d", id_, task_info.iteration);
+      GELOGE(sync_result,
+             "[Invoke][Synchronize][Executor: %d] Failed to sync result:%d. iteration = %ld",
+             id_, sync_result, task_info.iteration);
+      REPORT_CALL_ERROR("E19999", "[Executor: %d] Failed to sync result:%d when StageExecutor %s. iteration = %ld",
+                        id_, sync_result, __FUNCTION__, task_info.iteration);
       context_.profiler->Dump(std::cout);
       context_.callback_manager->Destroy();
       RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id));
       return sync_result;
     }
-    RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] [Stage = %d] End", task_info.iteration, task_info.stage);
+    RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] [Stage = %d] End", task_info.iteration, task_info.stage);
     // if not end stage
     if (task_info.stage >= pipe_config_->num_stages - 1) {
-      RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %d] Schedule End", task_info.iteration);
+      RECORD_MODEL_EXECUTION_EVENT(&context_, "[iteration = %ld] Schedule End", task_info.iteration);
       GELOGD("[Executor: %d] End of iteration [%ld]", id_, task_info.iteration);
       context_.callback_manager->Destroy();
       RuntimeInferenceContext::DestroyContext(std::to_string(context_.context_id));
@@ -242,7 +249,9 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar
     GELOGD("Start to sync result of executor[%zu]", i);
     auto ret = futures[i].get();
     if (ret != SUCCESS) {
-      GELOGE(ret, "[Executor: %zu] Failed to schedule tasks.", i);
+      GELOGE(ret, "[Check][Result][Executor: %zu] Failed to schedule tasks.", i);
+      REPORT_INNER_ERROR("E19999", "[Executor: %zu] Failed to schedule tasks when HybridModelPipelineExecutor %s.",
+                         i, __FUNCTION__);
       has_error = true;
       continue;
     }
@@ -250,7 +259,9 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar
     ret = stage_executors_[i]->Synchronize();
     if (ret != SUCCESS) {
-      GELOGE(ret, "[Executor: %zu] Failed to synchronize result.", i);
+      GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu].", i);
+      REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result when HybridModelPipelineExecutor %s.",
+                        i, __FUNCTION__);
       has_error = true;
       continue;
     }
@@ -266,13 +277,14 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs &ar
   iteration_ = config_.iteration_end;
   if (has_error) {
-    GELOGE(FAILED, "Error occurred while execution");
+    GELOGE(FAILED, "[Check][Error]Error occurred while execution.");
+    REPORT_INNER_ERROR("E19999", "Error occurred while execution when HybridModelPipelineExecutor %s.", __FUNCTION__);
     return FAILED;
   }
   auto last_iter_executor_idx = loop_count % stage_executors_.size();
   GE_CHK_STATUS_RET(stage_executors_[last_iter_executor_idx]->GetOutputs(args.outputs, args.output_desc),
-                    "Failed to get output from executor[%zu]", last_iter_executor_idx);
+                    "[Get][Outputs]Failed from executor[%zu]", last_iter_executor_idx);
   return SUCCESS;
 }

@@ -40,7 +40,8 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) {
   char buf[kEventDescMax];
   if (vsnprintf_s(buf, kEventDescMax, kEventDescMax - 1, fmt, args) == -1) {
-    GELOGE(FAILED, "Format %s failed.", fmt);
+    GELOGE(FAILED, "[Parse][Param:fmt]Format %s failed.", fmt);
+    REPORT_CALL_ERROR("E19999", "Parse Format %s failed when HybridProfiler %s.", fmt, __FUNCTION__);
     va_end(args);
     return;
   }
@@ -48,7 +49,10 @@ void HybridProfiler::RecordEvent(EventType event_type, const char *fmt, ...) {
   va_end(args);
   auto index = counter_++;
   if (index >= static_cast<int>(events_.size())) {
-    GELOGE(INTERNAL_ERROR, "index out of range. index = %d, max event size = %zu", index, events_.size());
+    GELOGE(INTERNAL_ERROR,
+           "[Check][Range]index out of range. index = %d, max event size = %zu", index, events_.size());
+    REPORT_INNER_ERROR("E19999", "index out of range when HybridProfiler %s. index = %d, max event size = %zu",
+                       __FUNCTION__, index, events_.size());
     return;
   }
   auto &evt = events_[index];

@@ -28,7 +28,8 @@ bool NodeDoneManager::Cond::Await() {
   if (!cv_.wait_for(lk,
                     std::chrono::seconds(kDefaultWaitTimeoutInSec),
                     [&]() { return is_released_ || is_cancelled_; })) {
-    GELOGE(INTERNAL_ERROR, "Wait timed out.");
+    GELOGE(INTERNAL_ERROR, "[Invoke][wait_for]Wait timed out.");
+    REPORT_INNER_ERROR("E19999", "wait timed out[%d] when %s.", kDefaultWaitTimeoutInSec, __FUNCTION__);
     return false;
   }

@@ -67,7 +67,10 @@ Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target
     Format format = input_desc.GetFormat();
     DataType data_type = input_desc.GetDataType();
     if (TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size) != GRAPH_SUCCESS) {
-      GELOGE(FAILED, "[%s] Calculate tensor memory size failed.", node_item.NodeName().c_str());
+      GELOGE(FAILED, "[Invoke][CalcTensorMemSize] failed for [%s].",
+             node_item.NodeName().c_str());
+      REPORT_CALL_ERROR("E19999", "CalcTensorMemSize failed for [%s] when ShapeInferenceState %s.",
+                        node_item.NodeName().c_str(), __FUNCTION__);
       return FAILED;
     }
   }
@@ -121,13 +124,19 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex
       }
       if (context.GetStatus() != SUCCESS) {
-        GELOGE(FAILED, "[%s] Await pending shape cancelled", node_item.NodeName().c_str());
+        GELOGE(FAILED, "[Check][Status][%s] Await pending shape cancelled.",
+               node_item.NodeName().c_str());
+        REPORT_CALL_ERROR("E19999", "[%s] Await pending shape cancelled when %s.",
+                          node_item.NodeName().c_str(), __FUNCTION__);
         break;
       }
     }
     if (!wait_success) {
-      GELOGE(FAILED, "[%s] Wait for shape timeout.", node_item.NodeName().c_str());
+      GELOGE(FAILED, "[Check][Status][%s] Wait for shape timeout:%d.",
+             node_item.NodeName().c_str(), kWaitInternal);
+      REPORT_CALL_ERROR("E19999", "[%s] Wait for shape timeout:%d when %s.",
+                        node_item.NodeName().c_str(), kWaitInternal, __FUNCTION__);
       return FAILED;
     }
   }
@@ -232,7 +241,7 @@ Status NodeState::WaitForPrepareDone() {
   if (prepare_future_.valid()) {
     GELOGD("[%s] Start to wait for prepare future.", GetName().c_str());
     GE_CHK_STATUS_RET(prepare_future_.get(),
-                      "[%s] PreRun failed.", GetName().c_str());
+                      "[Check][Status][%s] PreRun failed.", GetName().c_str());
   }
   return SUCCESS;

@@ -27,7 +27,8 @@ Status CallbackManager::RegisterCallback(rtStream_t stream, rtCallback_t callbac
   GE_CHK_RT_RET(rtEventCreate(&event));
   auto rt_ret = rtEventRecord(event, stream);
   if (rt_ret != RT_ERROR_NONE) {
-    GELOGE(RT_FAILED, "Failed to invoke rtEventRecord, error code = %d", rt_ret);
+    GELOGE(RT_FAILED, "[Invoke][rtEventRecord] failed, error code = %d", rt_ret);
+    REPORT_CALL_ERROR("E19999", "Invoke rtEventRecord failed when %s, error code = %d", __FUNCTION__, rt_ret);
     (void) rtEventDestroy(event);
     return RT_FAILED;
   }
@@ -50,7 +51,8 @@ Status CallbackManager::Init() {
     return CallbackProcess(context);
   }, ctx);
   if (!ret_future_.valid()) {
-    GELOGE(INTERNAL_ERROR, "Failed to init callback manager.");
+    GELOGE(INTERNAL_ERROR, "[Check][ShareState]Failed to init callback manager.");
+    REPORT_INNER_ERROR("E19999", "Failed to init callback manager.");
     return INTERNAL_ERROR;
   }
@@ -73,7 +75,9 @@ Status CallbackManager::CallbackProcess(rtContext_t context) {
     auto rt_err = rtEventSynchronize(event);
     if (rt_err != RT_ERROR_NONE) {
-      GELOGE(RT_FAILED, "rtEventSynchronize failed. ret = %d", rt_err);
+      GELOGE(RT_FAILED, "[Invoke][rtEventSynchronize] failed. ret = %d", rt_err);
+      REPORT_CALL_ERROR("E19999",
+                        "Invoke rtEventSynchronize failed when CallbackManager %s, ret = %d.", __FUNCTION__, rt_err);
       GE_CHK_RT(rtEventDestroy(event));
       return RT_FAILED;
     }

@@ -50,9 +50,11 @@ NodeStatePtr SubgraphContext::GetOrCreateNodeState(const NodeItem *node_item) {
 Status SubgraphContext::SetInput(int index, const TensorValue &tensor) {
   if (static_cast<size_t>(index) >= all_inputs_.size()) {
     GELOGE(INTERNAL_ERROR,
-           "output index output range. all input num = %zu, input index = %d",
-           all_inputs_.size(),
-           index);
+           "[Check][Param:index]input index out of range. all input num = %zu, input index = %d",
+           all_inputs_.size(), index);
+    REPORT_INNER_ERROR("E19999",
+                       "input param index out of range when SubgraphContext %s, all input num = %zu, input index = %d.",
+                       __FUNCTION__, all_inputs_.size(), index);
     return INTERNAL_ERROR;
   }
   all_inputs_[index] = tensor;
@@ -68,10 +70,11 @@ Status SubgraphContext::SetOutput(const NodeItem &node_item, int output_index, c
   auto index = node_item.output_start + output_index;
   if ((output_index >= node_item.num_outputs) || (static_cast<size_t>(index) >= all_outputs_.size())) {
     GELOGE(INTERNAL_ERROR,
-           "output index output range. all output num = %zu, node_item = %s, output index = %d",
-           all_outputs_.size(),
-           node_item.DebugString().c_str(),
-           output_index);
+           "[Check][Param:output_index]output index out of range. all output num = %zu, node_item = %s,"
+           " output index = %d.", all_outputs_.size(), node_item.DebugString().c_str(), output_index);
+    REPORT_INNER_ERROR("E19999", "output index out of range when SubgraphContext %s. "
+                       "all output num = %zu, node_item = %s, output index = %d.",
+                       __FUNCTION__, all_outputs_.size(), node_item.DebugString().c_str(), output_index);
     return INTERNAL_ERROR;
   }
@@ -126,7 +129,10 @@ Status SubgraphContext::Await(const NodePtr &node) {
 void SubgraphContext::OnError(Status error) {
   if (error != END_OF_SEQUENCE) {
-    GELOGE(error, "[%s] Error occurred while executing graph.", graph_item_->GetName().c_str());
+    GELOGE(error, "[Check][Param:error][%s] Error:%d occurred while executing graph.",
+           graph_item_->GetName().c_str(), error);
+    REPORT_INNER_ERROR("E19999", "[%s] Error:%d occurred while executing graph when SubgraphContext %s.",
+                       graph_item_->GetName().c_str(), error, __FUNCTION__);
   }
   node_done_manager_.Destroy();
 }

@@ -44,7 +44,8 @@ Status SubgraphExecutor::Init(const std::vector<TensorValue> &inputs,
                               const std::vector<ConstGeTensorDescPtr> &input_desc) {
   subgraph_context_.reset(new(std::nothrow)SubgraphContext(graph_item_, context_));
   GE_CHECK_NOTNULL(subgraph_context_);
-  GE_CHK_STATUS_RET(subgraph_context_->Init(), "[%s] Failed to init subgraph context.", graph_item_->GetName().c_str());
+  GE_CHK_STATUS_RET(subgraph_context_->Init(),
+                    "[Init][SubgraphContext][%s] Failed to init subgraph context.", graph_item_->GetName().c_str());
   shape_inference_engine_.reset(new(std::nothrow) ShapeInferenceEngine(context_, subgraph_context_.get()));
   GE_CHECK_NOTNULL(shape_inference_engine_);
@@ -55,8 +56,8 @@ Status SubgraphExecutor::Init(const std::vector<TensorValue> &inputs,
                       graph_item_->GetName().c_str());
   } else {
     GE_CHK_STATUS_RET(InitInputsForKnownShape(inputs),
-                      "[%s] Failed to init subgraph executor for known shape subgraph.",
+                      "[Invoke][InitInputsForKnownShape][%s] Failed to init subgraph executor for known shape subgraph.",
                       graph_item_->GetName().c_str());
   }
   return SUCCESS;
@@ -67,8 +68,13 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vector<TensorValue
   // Number of inputs of parent node should be greater or equal than that of subgraph
   auto input_nodes = graph_item_->GetInputNodes();
   if (inputs.size() < input_nodes.size()) {
-    GELOGE(INTERNAL_ERROR, "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.",
-           graph_item_->GetName().c_str(), inputs.size(), input_nodes.size());
+    GELOGE(INTERNAL_ERROR,
+           "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs.",
+           graph_item_->GetName().c_str(), inputs.size(), input_nodes.size());
+    REPORT_INNER_ERROR("E19999",
+                       "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs [%zu] inputs,"
+                       " check invalid when SubgraphExecutor %s.",
+                       graph_item_->GetName().c_str(), inputs.size(), input_nodes.size(), __FUNCTION__);
     return INTERNAL_ERROR;
   }
@@ -87,9 +93,7 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vector<TensorValue
             input_tensor.DebugString().c_str());
     GE_CHK_STATUS_RET(subgraph_context_->SetInput(*input_node, kDataInputIndex, input_tensor),
-                      "[%s] Failed to set input tensor[%zu]",
-                      graph_item_->GetName().c_str(),
-                      i);
+                      "[Invoke][SetInput] failed for graph_item[%s] input tensor[%zu]", graph_item_->GetName().c_str(), i);
     if (force_infer_shape_ || input_node->is_dynamic) {
       GELOGD("[%s] Start to update input[%zu] for subgraph data node.", graph_item_->GetName().c_str(), i);
@@ -112,11 +116,12 @@ Status SubgraphExecutor::InitInputsForKnownShape(const std::vector<TensorValue>
     auto &parent_input_index = input_index_mapping[i];
     if (static_cast<size_t>(parent_input_index) >= inputs.size()) {
       GELOGE(INTERNAL_ERROR,
-             "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at lease [%d] inputs",
-             graph_item_->GetName().c_str(),
-             inputs.size(),
-             parent_input_index + 1);
+             "[Check][Size][%s] Number of inputs [%zu] is not sufficient for subgraph which needs at least [%d] inputs",
+             graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1);
+      REPORT_INNER_ERROR("E19999",
+                         "[%s] Number of inputs [%zu] is not sufficient for subgraph which needs at least [%d] inputs,"
+                         " check invalid when %s.",
+                         graph_item_->GetName().c_str(), inputs.size(), parent_input_index + 1, __FUNCTION__);
       return INTERNAL_ERROR;
     }
@@ -136,10 +141,10 @@ Status SubgraphExecutor::ExecuteAsync(const std::vector<TensorValue> &inputs,
                                       const std::vector<ConstGeTensorDescPtr> &input_desc,
                                       const std::vector<TensorValue> &outputs) {
   GELOGD("[%s] is dynamic = %s", graph_item_->GetName().c_str(), graph_item_->IsDynamic() ? "true" : "false");
-  GE_CHK_STATUS_RET(Init(inputs, input_desc), "[%s] Failed to init executor.", graph_item_->GetName().c_str());
+  GE_CHK_STATUS_RET(Init(inputs, input_desc), "[Invoke][Init]failed for [%s].", graph_item_->GetName().c_str());
   if (!outputs.empty()) {
     GE_CHK_STATUS_RET(EnableOutputZeroCopy(outputs),
-                      "Failed to enable output zero copy by user provided outputs.");
+                      "[Invoke][EnableOutputZeroCopy] Failed by user provided outputs.");
   }
   if (!graph_item_->IsDynamic()) {
     return ExecuteAsyncForKnownShape(inputs);
@@ -194,12 +199,11 @@ Status SubgraphExecutor::ExecuteAsync(TaskContext &task_context) {
   }
   GE_CHK_STATUS_RET(ExecuteAsync(inputs, input_desc),
-                    "[%s] Failed to execute subgraph.",
-                    graph_item_->GetName().c_str());
+                    "[Invoke][ExecuteAsync] failed for [%s].", graph_item_->GetName().c_str());
   GE_CHK_STATUS_RET(SetOutputsToParentNode(task_context),
-                    "[%s] Failed to set output shapes to parent node.",
+                    "[Invoke][SetOutputsToParentNode][%s] Failed to set output shapes to parent node.",
                     graph_item_->GetName().c_str());
   return SUCCESS;
 }
@@ -239,7 +243,7 @@ Status SubgraphExecutor::PrepareNodes(int group) {
       if (node_item.kernel_task == nullptr) {
        GELOGW("[%s] Node of static shape got no task.", node_item.NodeName().c_str());
        GE_CHK_STATUS_RET(TaskCompileEngine::Compile(*p_node_state, context_),
-                          "[%s] Failed to create task.", p_node_state->GetName().c_str());
+                          "[Invoke][Compile] failed for [%s].", p_node_state->GetName().c_str());
       } else {
        node_state->SetKernelTask(node_item.kernel_task);
       }
@@ -248,7 +252,9 @@ Status SubgraphExecutor::PrepareNodes(int group) {
       GE_CHECK_NOTNULL(unique_task_context);
       const auto &task = node_state->GetKernelTask();
       if (task == nullptr) {
-        GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state->GetName().c_str());
+        GELOGE(INTERNAL_ERROR, "[Get][KernelTask] failed for [%s], NodeTask is null.", node_state->GetName().c_str());
+        REPORT_CALL_ERROR("E19999", "invoke GetKernelTask failed for %s when %s, NodeTask is null.",
+                          node_state->GetName().c_str(), __FUNCTION__);
        return INTERNAL_ERROR;
       }
       auto shared_task_context = std::shared_ptr<TaskContext>(unique_task_context.release());
@@ -261,8 +267,10 @@ Status SubgraphExecutor::PrepareNodes(int group) {
        GELOGD("Got end of sequence");
        return SUCCESS;
       }
-      GELOGE(INTERNAL_ERROR, "[%s] Error occurs while launching tasks. quit from preparing nodes.",
-             graph_item_->GetName().c_str());
+      GELOGE(INTERNAL_ERROR, "[Check][State][%s] Error occurs while launching tasks. quit from preparing nodes.",
+             graph_item_->GetName().c_str());
+      REPORT_INNER_ERROR("E19999", "[%s] Error occurs while launching tasks. quit from preparing nodes when %s.",
+                         graph_item_->GetName().c_str(), __FUNCTION__);
       return INTERNAL_ERROR;
     }
@@ -275,9 +283,9 @@ Status SubgraphExecutor::PrepareNodes(int group) {
 Status SubgraphExecutor::InferShape(ShapeInferenceEngine *shape_inference_engine, NodeState &node_state) const {
   HYBRID_CHK_STATUS_RET(shape_inference_engine->InferShape(node_state),
-                        "[%s] Failed to InferShape.", node_state.GetName().c_str());
+                        "[Invoke][InferShape] failed for [%s].", node_state.GetName().c_str());
   HYBRID_CHK_STATUS_RET(shape_inference_engine->PropagateOutputShapes(node_state),
-                        "[%s] Failed to PropagateOutputShapes.", node_state.GetName().c_str());
+                        "[Invoke][PropagateOutputShapes] failed for [%s].", node_state.GetName().c_str());
   return SUCCESS;
 }
@@ -285,7 +293,7 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta
   auto &node_item = *node_state.GetNodeItem();
   if (node_item.kernel_task == nullptr) {
     GE_CHK_STATUS_RET(TaskCompileEngine::Compile(node_state, ctx),
-                      "Failed to create task for node[%s]", node_state.GetName().c_str());
+                      "[Invoke][Compile] Failed for node[%s]", node_state.GetName().c_str());
   } else {
     node_state.SetKernelTask(node_item.kernel_task);
   }
@@ -293,7 +301,9 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta
   GE_CHECK_NOTNULL(unique_task_context);
   const auto &task = node_state.GetKernelTask();
   if (task == nullptr) {
-    GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state.GetName().c_str());
+    GELOGE(INTERNAL_ERROR, "[Invoke][GetKernelTask] failed for [%s], NodeTask is null.", node_state.GetName().c_str());
+    REPORT_CALL_ERROR("E19999", "invoke GetKernelTask failed for %s, NodeTask is null when %s.",
+                      node_state.GetName().c_str(), __FUNCTION__);
     return INTERNAL_ERROR;
   }
   auto shared_task_context = std::shared_ptr<TaskContext>(unique_task_context.release());
@@ -309,7 +319,8 @@ Status SubgraphExecutor::LaunchTasks() {
   while (true) {
     NodeState *node_state = nullptr;
     if (!ready_queue_.Pop(node_state)) {
-      GELOGE(INTERNAL_ERROR, "[%s] Failed to pop node.", graph_item_->GetName().c_str());
+      GELOGE(INTERNAL_ERROR, "[Invoke][Pop] failed for [%s].", graph_item_->GetName().c_str());
+      REPORT_CALL_ERROR("E19999", "invoke pop failed for %s when %s", graph_item_->GetName().c_str(), __FUNCTION__);
      return INTERNAL_ERROR;
    }
@@ -334,8 +345,7 @@ Status SubgraphExecutor::LaunchTasks() {
     GE_CHECK_NOTNULL(shared_task_context);
     shared_task_context->SetForceInferShape(force_infer_shape_);
     HYBRID_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, shared_task_context, *context_),
-                          "[%s] Execute node failed.",
-                          node_state->GetName().c_str());
+                          "[Invoke][ExecuteAsync] failed for [%s].", node_state->GetName().c_str());
     GELOGD("[%s] Done executing node successfully.", node_state->GetName().c_str());
   }
 }
@@ -361,8 +371,7 @@ Status SubgraphExecutor::ScheduleTasks(int group) {
   }
   GE_CHK_STATUS_RET(prepare_future.get(),
-                    "[%s] Error occurred in task preparation.",
-                    graph_item_->GetName().c_str());
+                    "[Invoke][get] [%s] Error occurred in task preparation.", graph_item_->GetName().c_str());
   GELOGD("[%s] Done launching all tasks successfully.", graph_item_->GetName().c_str());
   return SUCCESS;
@@ -373,17 +382,18 @@ Status SubgraphExecutor::GetOutputs(vector<TensorValue> &outputs) {
 }
 Status SubgraphExecutor::GetOutputs(vector<TensorValue> &outputs, std::vector<ConstGeTensorDescPtr> &output_desc) {
-  GE_CHK_STATUS_RET(GetOutputs(outputs), "[%s] Failed to get output tensors.", graph_item_->GetName().c_str());
+  GE_CHK_STATUS_RET(GetOutputs(outputs), "[Invoke][GetOutputs] failed for [%s].", graph_item_->GetName().c_str());
   // copy output data from op to designated position
   GE_CHK_STATUS_RET(graph_item_->GetOutputDescList(output_desc),
-                    "[%s] Failed to get output tensor desc.",
-                    graph_item_->GetName().c_str());
+                    "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str());
   if (outputs.size() != output_desc.size()) {
     GELOGE(INTERNAL_ERROR,
-           "Number of output tensors(%zu) mismatch number of output tensor desc(%zu).",
-           outputs.size(),
-           output_desc.size());
+           "[Check][Size]Number of outputs(%zu) mismatch number of output_desc(%zu).",
+           outputs.size(), output_desc.size());
+    REPORT_INNER_ERROR("E19999", "Number of outputs(%zu) mismatch number of output_desc(%zu),"
+                       " check invalid when SubgraphExecutor %s.",
+                       outputs.size(), output_desc.size(), __FUNCTION__);
     return INTERNAL_ERROR;
   }
   return SUCCESS;
@@ -401,17 +411,17 @@ Status SubgraphExecutor::SetOutputsToParentNode(TaskContext &task_context) {
   std::vector<TensorValue> outputs;
   std::vector<ConstGeTensorDescPtr> output_desc_list;
   GE_CHK_STATUS_RET(subgraph_context_->GetOutputs(outputs),
-                    "[%s] Failed to get output tensors.",
-                    graph_item_->GetName().c_str());
+                    "[Invoke][GetOutputs][%s] Failed to get output tensors.", graph_item_->GetName().c_str());
   GE_CHK_STATUS_RET(graph_item_->GetOutputDescList(output_desc_list),
-                    "[%s] Failed to get output tensor desc.",
-                    graph_item_->GetName().c_str());
+                    "[Invoke][GetOutputDescList][%s] Failed to get output tensor desc.", graph_item_->GetName().c_str());
   if (outputs.size() != output_desc_list.size()) {
-    GELOGE(INTERNAL_ERROR, "[%s] num output tensors = %zu, num output tensor desc = %zu",
-           graph_item_->GetName().c_str(),
-           outputs.size(),
-           output_desc_list.size());
+    GELOGE(INTERNAL_ERROR, "[Check][Size][%s] num of output tensors = %zu, num of output tensor desc = %zu not equal",
+           graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size());
+    REPORT_INNER_ERROR("E19999",
+                       "%s num of output tensors = %zu, num of output tensor desc = %zu not equal,"
+                       " check invalid when SubgraphExecutor %s",
+                       graph_item_->GetName().c_str(), outputs.size(), output_desc_list.size(), __FUNCTION__);
     return INTERNAL_ERROR;
   }
@@ -460,9 +470,10 @@ Status SubgraphExecutor::EnableOutputZeroCopy(const vector<TensorValue> &outputs
   const auto &output_edges = graph_item_->GetOutputEdges();
   // Op -> NetOutput, set the output tensor of Op that output to the NetOutput node
   if (outputs.size() != output_edges.size()) {
-    GELOGE(PARAM_INVALID, "Output number mismatches, expect = %zu, but given = %zu",
-           output_edges.size(),
-           outputs.size());
+    GELOGE(PARAM_INVALID, "[Check][Size]Output number mismatches, expect = %zu, but given = %zu",
+           output_edges.size(), outputs.size());
+    REPORT_INNER_ERROR("E19999", "Output number mismatches, expect = %zu, but given = %zu when %s",
+                       output_edges.size(), outputs.size(), __FUNCTION__);
     return PARAM_INVALID;
   }
@@ -478,9 +489,7 @@ Status SubgraphExecutor::EnableOutputZeroCopy(const vector<TensorValue> &outputs
            output_tensor.DebugString().c_str());
     GE_CHK_STATUS_RET(subgraph_context_->SetOutput(*output_node, output_idx, output_tensor),
-                      "[%s] Failed to set input tensor[%zu]",
-                      graph_item_->GetName().c_str(),
-                      i);
+                      "[Invoke][SetOutput][%s] Failed to set output tensor[%zu]", graph_item_->GetName().c_str(), i);
   }
   GELOGD("Done enabling zero copy for outputs successfully.");

@@ -102,11 +102,13 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) {
       if (output_tensor->GetSize() < static_cast<size_t>(tensor_size)) {
        GELOGE(INTERNAL_ERROR,
-               "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s",
-               node_item.NodeName().c_str(),
-               output_idx,
-               tensor_size,
-               output_tensor->DebugString().c_str());
+               "[Check][Size][%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s.",
+               node_item.NodeName().c_str(), output_idx, tensor_size,
+               output_tensor->DebugString().c_str());
+        REPORT_INNER_ERROR("E19999",
+                           "[%s] Tensor size is not enough. output index = %d, required size = %ld, tensor = %s when %s.",
+                           node_item.NodeName().c_str(), output_idx, tensor_size,
+                           output_tensor->DebugString().c_str(), __FUNCTION__);
        return INTERNAL_ERROR;
       }
@@ -128,7 +130,7 @@ Status NodeDoneCallback::PrepareConstInputs(const NodeItem &node_item) {
       GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(context_id, &runtime_infer_ctx),
                               "Failed to get RuntimeInferenceContext, context_id = %s", context_id.c_str());
       GE_CHK_STATUS_RET(runtime_infer_ctx->SetTensor(node_item.node_id, output_idx, std::move(tensor)),
-                        "Failed to SetTensor, node = %s, output_index = %d", node_item.NodeName().c_str(), output_idx);
+                        "[Set][Tensor] Failed, node = %s, output_index = %d", node_item.NodeName().c_str(), output_idx);
       GELOGD("[%s] Output[%d] cached successfully in context: %s. node_id = %d, shape = [%s]",
              node_item.NodeName().c_str(),
             output_idx,
@@ -173,7 +175,8 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel *
 Status NodeDoneCallback::ProfilingReport() {
   auto node = context_->GetNodeItem().node;
   if (node == nullptr) {
-    GELOGE(PARAM_INVALID, "Get node is nullptr");
+    GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr.");
+    REPORT_INNER_ERROR("E19999", "Get node failed, when %s.", __FUNCTION__);
     return PARAM_INVALID;
   }
@@ -190,7 +193,8 @@ Status NodeDoneCallback::ProfilingReport() {
   std::vector<TaskDescInfo> task_desc_info;
   auto profiling_ret = GetTaskDescInfo(node, model, task_desc_info);
   if (profiling_ret != RT_ERROR_NONE) {
-    GELOGE(profiling_ret, "Get task info of node[%s] failed.", node->GetName().c_str());
+    GELOGE(profiling_ret, "[Get][TaskDescInfo] of node:%s failed.", node->GetName().c_str());
+    REPORT_CALL_ERROR("E19999", "GetTaskDescInfo of node:%s failed, when %s.", node->GetName().c_str(), __FUNCTION__);
     return profiling_ret;
   }
@@ -202,7 +206,8 @@ Status NodeDoneCallback::ProfilingReport() {
 Status NodeDoneCallback::DumpDynamicNode() {
   auto node = context_->GetNodeItem().node;
   if (node == nullptr) {
-    GELOGE(PARAM_INVALID, "Get node is nullptr");
+    GELOGE(PARAM_INVALID, "[Get][Node] value is nullptr.");
+    REPORT_INNER_ERROR("E19999", "get node is nullptr when %s.", __FUNCTION__);
     return PARAM_INVALID;
   }
   auto op_desc = node->GetOpDesc();
@@ -211,13 +216,13 @@ Status NodeDoneCallback::DumpDynamicNode() {
   vector<uintptr_t> output_addrs;
   for (int i = 0; i < context_->NumInputs(); i++) {
     auto tensor_value = context_->GetInput(i);
-    GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "Tensor value is nullptr");
+    GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Get][Tensor] value is nullptr.");
     uint64_t input_addr = reinterpret_cast<uintptr_t>(tensor_value->GetData());
     input_addrs.emplace_back(input_addr);
   }
   for (int j = 0; j < context_->NumOutputs(); j++) {
     auto tensor_value = context_->GetOutput(j);
-    GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "Tensor value is nullptr");
+    GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Get][Tensor] value is nullptr.");
     uint64_t output_addr = reinterpret_cast<uintptr_t>(tensor_value->GetData());
     output_addrs.emplace_back(output_addr);
   }
@@ -245,11 +250,12 @@ Status NodeDoneCallback::DumpDynamicNode() {
   void *global_step = context_->GetExecutionContext()->global_step;
   dump_op_.SetLoopAddr(global_step, loop_per_iter, loop_cond);
-  GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "Failed to launch dump op in hybird model");
+  GE_CHK_STATUS_RET(dump_op_.LaunchDumpOp(), "[Launch][DumpOp] failed in hybrid model.");
   auto rt_ret = rtStreamSynchronize(stream);
   if (rt_ret != RT_ERROR_NONE) {
-    GELOGE(rt_ret, "rtStreamSynchronize failed");
+    GELOGE(rt_ret, "[Call][rtStreamSynchronize] failed, ret = %d.", rt_ret);
+    REPORT_CALL_ERROR("E19999", "call rtStreamSynchronize failed when %s, ret = %d.", __FUNCTION__, rt_ret);
     return rt_ret;
   }
   return SUCCESS;
@@ -264,12 +270,12 @@ Status NodeDoneCallback::OnNodeDone() {
   const DumpProperties &dump_properties = context_->GetDumpProperties();
   if (dump_properties.IsDumpOpen() || context_->IsOverFlow()) {
     GELOGI("Start to dump dynamic shape op");
-    GE_CHK_STATUS_RET(DumpDynamicNode(), "Failed to dump dynamic node");
+    GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed.");
   }
   if (ProfilingManager::Instance().ProfilingModelExecuteOn()) {
-    GE_CHK_STATUS_RET(ProfilingReport(), "Report node[%s] to profiling failed.",
-                      node_item.NodeName().c_str());
+    GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed.",
+                      node_item.NodeName().c_str());
   }
   // release workspace
@@ -292,8 +298,7 @@ Status NodeDoneCallback::OnNodeDone() {
     }
     GE_CHK_STATUS_RET(context_->PropagateOutputs(),
-                      "[%s] Failed to propagate outputs failed",
-                      node_item.NodeName().c_str());
+                      "[Propagate][Outputs] of [%s] failed.", node_item.NodeName().c_str());
     RECORD_CALLBACK_EVENT(graph_context_, context_->GetNodeName(), "[PropagateOutputs] End");
   }
@@ -333,7 +338,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state,
                                        const std::function<void()> &callback) {
   const auto &task = node_state.GetKernelTask();
   if (task == nullptr) {
-    GELOGE(INTERNAL_ERROR, "[%s] NodeTask is null.", node_state.GetName().c_str());
+    GELOGE(INTERNAL_ERROR, "[Get][KernelTask] of [%s] is null.", node_state.GetName().c_str());
+    REPORT_INNER_ERROR("E19999", "GetKernelTask of %s is null when %s.", node_state.GetName().c_str(), __FUNCTION__);
     return INTERNAL_ERROR;
   }
@@ -348,8 +354,7 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state,
   GE_CHECK_NOTNULL(executor);
   RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] Start");
   GE_CHK_STATUS_RET(executor->PrepareTask(*task, task_context),
-                    "[%s] Failed to prepare task",
-                    node_state.GetName().c_str());
+                    "[Prepare][Task] for [%s] failed.", node_state.GetName().c_str());
   RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PrepareTask] End");
   GELOGD("[%s] Done task preparation successfully.", node_state.GetName().c_str());
@@ -360,7 +365,8 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state,
     }
   }
-  GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "Failed to validate input tensors.");
+  GE_CHK_STATUS_RET(ValidateInputTensors(node_state, task_context), "[Validate][InputTensors] for %s failed.",
+                    node_state.GetName().c_str());
   RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ValidateInputTensors] End");
   if (context.profiling_level > 0) {
@@ -414,11 +420,10 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const
                input_tensor->GetSize());
       } else {
-        GELOGE(INTERNAL_ERROR,
-               "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu",
-               task_context.GetNodeName(),
-               i,
-               expected_size,
-               input_tensor->GetSize());
+        GELOGE(INTERNAL_ERROR,
+               "[Check][Size] for [%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu.",
+               task_context.GetNodeName(), i, expected_size, input_tensor->GetSize());
+        REPORT_INNER_ERROR("E19999", "[%s] Input[%d]: tensor size mismatches. expected: %ld, but given %zu when %s.",
+                           task_context.GetNodeName(), i, expected_size, input_tensor->GetSize(), __FUNCTION__);
         return INTERNAL_ERROR;
       }
     }
@@ -432,8 +437,7 @@ Status ExecutionEngine::PropagateOutputs(const NodeItem &node_item,
                                          GraphExecutionContext &context) {
   if (node_item.shape_inference_type != DEPEND_COMPUTE) {
     GE_CHK_STATUS_RET(task_context.PropagateOutputs(),
-                      "[%s] Failed to propagate outputs.",
-                      node_item.NodeName().c_str());
+                      "[Propagate][Outputs] for [%s] failed.", node_item.NodeName().c_str());
     RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[PropagateOutputs] End");
     GELOGD("[%s] Done propagating outputs successfully.", node_item.NodeName().c_str());
   }

@@ -70,7 +70,7 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) {
   {
     RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start");
     GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true),
-                      "Invoke InferShapeAndType failed.");
+                      "[Invoke][InferShapeAndType] for %s failed.", node_item.NodeName().c_str());
     RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End");
   }
@@ -172,8 +172,7 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co
     GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndType(node));
     GELOGD("[%s] Done invoking InferShapeAndType", node->GetName().c_str());
     GE_CHK_STATUS_RET(UpdatePeerNodeShape(*node),
-                      "[%s] Failed to update shapes of peer node.",
-                      node->GetName().c_str());
+                      "[Update][PeerNodeShape] failed for [%s].", node->GetName().c_str());
   }
   for (auto &it : fused_subgraph.output_mapping) {
@@ -205,7 +204,9 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) {
     GE_CHECK_NOTNULL(peer_op_desc);
     auto peer_input_desc = peer_op_desc->MutableInputDesc(peer_anchor->GetIdx());
     if (peer_input_desc == nullptr) {
-      GELOGE(GRAPH_FAILED, "peer_input_desc is nullptr");
+      GELOGE(GRAPH_FAILED, "[Call][MutableInputDesc] for %s return nullptr.", peer_op_desc->GetName().c_str());
+      REPORT_CALL_ERROR("E19999", "%s call MutableInputDesc return nullptr when ShapeInferenceEngine %s.",
+                        peer_op_desc->GetName().c_str(), __FUNCTION__);
       continue;
     }
@@ -230,8 +231,11 @@ Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc,
   const auto &tensor_shape = tensor_desc.MutableShape();
   if (tensor_shape.IsUnknownShape()) {
    if (!fallback_with_range) {
-      GELOGE(INTERNAL_ERROR, "Output shape is still unknown after shape inference. shape = [%s]",
-             tensor_shape.ToString().c_str());
+      GELOGE(INTERNAL_ERROR,
+             "[Is][UnknownShape] Output shape is still unknown after shape inference. shape = [%s].",
+             tensor_shape.ToString().c_str());
+      REPORT_INNER_ERROR("E19999", "Output shape is still unknown after shape inference. "
+                         "shape = [%s] when ShapeInferenceEngine %s.", tensor_shape.ToString().c_str(), __FUNCTION__);
       return INTERNAL_ERROR;
     }
@@ -239,9 +243,10 @@
     std::vector<std::pair<int64_t, int64_t>> shape_range;
     GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range");
     if (shape_range.size() != shape.size()) {
-      GELOGE(INTERNAL_ERROR, "Number of shape ranges (%zu) mismatches that of dims (%zu)",
-             shape_range.size(),
-             shape.size());
+      GELOGE(INTERNAL_ERROR, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu).",
+             shape_range.size(), shape.size());
+      REPORT_INNER_ERROR("E19999", "Number of shape ranges (%zu) mismatches that of dims (%zu)"
+                         " when ShapeInferenceEngine %s.", shape_range.size(), shape.size(), __FUNCTION__);
       return INTERNAL_ERROR;
     }
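The range check above only makes sense if the fallback path replaces each still-unknown dim using its shape range. The sketch below shows that reading of CanonicalizeShape over plain vectors; taking the range's upper bound is an assumption drawn from the surrounding checks, not code quoted from the engine.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// dims uses -1 for an unknown extent, as GE shapes do.
bool FallbackWithRange(std::vector<int64_t> &dims,
                       const std::vector<std::pair<int64_t, int64_t>> &shape_range) {
  if (shape_range.size() != dims.size()) {
    std::fprintf(stderr, "[Check][Size] Number of shape ranges (%zu) mismatches that of dims (%zu).\n",
                 shape_range.size(), dims.size());
    return false;
  }
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] < 0) {
      dims[i] = shape_range[i].second;  // assumed: fall back to the range's upper bound
    }
  }
  return true;
}

int main() {
  std::vector<int64_t> dims = {-1, 224, 224, 3};
  bool ok = FallbackWithRange(dims, {{1, 16}, {224, 224}, {224, 224}, {3, 3}});
  std::printf("%s, dim0 = %ld\n", ok ? "ok" : "failed", static_cast<long>(dims[0]));
  return ok ? 0 : 1;
}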
@@ -265,7 +270,10 @@ Status ShapeInferenceEngine::CalcTensorSize(DataType data_type,
   GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str());
   uint32_t type_size;
   if (!TypeUtils::GetDataTypeLength(data_type, type_size)) {
-    GELOGE(INTERNAL_ERROR, "Failed to get data type size");
+    GELOGE(INTERNAL_ERROR, "[Get][DataTypeLength] failed for type:%s.",
+           TypeUtils::DataTypeToSerialString(data_type).c_str());
+    REPORT_CALL_ERROR("E19999", "GetDataTypeLength failed for type:%s when ShapeInferenceEngine %s.",
+                      TypeUtils::DataTypeToSerialString(data_type).c_str(), __FUNCTION__);
     return INTERNAL_ERROR;
   }
@@ -273,15 +281,13 @@
   for (const auto &dim : shape) {
     GE_CHECK_GE(dim, 0);
     GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim),
-                      "Shape size overflow, shape = [%s]",
-                      GeShape(shape).ToString().c_str());
+                      "[Check][Overflow] Shape size overflow, shape = [%s]", GeShape(shape).ToString().c_str());
     tensor_size *= dim;
   }
   GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1),
-                    "Tensor size is too large: %ld, shape = [%s]",
-                    tensor_size,
-                    GeShape(shape).ToString().c_str());
+                    "[Check][Overflow]Tensor size is too large: %ld, shape = [%s] Shape size will overflow when add align.",
+                    tensor_size, GeShape(shape).ToString().c_str());
   tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment;
   return SUCCESS;
 }
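The closing arithmetic rounds tensor_size up to the next multiple of kAlignment, and the CheckInt64AddOverflow guard exists because the `+ kAlignment - 1` step is the one place where the rounding itself can overflow. A standalone illustration follows, with kAlignment = 32 assumed purely for the example; the real constant is defined elsewhere in the engine.

#include <cstdint>
#include <cstdio>
#include <limits>

constexpr int64_t kAlignment = 32;  // assumed value for illustration only

// Returns false if a + b would overflow int64_t (b >= 0), mirroring what
// CheckInt64AddOverflow guards against.
bool CheckAddOverflow(int64_t a, int64_t b) {
  return a <= std::numeric_limits<int64_t>::max() - b;
}

int main() {
  int64_t tensor_size = 100;
  if (!CheckAddOverflow(tensor_size, kAlignment - 1)) {
    std::fprintf(stderr, "Tensor size is too large: %ld\n", static_cast<long>(tensor_size));
    return 1;
  }
  // (100 + 31) / 32 * 32 == 128: the smallest multiple of 32 holding 100 bytes.
  tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment;
  std::printf("aligned size = %ld\n", static_cast<long>(tensor_size));
  return 0;
}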
@@ -294,16 +300,24 @@ Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bo
     const auto &shape = tensor_desc->MutableShape();
     // modify on copy
     auto dims = shape.GetDims();
-    GE_CHK_STATUS_RET(CanonicalizeShape(*tensor_desc, dims, fallback_with_range),
-                      "[%s] Failed to canonicalize shape for output %zu",
-                      node_item.NodeName().c_str(),
-                      output_index);
+    auto status_result = CanonicalizeShape(*tensor_desc, dims, fallback_with_range);
+    if (status_result != SUCCESS) {
+      REPORT_CALL_ERROR("E19999",
+                        "Invoke CanonicalizeShape failed when ShapeInferenceEngine %s, node:%s, output:%zu.",
+                        __FUNCTION__, node_item.NodeName().c_str(), output_index);
+      GELOGE(ge::FAILED, "[Canonicalize][Shape] failed for [%s], output %zu.",
+             node_item.NodeName().c_str(), output_index);
+      return status_result;
+    }
     int64_t tensor_size;
-    GE_CHK_STATUS_RET(CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size),
-                      "[%s] Failed to calc tensor size for output %zu",
-                      node_item.NodeName().c_str(),
-                      output_index);
+    status_result = CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size);
+    if (status_result != SUCCESS) {
+      REPORT_CALL_ERROR("E19999", "Invoke CalcTensorSize failed when ShapeInferenceEngine %s, node:%s, output:%zu.",
+                        __FUNCTION__, node_item.NodeName().c_str(), output_index);
+      GELOGE(ge::FAILED, "[Calc][TensorSize] failed for [%s], output %zu.",
+             node_item.NodeName().c_str(), output_index);
+      return status_result;
+    }
     GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size);
     (void) TensorUtils::SetSize(*tensor_desc, tensor_size);
   }
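This is the one hunk that changes control flow rather than just messages: GE_CHK_STATUS_RET logs once and returns immediately, leaving no slot for the additional REPORT_CALL_ERROR, so the status is held in a local and checked explicitly. A schematic comparison follows; the macro expansion below is a simplified stand-in, not GE's real definition.

#include <cstdio>

// Schematic stand-in for GE_CHK_STATUS_RET: evaluate, log once, return early.
#define GE_CHK_STATUS_RET_SKETCH(expr, fmt, ...)                \
  do {                                                          \
    auto status_ = (expr);                                      \
    if (status_ != 0) {                                         \
      std::fprintf(stderr, "[ERROR] " fmt "\n", ##__VA_ARGS__); \
      return status_;                                           \
    }                                                           \
  } while (false)

int Canonicalize() { return 3; }  // hypothetical failing step

// Macro form: one log line, then an early return - no room for a second report.
int WithMacro() {
  GE_CHK_STATUS_RET_SKETCH(Canonicalize(), "[Canonicalize][Shape] failed.");
  return 0;
}

// Explicit form, as the commit rewrites it: keeping the status in a local lets
// both the error report and the device log fire before returning.
int WithExplicitCheck() {
  auto status_result = Canonicalize();
  if (status_result != 0) {
    std::fprintf(stderr, "[REPORT E19999] Invoke Canonicalize failed when %s.\n", __FUNCTION__);
    std::fprintf(stderr, "[ERROR] [Canonicalize][Shape] failed.\n");
    return status_result;
  }
  return 0;
}

int main() { return WithMacro() == WithExplicitCheck() ? 0 : 1; }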

@@ -32,7 +32,7 @@ Status TaskCompileEngine::Compile(NodeState &node_state, GraphExecutionContext *
   shared_ptr<NodeTask> kernel_task;
   auto ret = node_item.node_executor->CompileTask(*context->model, node_item.node, kernel_task);
   RECORD_COMPILE_EVENT(context, node_state.GetName().c_str(), "[Compile] End");
-  GE_CHK_STATUS_RET(ret, "Failed to create task for node: %s", node_item.NodeName().c_str());
+  GE_CHK_STATUS_RET(ret, "[Compile][Task] failed for node: %s.", node_item.NodeName().c_str());
   node_state.SetKernelTask(kernel_task);
   GELOGI("Compiling node %s successfully", node_state.GetName().c_str());
   return SUCCESS;

@@ -95,7 +95,8 @@ Status GraphItem::GroupNodes() {
     int group = node->group;
     if (group != last_group) {
      if (seen_groups.find(group) != seen_groups.end()) {
-        GELOGE(INTERNAL_ERROR, "Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group);
+        GELOGE(INTERNAL_ERROR,
+               "[Find][Group]Unordered node group found. node = %s, group = %d", node->NodeName().c_str(), group);
        return INTERNAL_ERROR;
      } else {
        last_group = group;
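The check in GroupNodes enforces that, after topological sorting, all nodes of a group occupy one contiguous run: meeting a group id that is already in seen_groups while not being the current run means the ordering interleaves groups. A minimal sketch of the same check over plain group ids; the seen_groups bookkeeping after the else branch is assumed, since the hunk cuts off there.

#include <cstdio>
#include <set>
#include <vector>

bool GroupsAreContiguous(const std::vector<int> &groups) {
  std::set<int> seen_groups;
  int last_group = -1;  // assumes real group ids are non-negative
  for (int group : groups) {
    if (group != last_group) {
      if (seen_groups.find(group) != seen_groups.end()) {
        std::fprintf(stderr, "[Find][Group]Unordered node group found. group = %d\n", group);
        return false;
      }
      last_group = group;
      seen_groups.insert(group);  // assumed: each new run registers its group id
    }
  }
  return true;
}

int main() {
  // {0,0,1,1} is one run per group; {0,1,0} revisits group 0 after it closed.
  return GroupsAreContiguous({0, 0, 1, 1}) && !GroupsAreContiguous({0, 1, 0}) ? 0 : 1;
}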

File diff suppressed because it is too large

@@ -1 +1 @@
-Subproject commit 620e9b9ac3210db3e4cf47babfb23d248bb9f17e
+Subproject commit 4ff5e3987f2e5d2980019defacaf0891861c84fc

@@ -1 +1 @@
-Subproject commit d744541c6ca7f6966c1befacc9f83f53b0829e0a
+Subproject commit 51fb6c4850906e8342598d47eccfca0b87ffea59