diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index 85a1bd18..3a0f7638 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -607,7 +607,7 @@ set(INFER_SRC_LIST if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES) ############ libge_runner.so ############ -add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS} $) +add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS}) target_compile_definitions(ge_runner PRIVATE PROTOBUF_INLINE_NOT_IN_HEADERS=0 @@ -648,11 +648,14 @@ target_include_directories(ge_runner PRIVATE ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain ) -target_link_libraries(ge_runner +target_link_libraries(ge_runner PRIVATE $ ge_memory adump_server static_mmpa + -Wl,--whole-archive + msprofiler_fwk + -Wl,--no-whole-archive -Wl,--no-as-needed graph ge_common @@ -712,7 +715,7 @@ target_include_directories(ge_compiler PRIVATE ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain ) -target_link_libraries(ge_compiler +target_link_libraries(ge_compiler PRIVATE $ ge_memory static_mmpa @@ -766,7 +769,14 @@ target_link_options(opensrc_ascendcl PRIVATE -Wl,--allow-multiple-definition -Wl,-z,muldefs -Wl,-Bsymbolic - -Wl,--exclude-libs,ALL + -Wl,--exclude-libs,libascend_protobuf.a + -Wl,--exclude-libs,libge_executor.a + -Wl,--exclude-libs,libge_common.a + -Wl,--exclude-libs,libgraph.a + -Wl,--exclude-libs,libmmpa.a + -Wl,--exclude-libs,libregister.a + -Wl,--exclude-libs,liberror_manager.a + -Wl,--exclude-libs,libadump_server.a ) target_link_libraries(opensrc_ascendcl PRIVATE -Wl,--whole-archive diff --git a/ge/common/dump/dump_op.cc b/ge/common/dump/dump_op.cc index e92ada05..0b9e9dcc 100644 --- a/ge/common/dump/dump_op.cc +++ b/ge/common/dump/dump_op.cc @@ -94,6 +94,9 @@ Status DumpOp::DumpOutput(aicpu::dump::Task &task) { for (auto dim : output_descs.at(i).GetShape().GetDims()) { output.mutable_shape()->add_dim(dim); } + for (auto dim : output_descs.at(i).GetOriginShape().GetDims()) { + output.mutable_origin_shape()->add_dim(dim); + } int64_t output_size = 0; if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) { GELOGE(PARAM_INVALID, "Get output size filed"); @@ -118,6 +121,9 @@ Status DumpOp::DumpInput(aicpu::dump::Task &task) { for (auto dim : input_descs.at(i).GetShape().GetDims()) { input.mutable_shape()->add_dim(dim); } + for (auto dim : input_descs.at(i).GetOriginShape().GetDims()) { + input.mutable_origin_shape()->add_dim(dim); + } int64_t input_size = 0; if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) { GELOGE(PARAM_INVALID, "Get output size filed"); @@ -214,8 +220,15 @@ Status DumpOp::LaunchDumpOp() { SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info); GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(), dump_path.c_str()); - + uint32_t task_id = 0; + uint32_t stream_id = 0; + rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("call rtGetTaskIdAndStreamID failed, ret = 0x%X", rt_ret); + } aicpu::dump::Task task; + task.set_task_id(task_id); + task.set_stream_id(stream_id); task.mutable_op()->set_op_name(op_desc_->GetName()); task.mutable_op()->set_op_type(op_desc_->GetType()); if (dump_properties_.GetDumpMode() == kDumpOutput) { diff --git a/ge/common/ge/tbe_plugin_manager.cc b/ge/common/ge/tbe_plugin_manager.cc index 44199c32..0cc7d553 100644 --- a/ge/common/ge/tbe_plugin_manager.cc +++ b/ge/common/ge/tbe_plugin_manager.cc @@ -181,12 +181,19 @@ void TBEPluginManager::GetCustomOpPath(std::string &customop_path) { void TBEPluginManager::LoadCustomOpLib() { LoadPluginSo(options_); + std::string fmk_type = std::to_string(domi::TENSORFLOW); + auto it = options_.find(ge::FRAMEWORK_TYPE); + if (it != options_.end()) { + fmk_type = it->second; + } std::vector registration_datas = domi::OpRegistry::Instance()->registrationDatas; GELOGI("The size of registration_datas is: %zu", registration_datas.size()); for (OpRegistrationData reg_data : registration_datas) { - GELOGD("Begin to register optype: %s, imply_type: %s", reg_data.GetOmOptype().c_str(), - TypeUtils::ImplyTypeToSerialString(reg_data.GetImplyType()).c_str()); - domi::OpRegistry::Instance()->Register(reg_data); + if (std::to_string(reg_data.GetFrameworkType()) == fmk_type) { + GELOGD("Begin to register optype: %s, imply_type: %s", reg_data.GetOmOptype().c_str(), + TypeUtils::ImplyTypeToSerialString(reg_data.GetImplyType()).c_str()); + (void)domi::OpRegistry::Instance()->Register(reg_data); + } } } diff --git a/ge/common/profiling/ge_profiling.cc b/ge/common/profiling/ge_profiling.cc index 640f77a1..43ed6434 100644 --- a/ge/common/profiling/ge_profiling.cc +++ b/ge/common/profiling/ge_profiling.cc @@ -112,7 +112,6 @@ ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) { if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) { GELOGW("Msprof ctrl callback is exist, just ignore it."); } else { - GELOGI("GE register Msprof ctrl callback."); ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func); } return ge::SUCCESS; @@ -124,7 +123,6 @@ ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { return ge::PARAM_INVALID; } // Pass MsprofSetDeviceCallback to runtime - GELOGI("GE pass setdevice callback to runtime."); ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast(func)); if (rt_ret != ge::SUCCESS) { GELOGE(rt_ret, "Pass MsprofSetDeviceCallback to runtime failed!"); @@ -158,7 +156,7 @@ ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t le if (type != kProfCommandhandleFinalize) { GE_CHECK_NOTNULL(data); } - ProfCommandHandleData *prof_config_param = (ProfCommandHandleData *)data; + ProfCommandHandleData *prof_config_param = reinterpret_cast(data); auto iter = kProfCommandTypeMap.find(type); if (iter == kProfCommandTypeMap.end()) { GELOGW("The prof comand type is invalid."); @@ -183,7 +181,8 @@ ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t le if (type != kProfCommandhandleFinalize) { command.module_index = prof_config_param->profSwitch; } - GELOGI("GE commandhandle execute, Command Type: %d, data type config: 0x%llx", type, command.module_index); + GELOGI("GE commandhandle execute, Command Type: %s, data type config: 0x%llx", iter->second.c_str(), + command.module_index); if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str()); } diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index 456cb0a4..994b3eac 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -38,10 +38,8 @@ const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe"; } // namespace namespace ge { -ProfilingManager::ProfilingManager() : is_load_profiling_(false), - is_execute_profiling_(false), - is_training_trace_(false), - subscribe_count_(0) { +ProfilingManager::ProfilingManager() + : is_load_profiling_(false), is_execute_profiling_(false), is_training_trace_(false), subscribe_count_(0) { prof_cb_.msprofCtrlCallback = nullptr; prof_cb_.msprofReporterCallback = nullptr; } @@ -102,8 +100,8 @@ ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOpt return INTERNAL_ERROR; } is_execute_profiling_ = true; - GELOGI("The profiling in options is %s, %s. origin option: %s", options.profiling_mode.c_str(), - prof_conf.options, options.profiling_options.c_str()); + GELOGI("The profiling in options is %s, %s. origin option: %s", options.profiling_mode.c_str(), prof_conf.options, + options.profiling_options.c_str()); } else { (void)mmGetEnv("PROFILING_MODE", env_profiling_mode, MMPA_MAX_PATH); (void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX); @@ -143,6 +141,9 @@ ge::Status ProfilingManager::ParseOptions(const std::string &options) { } try { Json prof_options = Json::parse(options); + if (options.find(kTrainingTrace) == std::string::npos) { + return ge::SUCCESS; + } const std::string training_trace = prof_options[kTrainingTrace]; if (training_trace.empty()) { GELOGI("Training trace will not take effect."); @@ -802,32 +803,46 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpP if (!fp_point_.empty() && !bp_point_.empty()) { fp_point = fp_point_; bp_point = bp_point_; - GELOGI("Bp Fp have been initialized in env or options. bp_point: %s, fp_point: %s", bp_point.c_str(), fp_point.c_str()); + GELOGI("Bp Fp have been initialized in env or options. bp_point: %s, fp_point: %s", bp_point.c_str(), + fp_point.c_str()); return; } // ProfApi mode and training trace is set - try { - char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 }; + // Parse options first + char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 }; + bool is_profiling_valid = false; + std::string profiling_options; + if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_OPTIONS, profiling_options) == SUCCESS && + !profiling_options.empty()) { + is_profiling_valid = true; + } else { INT32 ret = mmGetEnv("PROFILING_OPTIONS", env_profiling_options, MSPROF_OPTIONS_DEF_LEN_MAX); if (ret != EN_OK) { GELOGI("PROFILING_OPTIONS env is not exist."); return; } GELOGI("Parse env PROFILING_OPTIONS:%s.", env_profiling_options); - Json prof_options = Json::parse(env_profiling_options); + profiling_options = env_profiling_options; + is_profiling_valid = true; + } + if (is_profiling_valid) { + try { + Json prof_options = Json::parse(profiling_options); - fp_point_ = prof_options[kFpPoint]; - bp_point_ = prof_options[kBpPoint]; + fp_point_ = prof_options[kFpPoint]; + bp_point_ = prof_options[kBpPoint]; - fp_point = fp_point_; - bp_point = bp_point_; - if (!fp_point_.empty() && !bp_point_.empty()) { - GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); + fp_point = fp_point_; + bp_point = bp_point_; + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); + } + } catch (...) { + GELOGW("Json prof options is invalid."); + return; } - } catch (...) { - GELOGE(FAILED, "Json prof options is invalid."); - return; } + return; } diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index 5fa4fac4..22fa8f8c 100644 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -36,21 +36,21 @@ using Json = nlohmann::json; namespace { const std::string GE_PROFILING_MODULE = "Framework"; // DataTypeConfig MASK - #define PROF_ACL_API_MASK 0x0001 - #define PROF_TASK_TIME_MASK 0x0002 - #define PROF_AICORE_METRICS_MASK 0x0004 - #define PROF_AICPU_TRACE_MASK 0x0008 - #define PROF_MODEL_EXECUTE_MASK 0x0010 - #define PROF_RUNTIME_API_MASK 0x0020 - #define PROF_RUNTIME_TRACE_MASK 0x0040 - #define PROF_SCHEDULE_TIMELINE_MASK 0x0080 - #define PROF_SCHEDULE_TRACE_MASK 0x0100 - #define PROF_AIVECTORCORE_METRICS_MASK 0x0200 - #define PROF_SUBTASK_TIME_MASK 0x0400 - #define PROF_TRAINING_TRACE_MASK 0x0800 - #define PROF_HCCL_TRACE_MASK 0x1000 - #define PROF_DATA_PROCESS_MASK 0x2000 - #define PROF_MODEL_LOAD_MASK 0x8000000000000000 + const uint64_t PROF_ACL_API_MASK = 0x0001; + const uint64_t PROF_TASK_TIME_MASK = 0x0002; + const uint64_t PROF_AICORE_METRICS_MASK = 0x0004; + const uint64_t PROF_AICPU_TRACE_MASK = 0x0008; + const uint64_t PROF_MODEL_EXECUTE_MASK = 0x0010; + const uint64_t PROF_RUNTIME_API_MASK = 0x0020; + const uint64_t PROF_RUNTIME_TRACE_MASK = 0x0040; + const uint64_t PROF_SCHEDULE_TIMELINE_MASK = 0x0080; + const uint64_t PROF_SCHEDULE_TRACE_MASK = 0x0100; + const uint64_t PROF_AIVECTORCORE_METRICS_MASK = 0x0200; + const uint64_t PROF_SUBTASK_TIME_MASK = 0x0400; + const uint64_t PROF_TRAINING_TRACE_MASK = 0x0800; + const uint64_t PROF_HCCL_TRACE_MASK = 0x1000; + const uint64_t PROF_DATA_PROCESS_MASK = 0x2000; + const uint64_t PROF_MODEL_LOAD_MASK = 0x8000000000000000; } // namespace namespace ge { @@ -80,7 +80,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { bool ProfilingTrainingTraceOn() const { return is_training_trace_; } bool ProfilingModelLoadOn() const { return is_load_profiling_; } bool ProfilingModelExecuteOn() const; - bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // is_execute_profiling_ only used by ge option and env + // is_execute_profiling_ only used by ge option and env + bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } void ReportProfilingData(uint32_t model_id, const std::vector &task_desc_info, const std::vector &compute_graph_desc_info); void ProfilingTaskDescInfo(uint32_t model_id, const std::vector &task_desc_info, diff --git a/ge/common/proto/op_mapping_info.proto b/ge/common/proto/op_mapping_info.proto index e23b7ebe..7fb6f84b 100644 --- a/ge/common/proto/op_mapping_info.proto +++ b/ge/common/proto/op_mapping_info.proto @@ -15,6 +15,7 @@ message Output { int32 original_output_data_type = 7; int32 original_output_format = 8; uint64 size = 9; + Shape origin_shape = 10; } message Input { @@ -23,6 +24,7 @@ message Input { Shape shape = 3; uint64 address = 4; uint64 size = 5; + Shape origin_shape = 6; } enum BufferType { diff --git a/ge/executor/ge_executor.cc b/ge/executor/ge_executor.cc index 57ab7800..18d78696 100644 --- a/ge/executor/ge_executor.cc +++ b/ge/executor/ge_executor.cc @@ -209,19 +209,6 @@ bool IsDynmaicDimsSizeMatchModel(const vector cur_dynamic_dims, namespace ge { bool GeExecutor::isInit_ = false; -class ModelListenerAdapter : public ModelListener { - public: - domi::Status OnComputeDone(uint32_t model_id, uint32_t dataIndex, uint32_t resultCode, - std::vector &outputs) { - if (listener == nullptr) { - GELOGE(ge::FAILED, "listener is null."); - return FAILED; - } - return listener->OnComputeDone(model_id, dataIndex, resultCode, outputs); - } - - std::shared_ptr listener; -}; static void InitOpsProtoManger() { string opsproto_path; @@ -573,60 +560,6 @@ Status GeExecutor::SetDynamicAippData(uint32_t model_id, void *dynamic_input_add return SUCCESS; } -// Load model -Status GeExecutor::LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key, - int32_t priority, std::shared_ptr listener) { - GELOGI("load model offline begin."); - if (!isInit_) { - GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!"); - return ACL_ERROR_GE_EXEC_NOT_INIT; - } - - string filePath = RealPath(path.c_str()); - if (filePath.empty()) { - GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, - "File path is invalid. please check your text file '%s'.", path.c_str()); - return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; - } - - std::shared_ptr listener_adapter = MakeShared(); - if (listener_adapter == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelListenerAdapter make shared failed!"); - return ACL_ERROR_GE_MEMORY_ALLOCATION; - } - listener_adapter->listener = listener; - - Status ret = GraphLoader::LoadModelFromFile(path, key, priority, listener_adapter, model_id); - if (ret != SUCCESS) { - GELOGE(ret, "[GeExecutor] LoadModelFromFile failed"); - return ACL_ERROR_GE_LOAD_MODEL; - } - return SUCCESS; -} - -Status GeExecutor::LoadModel(uint32_t &model_id, const ModelData &model_data, - std::shared_ptr listener) { - GELOGI("Load model begin."); - if (!isInit_) { - GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!"); - return ACL_ERROR_GE_EXEC_NOT_INIT; - } - - std::shared_ptr listener_adapter = MakeShared(); - if (listener_adapter == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelListenerAdapter make shared failed!"); - return ACL_ERROR_GE_MEMORY_ALLOCATION; - } - listener_adapter->listener = listener; - - Status ret = GraphLoader::LoadModel(model_data, listener_adapter, model_id); - if (ret != SUCCESS) { - GELOGE(ret, "[GeExecutor] LoadModel failed."); - return ACL_ERROR_GE_LOAD_MODEL; - } - return ret; -} - Status GeExecutor::UnloadModel(uint32_t model_id) { GELOGD("unload model %u begin.", model_id); if (!isInit_) { @@ -659,21 +592,6 @@ Status GeExecutor::UnloadModel(uint32_t model_id) { return SUCCESS; } -Status GeExecutor::RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data) { - GELOGI("run model begin."); - if (!isInit_) { - GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!"); - return ACL_ERROR_GE_EXEC_NOT_INIT; - } - - InputData inputs; - GetDomiInputData(input_data, inputs); - OutputData outputs; - GetDomiOutputData(output_data, outputs); - - return GraphExecutor::DataInput(inputs, outputs); -} - // Get input and output descriptor Status GeExecutor::GetModelDescInfo(uint32_t model_id, std::vector &input_desc, std::vector &output_desc, bool new_model_desc) { diff --git a/ge/executor/proto/op_mapping_info.proto b/ge/executor/proto/op_mapping_info.proto index e23b7ebe..7fb6f84b 100644 --- a/ge/executor/proto/op_mapping_info.proto +++ b/ge/executor/proto/op_mapping_info.proto @@ -15,6 +15,7 @@ message Output { int32 original_output_data_type = 7; int32 original_output_format = 8; uint64 size = 9; + Shape origin_shape = 10; } message Input { @@ -23,6 +24,7 @@ message Input { Shape shape = 3; uint64 address = 4; uint64 size = 5; + Shape origin_shape = 6; } enum BufferType { diff --git a/ge/ge_local_engine/engine/host_cpu_engine.cc b/ge/ge_local_engine/engine/host_cpu_engine.cc index c836d4d6..e17f73de 100644 --- a/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -39,7 +39,7 @@ namespace { } \ ge_tensor = MakeShared(out_desc); \ GE_CHECK_NOTNULL(ge_tensor); \ - GELOGI("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE));\ + GELOGD("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE));\ if (ge_tensor->SetData(reinterpret_cast(buf.get()), data_num * sizeof(TYPE)) != GRAPH_SUCCESS) { \ GELOGE(MEMALLOC_FAILED, "Set data for output %zu of node %s failed.", i, op_desc->GetName().c_str()); \ return MEMALLOC_FAILED; \ @@ -50,8 +50,7 @@ namespace { } else { \ ge_tensor = outputs[i]; \ GE_CHECK_NOTNULL(ge_tensor); \ - GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i, \ - reinterpret_cast(ge_tensor->GetData().data()), ge_tensor->GetData().size()); \ + GELOGD("node:%s existed output %zu", op_desc->GetName().c_str(), i); \ } \ auto tensor = TensorAdapter::AsTensor(*ge_tensor); \ auto tensor_name = op_desc->GetOutputNameByIndex(i); \ diff --git a/ge/generator/ge_generator.cc b/ge/generator/ge_generator.cc index 7c083d2b..acb029e9 100644 --- a/ge/generator/ge_generator.cc +++ b/ge/generator/ge_generator.cc @@ -563,6 +563,19 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr GE_CHECK_NOTNULL(ge_root_model); GE_CHECK_NOTNULL(ge_root_model->GetRootGraph()); + ModelHelper model_helper; + string model_name = ""; + Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(), + model_name); + if (name_ret != SUCCESS) { + ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"}); + GELOGE(FAILED, "Get model_name failed. Param --output is invalid."); + return PARAM_INVALID; + } + map name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel(); + GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()]; + GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model cannot be null"); + ge_model->SetName(model_name); ret = impl_->SaveRootModel(file_name_prefix, ge_root_model, model); if (ret != SUCCESS) { GELOGE(ret, "Save model failed"); diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc index 16d5d38f..40b3b9dc 100644 --- a/ge/graph/build/memory/graph_mem_assigner.cc +++ b/ge/graph/build/memory/graph_mem_assigner.cc @@ -99,7 +99,7 @@ Status GraphMemoryAssigner::AssignMemory() { MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner->GetMemOffset()); memory_offset_.emplace(RT_MEMORY_HBM, memory_offset); - if (mem_assigner->GetP2PMemOffset() > 0) { + if (mem_assigner->GetP2PMemOffset() >= 0) { MemoryOffset p2p_memory_offset(RT_MEMORY_P2P_DDR, mem_assigner->GetP2PMemOffset()); memory_offset_.emplace(RT_MEMORY_P2P_DDR, p2p_memory_offset); } diff --git a/ge/graph/build/stream_graph_optimizer.cc b/ge/graph/build/stream_graph_optimizer.cc index 2933d413..5741e6b3 100644 --- a/ge/graph/build/stream_graph_optimizer.cc +++ b/ge/graph/build/stream_graph_optimizer.cc @@ -48,26 +48,41 @@ void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Grap } } -bool StreamGraphOptimizer::IsSameStreamId(const ComputeGraphPtr &comp_graph) { +bool StreamGraphOptimizer::IsSameStreamIdOrBatchLabel(const ComputeGraphPtr &comp_graph) { if (comp_graph == nullptr) { return false; } std::set stream_set; + std::set label_set; for (const ge::NodePtr &cur_node : comp_graph->GetDirectNode()) { GE_IF_BOOL_EXEC(cur_node->GetOpDesc() == nullptr, continue); int64_t stream_id = cur_node->GetOpDesc()->GetStreamId(); if (stream_id == kInvalidStream) { continue; } - GELOGD("Node %s in subgraph %s stream id is: %ld, node num: %zu", cur_node->GetName().c_str(), - comp_graph->GetName().c_str(), stream_id, comp_graph->GetDirectNodesSize()); stream_set.insert(stream_id); + + std::string batch_label; + if (AttrUtils::GetStr(cur_node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) { + label_set.insert(batch_label); + } else { + GELOGD("Node %s[%s] has no batch label, subgraph %s, stream id: %ld", cur_node->GetName().c_str(), + cur_node->GetType().c_str(), comp_graph->GetName().c_str(), stream_id); + continue; + } + + GELOGD("Node %s in subgraph %s stream id: %ld, node num: %zu", cur_node->GetName().c_str(), + comp_graph->GetName().c_str(), stream_id, comp_graph->GetDirectNodesSize()); } - if (stream_set.size() > 1) { - GELOGI("Nodes of graph: %s have different stream id, node num: %zu, different stream num: %zu.", + if (stream_set.size() > 1 || label_set.size() > 1) { + GELOGI("Nodes of graph: %s have different stream id or batch_label, node num: %zu, different stream num: %zu.", comp_graph->GetName().c_str(), comp_graph->GetDirectNodesSize(), stream_set.size()); return false; } + + if (!label_set.empty()) { + (void)AttrUtils::SetStr(comp_graph, ATTR_NAME_BATCH_LABEL, *label_set.begin()); + } return true; } @@ -99,8 +114,8 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com continue; } - if (!IsSameStreamId(subgraph)) { - GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str()); + if (!IsSameStreamIdOrBatchLabel(subgraph)) { + GELOGI("There are more than one stream or batch_label in subgraph %s", subgraph->GetName().c_str()); continue; } OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); @@ -112,9 +127,11 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com return FAILED; } run_context.stream = run_context.graphStreamList[stream_id]; - GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.", - subgraph->GetName().c_str(), engine_name.c_str(), stream_id, - static_cast(reinterpret_cast(run_context.stream))); + std::string batch_label; + (void)AttrUtils::GetStr(subgraph, ATTR_NAME_BATCH_LABEL, batch_label); + GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu, " + "batch_label: %s", subgraph->GetName().c_str(), engine_name.c_str(), stream_id, + static_cast(reinterpret_cast(run_context.stream)), batch_label.c_str()); for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) { GE_CHECK_NOTNULL(*iter); Status ret = (*iter)->OptimizeStreamGraph(*subgraph, run_context); diff --git a/ge/graph/build/stream_graph_optimizer.h b/ge/graph/build/stream_graph_optimizer.h index b0eea135..d69fa7ba 100644 --- a/ge/graph/build/stream_graph_optimizer.h +++ b/ge/graph/build/stream_graph_optimizer.h @@ -41,7 +41,7 @@ class StreamGraphOptimizer { private: void RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map); - bool IsSameStreamId(const ComputeGraphPtr &comp_graph); + bool IsSameStreamIdOrBatchLabel(const ComputeGraphPtr &comp_graph); }; } // namespace ge #endif // GE_GRAPH_BUILD_OPTIMIZE_STREAM_GRAPH_H_ diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc index b506f945..2089ad31 100644 --- a/ge/graph/build/task_generator.cc +++ b/ge/graph/build/task_generator.cc @@ -567,7 +567,7 @@ Status TaskGenerator::MarkFirstAndLastOps(const vector &ops, bool is_ continue; } string op_type = op_desc->GetType(); - if (!is_single_stream && (!op_desc->GetSubgraphInstanceNames().empty() || separator_types.count(op_type) != 0)) { + if (!op_desc->GetSubgraphInstanceNames().empty() || separator_types.count(op_type) != 0) { continuous_op_lists.emplace_back(vector()); } else { continuous_op_lists.back().emplace_back(op_desc); diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc index 44556422..cb68533e 100644 --- a/ge/graph/load/graph_loader.cc +++ b/ge/graph/load/graph_loader.cc @@ -122,14 +122,14 @@ Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string ModelData &model_data) { Status ret; if (!CheckInputPathValid(path)) { - GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); - return GE_EXEC_MODEL_PATH_INVALID; + GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str()); + return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID; } GELOGI("Load model begin, model path is: %s", path.c_str()); if (!key_path.empty() && !CheckInputPathValid(key_path)) { - GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); - return GE_EXEC_MODEL_KEY_PATH_INVALID; + GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str()); + return ACL_ERROR_GE_PARAM_INVALID; } ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data); @@ -144,63 +144,6 @@ Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string return SUCCESS; } -Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority, - const std::shared_ptr &listener, uint32_t &model_id) { - Status ret; - ModelData model_data; - ret = LoadDataFromFile(path, key_path, priority, model_data); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret); - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } - return ret; - } - - ret = LoadModel(model_data, listener, model_id); - if (ret != SUCCESS) { - GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } - } - - if (model_data.model_data != nullptr) { - delete[] static_cast(model_data.model_data); - model_data.model_data = nullptr; - } - - return ret; -} - -Status GraphLoader::LoadModel(const ModelData &model_data, const std::shared_ptr &listener, - uint32_t &model_id) { - GELOGI("Load model begin, model_id:%u.", model_id); - - // For GeOp, Open Device 0 here. - GE_CHK_RT_RET(rtSetDevice(0)); - auto model_manager = ModelManager::GetInstance(); - GE_CHECK_NOTNULL(model_manager); - Status ret = model_manager->LoadModelOffline(model_id, model_data, listener); - if (ret != SUCCESS) { - GE_CHK_RT(rtDeviceReset(0)); - GELOGE(ret, "LoadModel: Load failed."); - return ret; - } - ret = model_manager->Start(model_id); - if (ret != SUCCESS) { - if (model_manager->Unload(model_id) != SUCCESS) { - GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start."); - } - GELOGE(ret, "LoadModel: Start failed."); - return ret; - } - GELOGI("LoadModel: Start model success, model_id:%u.", model_id); - return SUCCESS; -} - Status GraphLoader::CommandHandle(const Command &command) { try { auto model_manager = ModelManager::GetInstance(); @@ -225,16 +168,16 @@ Status GraphLoader::CommandHandle(const Command &command) { } Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model_data, void *dev_ptr, - size_t memsize, void *weight_ptr, size_t weightsize) { + size_t mem_size, void *weight_ptr, size_t weight_size) { GELOGI("Load model begin, model_id:%u.", model_id); // For ACL, Open Device from App. auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->LoadModelOffline( - model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize); + model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size); if (ret != SUCCESS) { - GELOGE(ret, "Load model failed, model_id:%u.", model_id); - return ret; + GELOGE(ACL_ERROR_GE_LOAD_MODEL, "Load model failed, model_id:%u.", model_id); + return ACL_ERROR_GE_LOAD_MODEL; } GELOGI("Load model success, model_id:%u.", model_id); return SUCCESS; @@ -259,8 +202,8 @@ Status GraphLoader::LoadModelWithQ(uint32_t &model_id, const ModelData &model_da GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); if (ret != SUCCESS) { - GELOGE(ret, "Load model with queue failed, model_id:%u.", model_id); - return ret; + GELOGE(ACL_ERROR_GE_LOAD_MODEL, "Load model with queue failed, model_id:%u.", model_id); + return ACL_ERROR_GE_LOAD_MODEL; } GELOGI("Load model with queue success, model_id:%u.", model_id); diff --git a/ge/graph/load/graph_loader.h b/ge/graph/load/graph_loader.h index 974af5c1..3a13a113 100644 --- a/ge/graph/load/graph_loader.h +++ b/ge/graph/load/graph_loader.h @@ -44,12 +44,6 @@ class GraphLoader { static Status GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size); - static Status LoadModel(const ModelData &model_data, const std::shared_ptr &listener, - uint32_t &model_id); - - static Status LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority, - const std::shared_ptr &listener, uint32_t &model_id); - static Status CommandHandle(const Command &command); static Status GetMemoryInfo(int64_t &free); diff --git a/ge/graph/load/new_model_manager/data_dumper.cc b/ge/graph/load/new_model_manager/data_dumper.cc index b331d780..6f65e907 100644 --- a/ge/graph/load/new_model_manager/data_dumper.cc +++ b/ge/graph/load/new_model_manager/data_dumper.cc @@ -319,6 +319,9 @@ Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vis for (auto dim : tensor_descs.at(index).GetShape().GetDims()) { output.mutable_shape()->add_dim(dim); } + for (auto dim : tensor_descs.at(index).GetOriginShape().GetDims()) { + output.mutable_origin_shape()->add_dim(dim); + } int64_t output_size = 0; if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), output_size) != SUCCESS) { GELOGE(PARAM_INVALID, "Get output size filed"); @@ -476,6 +479,9 @@ Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor for (auto dim : tensor_descs.at(index).GetShape().GetDims()) { input.mutable_shape()->add_dim(dim); } + for (auto dim : tensor_descs.at(index).GetOriginShape().GetDims()) { + input.mutable_origin_shape()->add_dim(dim); + } int64_t input_size = 0; if (AttrUtils::GetInt(tensor_descs.at(index), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) { GELOGI("Get aipp input size according to attr is %ld", input_size); diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc index bc755e07..eae6de13 100644 --- a/ge/graph/load/new_model_manager/davinci_model.cc +++ b/ge/graph/load/new_model_manager/davinci_model.cc @@ -289,8 +289,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh if (weight_ptr == nullptr) { weights_mem_base_ = MallocWeightsMem(weights_size); if (weights_mem_base_ == nullptr) { - GELOGE(GE_EXEC_ALLOC_WEIGHT_MEM_FAILED, "Alloc weight memory failed. size: %zu", weights_size); - return GE_EXEC_ALLOC_WEIGHT_MEM_FAILED; + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc weight memory failed. size: %zu", weights_size); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } is_inner_weight_base_ = true; } @@ -307,8 +307,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { if (is_feature_map_mem_has_inited_) { - GELOGE(FAILED, "call InitFeatureMapMem more than once ."); - return FAILED; + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "call InitFeatureMapMem more than once ."); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } is_feature_map_mem_has_inited_ = true; @@ -316,8 +316,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { std::size_t p2p_data_size = P2PMemInfos().at(RT_MEMORY_P2P_DDR).memory_size; if ((dev_ptr != nullptr) && (mem_size < TotalMemSize())) { - GELOGE(FAILED, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize()); - return FAILED; + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize()); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } mem_base_ = static_cast(dev_ptr); @@ -327,8 +327,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { if (TotalMemSize() && mem_base_ == nullptr) { mem_base_ = MallocFeatureMapMem(data_size); if (mem_base_ == nullptr) { - GELOGE(GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED, "Alloc feature map memory failed. size: %zu", data_size); - return GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED; + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc feature map memory failed. size: %zu", data_size); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } GEEVENT("[IMAS]InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, mem_base_, data_size); @@ -343,8 +343,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) { if (p2p_data_size != 0) { p2p_mem_base_ = MallocP2PMem(p2p_data_size); if (p2p_mem_base_ == nullptr) { - GELOGE(GE_EXEC_ALLOC_P2P_MEM_FAILED, "Alloc p2p memory failed,size: %zu", p2p_data_size); - return GE_EXEC_ALLOC_P2P_MEM_FAILED; + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc p2p memory failed,size: %zu", p2p_data_size); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } GELOGI("InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, p2p_mem_base_, p2p_data_size); @@ -710,6 +710,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size } // collect profiling for ge + GE_CHK_STATUS_RET(InitModelProfile(), "Init model profile failed"); auto &profiling_manager = ProfilingManager::Instance(); if (profiling_manager.ProfilingModelLoadOn()) { Status p_ret = ReportProfilingData(); @@ -970,7 +971,7 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma uint32_t parent_index = 0; // Ignore subgraph Data Node. if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { GELOGI("Init zero copy by subgraph Data node: %s.", op_desc->GetName().c_str()); - return InitInputBatchLabel(node); + return SUCCESS; } data_op_list_.push_back(op_desc); @@ -1011,10 +1012,6 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma } data_op_index++; - if (InitInputZeroCopyNodes(node) != SUCCESS) { - GELOGE(PARAM_INVALID, "Input zero copy nodes init failed!"); - return PARAM_INVALID; - } return SUCCESS; } @@ -1036,39 +1033,6 @@ void DavinciModel::AdjustDataOpList(const map &data_by_inde } } -/// -/// @ingroup ge -/// @brief input zero copy node Initialize. -/// @param [in] NodePtr: Data Op. -/// @return Status -/// -Status DavinciModel::InitInputZeroCopyNodes(const NodePtr &node) { - auto out_data_anchor = node->GetOutDataAnchor(kDataIndex); - if (out_data_anchor == nullptr) { - GELOGE(FAILED, "Out data anchor is nullptr"); - return FAILED; - } - for (auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { - auto node = peer_in_data_anchor->GetOwnerNode(); - auto op_desc = node->GetOpDesc(); - if (op_desc == nullptr) { - GELOGE(FAILED, "Op desc is nullptr"); - return FAILED; - } - string batch_label; - (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label); - if (batch_label.empty()) { - batch_label = kDefaultBatchLable; - } - if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { - zero_copy_op_id_batch_label_.emplace(pair(op_desc->GetId(), batch_label)); - GELOGD("Init input zero copy nodes success, op name:%s, op id: %ld, batch label: %s.", op_desc->GetName().c_str(), - op_desc->GetId(), batch_label.c_str()); - } - } - return SUCCESS; -} - bool DavinciModel::IsGetNextSinkDynamic(const OpDescPtr &op_desc) { bool getnext_sink_dynamic = false; if (ge::AttrUtils::GetBool(op_desc, ATTR_GETNEXT_SINK_DYNMAIC, getnext_sink_dynamic) && getnext_sink_dynamic) { @@ -1094,7 +1058,7 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { if (owner_graph->GetParentGraph() != nullptr) { GELOGI("Init zero copy by subgraph NetOutput node: %s.", op_desc->GetName().c_str()); op_list_.erase(op_desc->GetId()); - return InitOutputBatchLabel(node); + return SUCCESS; } output_op_list_.push_back(op_desc); @@ -1146,8 +1110,6 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) { } } - GE_IF_BOOL_EXEC(InitOutputZeroCopyNodes(node) != SUCCESS, - GELOGE(PARAM_INVALID, "Output zero copy nodes init failed!"); return PARAM_INVALID;); GetAllGearsInfo(node); if (is_getnext_sink_dynamic_) { GE_IF_BOOL_EXEC(GetGetDynamicDimsNodeInfo(node) != SUCCESS, @@ -1343,121 +1305,6 @@ void DavinciModel::ParseDynamicOutShape(const std::vector &str_info } } -/// -/// @ingroup ge -/// @brief output zero copy node Initialize. -/// @param [in] NodePtr: netoutput Op. -/// @return Status -/// -Status DavinciModel::InitOutputZeroCopyNodes(const NodePtr &node) { - set nodes_need_record; - for (auto &in_data_anchor : node->GetAllInDataAnchors()) { - auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); - if (peer_out_data_anchor == nullptr) { - continue; - } - auto peer_node = peer_out_data_anchor->GetOwnerNode(); - nodes_need_record.emplace(peer_node); - - // Merge node output multiplexed input, upstream nodes need to be considered in multiple batch scenarios - if (peer_node->GetType() == MERGE) { - for (const auto &merge_peer_in_data_anchor : peer_node->GetAllInDataAnchors()) { - auto merge_peer_out_data_anchor = merge_peer_in_data_anchor->GetPeerOutAnchor(); - if (merge_peer_out_data_anchor == nullptr) { - continue; - } - auto merge_peer_node = merge_peer_out_data_anchor->GetOwnerNode(); - nodes_need_record.emplace(merge_peer_node); - } - } else { - for (const auto &other_in_data_anchor : peer_out_data_anchor->GetPeerInDataAnchors()) { - auto other_in_node = other_in_data_anchor->GetOwnerNode(); - if (other_in_node->GetType() != NETOUTPUT) { - nodes_need_record.emplace(other_in_node); - } - } - } - } - - for (const auto &node_need_record : nodes_need_record) { - auto op_desc = node_need_record->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - string batch_label; - (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label); - if (batch_label.empty()) { - batch_label = kDefaultBatchLable; - } - if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { - zero_copy_op_id_batch_label_.emplace(pair(op_desc->GetId(), batch_label)); - GELOGD("Init Output zero copy nodes success, op name:%s, op id: %ld, batch label: %s.", - op_desc->GetName().c_str(), op_desc->GetId(), batch_label.c_str()); - } - } - return SUCCESS; -} - -/// -/// @ingroup ge -/// @brief input zero copy node Initialize. -/// @param [in] NodePtr: Data Op. -/// @return Status -/// -Status DavinciModel::InitInputBatchLabel(const NodePtr &node) { - string batch_label; - if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) { - return SUCCESS; // Not Multi-batch. - } - - const auto &out_data_anchor = node->GetOutDataAnchor(kDataIndex); - GE_CHECK_NOTNULL(out_data_anchor); - - for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { - const auto &node = peer_in_data_anchor->GetOwnerNode(); - const auto &op_desc = node->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - - if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { - zero_copy_op_id_batch_label_[op_desc->GetId()] = batch_label; - GELOGD("Init input zero copy nodes success, op name: %s, op id: %ld, batch label: %s", op_desc->GetName().c_str(), - op_desc->GetId(), batch_label.c_str()); - } - } - - return SUCCESS; -} - -/// -/// @ingroup ge -/// @brief output zero copy node Initialize for Case. -/// @param [in] NodePtr: netoutput Op. -/// @return Status -/// -Status DavinciModel::InitOutputBatchLabel(const NodePtr &node) { - string batch_label; - if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) { - return SUCCESS; // Not Multi-batch. - } - - for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { - const auto &peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); - if (peer_out_data_anchor == nullptr) { - continue; - } - - const auto &peer_node = peer_out_data_anchor->GetOwnerNode(); - const auto &op_desc = peer_node->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - - if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { - zero_copy_op_id_batch_label_[op_desc->GetId()] = batch_label; - GELOGD("Init Output zero copy nodes success, op name: %s, op id: %ld, batch label: %s", - op_desc->GetName().c_str(), op_desc->GetId(), batch_label.c_str()); - } - } - - return SUCCESS; -} - /// @ingroup ge /// @brief LabelSet Op Initialize. /// @param [in] op_desc: LabelSet Op descriptor. @@ -2240,12 +2087,61 @@ Status DavinciModel::SyncVarData() { return ret; } -inline int64_t SumSize(const vector &size_list) { - int64_t sum_size = 0; - for (const int64_t &size : size_list) { - sum_size += size; +Status DavinciModel::InitModelProfile() { + for (const auto &task : task_list_) { + GE_CHECK_NOTNULL(task); + const FusionOpInfo *fusion_op_info = task->GetFusionOpInfo(); + // when type is RT_MODEL_TASK_KERNEL, ctx is not null + if ((fusion_op_info == nullptr) || fusion_op_info->original_op_names.empty()) { + continue; + } + + GELOGI("task.id = %u, opNum = %zu", task->GetTaskID(), fusion_op_info->original_op_names.size()); + op_id_map_.insert(std::make_pair(fusion_op_info->op_index, task->GetTaskID())); + } + + std::set task_id_set; + using CIT = std::multimap::const_iterator; + using Range = std::pair; + for (const auto &task : task_list_) { + GE_CHECK_NOTNULL(task); + const FusionOpInfo *fusion_op_info = task->GetFusionOpInfo(); + if ((fusion_op_info == nullptr) || fusion_op_info->original_op_names.empty()) { + continue; + } + + if (task_id_set.count(task->GetTaskID()) > 0) { + continue; + } + + const auto &op_desc = GetOpByIndex(fusion_op_info->op_index); + GE_CHK_BOOL_EXEC(op_desc != nullptr, return FAILED, "index: %u out of range", fusion_op_info->op_index); + + ProfileInfo profile; + profile.fusion_info = *fusion_op_info; + Range range = op_id_map_.equal_range(fusion_op_info->op_index); + for (CIT range_idx = range.first; range_idx != range.second; ++range_idx) { + profile.task_count++; + task_id_set.insert(range_idx->second); + } + + // memory info + TaskMemInfo &mem_info = profile.memory_info; + const auto input_size = ModelUtils::GetInputSize(op_desc); + const auto output_size = ModelUtils::GetOutputSize(op_desc); + const auto workspace_size = ModelUtils::GetWorkspaceSize(op_desc); + const auto weight_size = ModelUtils::GetWeightSize(op_desc); + mem_info.input_size = std::accumulate(input_size.begin(), input_size.end(), 0); + mem_info.output_size = std::accumulate(output_size.begin(), output_size.end(), 0); + mem_info.workspace_size = std::accumulate(workspace_size.begin(), workspace_size.end(), 0); + mem_info.weight_size = std::accumulate(weight_size.begin(), weight_size.end(), 0); + mem_info.total_size = mem_info.weight_size + mem_info.input_size + mem_info.output_size + mem_info.workspace_size; + + profile_list_.emplace_back(profile); } - return sum_size; + + GELOGI("fusion task size: %zu, profile info size: %zu", op_id_map_.size(), profile_list_.size()); + return SUCCESS; } Status DavinciModel::SinkModelProfile() { @@ -2253,18 +2149,12 @@ Status DavinciModel::SinkModelProfile() { auto &prof_mgr = ProfilingManager::Instance(); ReporterData reporter_data{}; // report model data tag name - std::string tag_name; - tag_name.append("model_load_info_").append(std::to_string(this->Id())); + std::string tag_name("model_load_info_" + std::to_string(this->Id())); GE_CHK_BOOL_EXEC(memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN, tag_name.c_str(), tag_name.size()) == EOK, return FAILED, "Sink model tag memcpy error."); // Model Header - string name; - if (!om_name_.empty()) { - name = om_name_; - } else { - name = name_; - } + std::string name = om_name_.empty() ? name_ : om_name_; size_t name_len = name.size(); reporter_data.deviceId = device_id_; reporter_data.data = (unsigned char *)&name_len; @@ -2296,128 +2186,71 @@ Status DavinciModel::SinkModelProfile() { GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u.", this->Id()); - int32_t task_num = task_list_.size(); - std::multimap op_id_map; - std::set task_id_set; - for (int32_t i = 0; i < task_num; i++) { - auto task = task_list_[i]; - GE_CHECK_NOTNULL(task); - auto fusion_op_info = task->GetFusionOpInfo(); - // when type is RT_MODEL_TASK_KERNEL, ctx is not null - if (fusion_op_info != nullptr) { - uint32_t op_num = fusion_op_info->original_op_names.size(); - uint32_t task_id = task->GetTaskID(); - if (op_num > 0) { - GELOGI("task.id = %u, opNum = %u", task_id, op_num); - op_id_map.insert(std::make_pair(fusion_op_info->op_index, task_id)); - } - } - } - - struct memoryInfo { - int64_t input_size; - int64_t output_size; - int64_t weight_size; - int64_t workspace_size; - int64_t total_size; - - memoryInfo() : input_size(0), output_size(0), weight_size(0), workspace_size(0), total_size(0) {} - }; - using CIT = std::multimap::const_iterator; using Range = std::pair; - for (int32_t i = 0; i < task_num; i++) { - auto task = task_list_[i]; - GE_CHECK_NOTNULL(task); - auto fusion_op_info = task->GetFusionOpInfo(); - if (fusion_op_info != nullptr && fusion_op_info->original_op_names.size() > 0) { - uint32_t task_id = task->GetTaskID(); - uint32_t op_num = fusion_op_info->original_op_names.size(); - uint32_t task_count = 0; - if (task_id_set.count(task_id) != 0) { - continue; - } - - uint32_t op_id = fusion_op_info->op_index; - Range range = op_id_map.equal_range(op_id); - for (CIT range_idx = range.first; range_idx != range.second; ++range_idx) { - task_count++; - uint32_t task_id = range_idx->second; - task_id_set.insert(task_id); - } - - // op name after fusion - string fusion_op_name = fusion_op_info->op_name; - int32_t fusion_op_name_len = fusion_op_name.size() == 0 ? 1 : fusion_op_name.size(); - reporter_data.data = (unsigned char *)&fusion_op_name_len; + for (const ProfileInfo &profile : profile_list_) { + // op name after fusion + string fusion_op_name = profile.fusion_info.op_name; + int32_t fusion_op_name_len = fusion_op_name.size() == 0 ? 1 : fusion_op_name.size(); + reporter_data.data = (unsigned char *)&fusion_op_name_len; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); + + reporter_data.data = (unsigned char *)fusion_op_name.c_str(); + reporter_data.dataLen = fusion_op_name_len; + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); + + // original op name before fusion + uint32_t op_num = profile.fusion_info.original_op_names.size(); + reporter_data.data = (unsigned char *)&op_num; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); + + for (uint32_t k = 0; k < op_num; k++) { + std::string op_name = profile.fusion_info.original_op_names[k]; + int32_t op_name_len = op_name.size() == 0 ? 1 : op_name.size(); + reporter_data.data = (unsigned char *)&op_name_len; reporter_data.dataLen = sizeof(int32_t); GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u.", this->Id()); - - reporter_data.data = (unsigned char *)fusion_op_name.c_str(); - reporter_data.dataLen = fusion_op_name_len; - GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, - "Reporter data fail, model id:%u.", this->Id()); - - // original op name before fusion - reporter_data.data = (unsigned char *)&op_num; - reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, - "Reporter data fail, model id:%u.", this->Id()); - - for (uint32_t k = 0; k < op_num; k++) { - std::string op_name = fusion_op_info->original_op_names[k]; - int32_t op_name_len = op_name.size() == 0 ? 1 : op_name.size(); - reporter_data.data = (unsigned char *)&op_name_len; - reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, - "Reporter data fail, model id:%u.", this->Id()); - reporter_data.data = (unsigned char *)op_name.c_str(); - reporter_data.dataLen = op_name_len; - GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, - "Reporter data fail, model id:%u.", this->Id()); - } - - // stream id info - uint32_t streamId = task->GetStreamId(); - reporter_data.data = (unsigned char *)&streamId; - reporter_data.dataLen = sizeof(int32_t); - GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, - "Reporter data fail, model id:%u.", this->Id()); - - // memory info - struct memoryInfo memory_info; - uint32_t op_index = fusion_op_info->op_index; - auto iter = op_list_.find(op_index); - GE_CHK_BOOL_EXEC(iter != op_list_.end(), return FAILED, "index is out of range, index: %u", op_index); - auto op_desc = iter->second; - memory_info.input_size = SumSize(ModelUtils::GetInputSize(op_desc)); - memory_info.output_size = SumSize(ModelUtils::GetOutputSize(op_desc)); - memory_info.workspace_size = SumSize(ModelUtils::GetWorkspaceSize(op_desc)); - memory_info.weight_size = SumSize(ModelUtils::GetWeightSize(op_desc)); - memory_info.total_size = - memory_info.weight_size + memory_info.input_size + memory_info.output_size + memory_info.workspace_size; - reporter_data.data = (unsigned char *)&memory_info; - reporter_data.dataLen = sizeof(struct memoryInfo); + reporter_data.data = (unsigned char *)op_name.c_str(); + reporter_data.dataLen = op_name_len; GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u.", this->Id()); + } - // task info - reporter_data.data = (unsigned char *)&task_count; + // stream id info + uint32_t streamId = profile.fusion_info.stream_id; + reporter_data.data = (unsigned char *)&streamId; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); + + // memory info + reporter_data.data = (unsigned char *)&profile.memory_info; + reporter_data.dataLen = sizeof(profile.memory_info); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); + + // task info + reporter_data.data = (unsigned char *)&profile.task_count; + reporter_data.dataLen = sizeof(uint32_t); + GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, + "Reporter data fail, model id:%u.", this->Id()); + + Range task_range = op_id_map_.equal_range(profile.fusion_info.op_index); + for (CIT idx = task_range.first; idx != task_range.second; ++idx) { + uint32_t task_id = idx->second; + reporter_data.data = (unsigned char *)&task_id; reporter_data.dataLen = sizeof(uint32_t); GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, "Reporter data fail, model id:%u.", this->Id()); - - Range task_range = op_id_map.equal_range(op_id); - for (CIT idx = task_range.first; idx != task_range.second; ++idx) { - uint32_t task_id = idx->second; - reporter_data.data = (unsigned char *)&task_id; - reporter_data.dataLen = sizeof(uint32_t); - GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED, - "Reporter data fail, model id:%u.", this->Id()); - } } } + return SUCCESS; } @@ -2991,19 +2824,19 @@ Status DavinciModel::CreateKnownZeroCopyMap(const vector &inputs, const return SUCCESS; } -Status DavinciModel::UpdateKnownZeroCopyAddr() { - for (size_t i = 0; i < total_io_addrs_.size(); ++i) { - auto it_in = knonw_input_data_info_.find(total_io_addrs_[i]); +Status DavinciModel::UpdateKnownZeroCopyAddr(vector &total_io_addrs) { + for (size_t i = 0; i < total_io_addrs.size(); ++i) { + auto it_in = knonw_input_data_info_.find(total_io_addrs[i]); if (it_in != knonw_input_data_info_.end()) { - GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %zu,v addr %p,p addr %p .", i, total_io_addrs_[i], - knonw_input_data_info_.at(total_io_addrs_[i])); - total_io_addrs_[i] = knonw_input_data_info_.at(total_io_addrs_[i]); + GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %zu,v addr %p,p addr %p .", i, total_io_addrs[i], + knonw_input_data_info_.at(total_io_addrs[i])); + total_io_addrs[i] = knonw_input_data_info_.at(total_io_addrs[i]); } - auto it_out = knonw_output_data_info_.find(total_io_addrs_[i]); + auto it_out = knonw_output_data_info_.find(total_io_addrs[i]); if (it_out != knonw_output_data_info_.end()) { - GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %zu,v addr %p,p addr %p .", i, total_io_addrs_[i], - knonw_output_data_info_.at(total_io_addrs_[i])); - total_io_addrs_[i] = knonw_output_data_info_.at(total_io_addrs_[i]); + GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %zu,v addr %p,p addr %p .", i, total_io_addrs[i], + knonw_output_data_info_.at(total_io_addrs[i])); + total_io_addrs[i] = knonw_output_data_info_.at(total_io_addrs[i]); } } GELOGI("DavinciModel::UpdateKnownZeroCopyAddr success."); @@ -3032,7 +2865,7 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector &inputs, const vec } else { total_io_addrs_ = orig_total_io_addrs_; } - GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(), "DavinciModel::UpdateKnownZeroCopyAddr failed."); + GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_), "DavinciModel::UpdateKnownZeroCopyAddr failed."); if (total_args_size_ == 0) { GELOGW("DavinciModel::UpdateKnownNodeArgs device args %p, dst size %u, pass rtMemcpy.", args_, total_args_size_); @@ -3099,7 +2932,14 @@ Status DavinciModel::MallocKnownArgs() { GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } - + // malloc dynamic and static hybrid memory + if (total_hybrid_args_size_ != 0) { + rt_ret = rtMalloc(&hybrid_addrs_, total_hybrid_args_size_, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); + } + } // malloc fixed addr memory, eg: rts op if (total_fixed_addr_size_ != 0) { GELOGI("Begin to allocate fixed addr."); @@ -3257,27 +3097,20 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector(args) + offset + i * kAddrLen; - SetBatchLabelAddr(op_desc, reinterpret_cast(args_val)); - } + input_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen); } for (auto &output_outside_addrs : new_output_outside_addrs_) { ZeroCopyOffset &output_outside = output_outside_addrs.second; - bool ret = output_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen); - if (ret) { - void *args_val = static_cast(args) + offset + i * kAddrLen; - SetBatchLabelAddr(op_desc, reinterpret_cast(args_val)); - } + output_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen); } } - auto it = zero_copy_op_id_batch_label_.find(op_desc->GetId()); - if (it == zero_copy_op_id_batch_label_.end()) { + + string batch_label; + if (!AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label) || batch_label.empty()) { zero_copy_task.SetBatchLabel(kDefaultBatchLable); } else { - zero_copy_task.SetBatchLabel(it->second); + zero_copy_task.SetBatchLabel(batch_label); } std::lock_guard lock(outside_addrs_mutex_); @@ -3287,27 +3120,6 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vectorGetId()); - if (it == zero_copy_op_id_batch_label_.end()) { - return; - } - - const string &batch_label = it->second; - auto iter = zero_copy_batch_label_addrs_.find(batch_label); - if (iter != zero_copy_batch_label_addrs_.end()) { - iter->second.insert(addr); - GELOGD("[ZCPY] Set zero copy batch label and addrs success, batch label: %s, op name:%s.", batch_label.c_str(), - op_desc->GetName().c_str()); - } else { - set addrs = {addr}; - zero_copy_batch_label_addrs_.emplace(pair>(batch_label, addrs)); - GELOGD("[ZCPY] New added zero copy batch label and addrs success, batch label: %s, op name:%s.", - batch_label.c_str(), op_desc->GetName().c_str()); - } -} - /// /// @ingroup ge /// @brief Copy Check input size and model op size. @@ -3441,15 +3253,15 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map & void *addr = data.second.GetDataInfo().at(count).second; void *buffer_addr = reinterpret_cast(reinterpret_cast(buffer.data) + data.second.GetRelativeOffset().at(count)); - GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p", input_or_output.c_str(), - data.first, addr, size, buffer_addr); + GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p, batch_label: %s", + input_or_output.c_str(), data.first, addr, size, buffer_addr, batch_label.c_str()); // For input data, just copy for rts task. for (ZeroCopyTask &task : zero_copy_tasks_) { if (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label) { continue; } uintptr_t addr_val = reinterpret_cast(addr); - if (task.UpdateTaskParam(addr_val, buffer_addr, zero_copy_batch_label_addrs_, batch_label) != SUCCESS) { + if (task.UpdateTaskParam(addr_val, buffer_addr) != SUCCESS) { return FAILED; } } @@ -3811,9 +3623,6 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa GELOGD("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, is_async_mode_); GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed."); is_dynamic_ = input_data.is_dynamic_batch; - if (!is_dynamic_) { - zero_copy_batch_label_addrs_.clear(); - } GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_PRE_PROC_START)); Status ret = CopyModelData(input_data, output_data, is_dynamic_); diff --git a/ge/graph/load/new_model_manager/davinci_model.h b/ge/graph/load/new_model_manager/davinci_model.h index 19888e1f..be8efd90 100644 --- a/ge/graph/load/new_model_manager/davinci_model.h +++ b/ge/graph/load/new_model_manager/davinci_model.h @@ -76,6 +76,20 @@ struct timeInfo { int64_t dumpEndTime; }; +struct TaskMemInfo { + int64_t input_size{0}; + int64_t output_size{0}; + int64_t weight_size{0}; + int64_t workspace_size{0}; + int64_t total_size{0}; +}; + +struct ProfileInfo { + FusionOpInfo fusion_info; + TaskMemInfo memory_info; + uint32_t task_count{0}; +}; + enum ExecuteMode { INITIALIZATION, SYNCHRONIZATION, @@ -226,8 +240,6 @@ class DavinciModel { const vector &GetDataList() const { return data_op_list_; } // get Op - const map &GetOpList() const { return op_list_; } - OpDescPtr GetOpByIndex(uint32_t index) const { if (op_list_.find(index) == op_list_.end()) { return nullptr; @@ -436,10 +448,6 @@ class DavinciModel { int64_t GetLoadEndTime() { return load_end_time_; } - Status SinkModelProfile(); - - Status SinkTimeProfile(const InputData ¤t_data); - Status ReportProfilingData(); void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) { @@ -476,6 +484,14 @@ class DavinciModel { void SetTotalIOAddrs(vector &io_addrs) { total_io_addrs_.insert(total_io_addrs_.end(), io_addrs.begin(), io_addrs.end()); } + void SetHybridArgsSize(uint32_t args_size) { total_hybrid_args_size_ += args_size; } + uint32_t GetHybridArgsSize() { + return total_hybrid_args_size_; + } + void *GetCurrentHybridArgsAddr(uint32_t offset) { + void *cur_args = static_cast(hybrid_addrs_) + offset; + return cur_args; + } void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size); int64_t GetFixedAddrsSize(string tensor_name); void *GetCurrentFixedAddr(int64_t offset) const { @@ -494,7 +510,7 @@ class DavinciModel { Status MallocKnownArgs(); Status UpdateKnownNodeArgs(const vector &inputs, const vector &outputs); Status CreateKnownZeroCopyMap(const vector &inputs, const vector &outputs); - Status UpdateKnownZeroCopyAddr(); + Status UpdateKnownZeroCopyAddr(vector &total_io_addrs); void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; } Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info); @@ -529,15 +545,6 @@ class DavinciModel { struct timeInfo time_info_; int32_t dataInputTid; - /// - /// @ingroup ge - /// @brief Save Batch label Info. - /// @param [in] const OpDescPtr &op_desc - /// @param [in] uintptr_t addr: address value in args block. - /// @return None. - /// - void SetBatchLabelAddr(const OpDescPtr &op_desc, uintptr_t addr); - /// /// @ingroup ge /// @brief Copy Check input size and model op size. @@ -649,14 +656,6 @@ class DavinciModel { /// void AdjustDataOpList(const map &data_by_index); - /// - /// @ingroup ge - /// @brief input zero copy node Initialize. - /// @param [in] NodePtr: Data Op. - /// @return Status - /// - Status InitInputZeroCopyNodes(const NodePtr &node); - /// /// @ingroup ge /// @brief NetOutput Op Initialize. @@ -665,30 +664,6 @@ class DavinciModel { /// Status InitNetOutput(const NodePtr &node); - /// - /// @ingroup ge - /// @brief output zero copy node Initialize. - /// @param [in] NodePtr: Data Op. - /// @return Status - /// - Status InitOutputZeroCopyNodes(const NodePtr &node); - - /// - /// @ingroup ge - /// @brief input zero copy node Initialize for Case. - /// @param [in] NodePtr: Data Op. - /// @return Status - /// - Status InitInputBatchLabel(const NodePtr &node); - - /// - /// @ingroup ge - /// @brief output zero copy node Initialize for Case. - /// @param [in] NodePtr: netoutput Op. - /// @return Status - /// - Status InitOutputBatchLabel(const NodePtr &node); - /// /// @ingroup ge /// @brief Constant Op Init. @@ -837,6 +812,11 @@ class DavinciModel { void SetDataDumperArgs(const ComputeGraphPtr &compute_graph); + Status InitModelProfile(); + Status SinkModelProfile(); + + Status SinkTimeProfile(const InputData ¤t_data); + Status GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data, std::vector &outputs); @@ -914,11 +894,6 @@ class DavinciModel { std::vector zero_copy_tasks_; // Task used Data or NetOutput addr. std::set copy_only_addrs_; // Address need copy to original place. - // {op_id, batch_label} - std::map zero_copy_op_id_batch_label_; - // {batch_label, addrs} - std::map> zero_copy_batch_label_addrs_; - std::vector task_list_; // rt_moodel_handle rtModel_t rt_model_handle_; @@ -977,6 +952,8 @@ class DavinciModel { void *args_ = nullptr; void *args_host_ = nullptr; void *fixed_addrs_ = nullptr; + void *hybrid_addrs_ = nullptr; + uint32_t total_hybrid_args_size_ = 0; int64_t total_fixed_addr_size_ = 0; std::map knonw_input_data_info_; std::map knonw_output_data_info_; @@ -1016,6 +993,9 @@ class DavinciModel { // key: input_index: input is merge node; value: each gear info and each output shape std::map, vector>> merge_nodes_gear_and_real_out_shape_info_; std::vector> all_gears_info_; + + std::multimap op_id_map_; + std::vector profile_list_; }; } // namespace ge #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_ diff --git a/ge/graph/load/new_model_manager/model_manager.cc b/ge/graph/load/new_model_manager/model_manager.cc index 4c2d4530..0dbeb38e 100644 --- a/ge/graph/load/new_model_manager/model_manager.cc +++ b/ge/graph/load/new_model_manager/model_manager.cc @@ -89,6 +89,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u if (op_type == aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY) { std::vector v_aicpu_kernel; std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id); + std::lock_guard lock(map_mutex_); auto iter = model_aicpu_kernel_.find(model_key); if (iter != model_aicpu_kernel_.end()) { GELOGD("kernel destroy session_id %lu, model_id %u.", session_id, model_id); @@ -176,7 +177,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u } void ModelManager::DestroyAicpuSession(uint64_t session_id) { - std::lock_guard lock(sess_ids_mutex_); + std::lock_guard lock(map_mutex_); auto it = sess_ids_.find(session_id); if (it == sess_ids_.end()) { GELOGI("The session: %lu not created.", session_id); @@ -205,7 +206,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) { } ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); auto hybrid_davinci_model = hybrid_model_map_.find(model_id); if (hybrid_davinci_model != hybrid_model_map_.end()) { uint64_t session_id = hybrid_davinci_model->second->GetSessionId(); @@ -215,8 +216,8 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { auto it = model_map_.find(model_id); if (it == model_map_.end()) { - GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id); - return GE_EXEC_MODEL_ID_INVALID; + GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id); + return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID; } uint64_t session_id = it->second->GetSessionId(); DestroyAicpuSession(session_id); @@ -225,7 +226,7 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) { ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) { GELOGD("destroy aicpu kernel in session_id %lu, model_id %u.", session_id, model_id); - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id); if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) { Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id); @@ -238,7 +239,7 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_ } ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) { - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); std::vector v_aicpu_kernel; std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id); if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) { @@ -250,7 +251,7 @@ ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_i } ModelManager::~ModelManager() { - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); model_map_.clear(); model_aicpu_kernel_.clear(); cust_aicpu_so_.clear(); @@ -358,18 +359,18 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr &davinci_model) { GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", id); - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); model_map_[id] = davinci_model; } void ModelManager::InsertModel(uint32_t id, shared_ptr &hybrid_model) { GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", id); - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); hybrid_model_map_[id] = hybrid_model; } Status ModelManager::DeleteModel(uint32_t id) { - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); auto it = model_map_.find(id); auto hybrid_model_it = hybrid_model_map_.find(id); @@ -384,22 +385,22 @@ Status ModelManager::DeleteModel(uint32_t id) { } else if (hybrid_model_it != hybrid_model_map_.end()) { (void)hybrid_model_map_.erase(hybrid_model_it); } else { - GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id); - return GE_EXEC_MODEL_ID_INVALID; + GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id); + return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID; } return SUCCESS; } std::shared_ptr ModelManager::GetModel(uint32_t id) { - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); auto it = model_map_.find(id); return (it == model_map_.end()) ? nullptr : it->second; } std::shared_ptr ModelManager::GetHybridModel(uint32_t id) { - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); auto it = hybrid_model_map_.find(id); return (it == hybrid_model_map_.end()) ? nullptr : it->second; @@ -902,7 +903,7 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, GE_EXEC_MODEL_ID_INVALID, + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "GetInputOutputDescInfo Failed, Invalid model id %u!", model_id); davinci_model->SetModelDescVersion(new_model_desc); @@ -970,8 +971,9 @@ Status ModelManager::GetUserDesignateShapeOrder(const uint32_t model_id, } Status ModelManager::GetCurShape(const uint32_t model_id, std::vector &batch_info, int32_t &dynamic_type) { - std::shared_ptr davinci_model = GetModel(model_id); - GE_CHECK_NOTNULL(davinci_model); + auto davinci_model = GetModel(model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, + "GetCurShape Failed, Invalid Model ID %u!", model_id); davinci_model->GetCurShape(batch_info, dynamic_type); return SUCCESS; } @@ -984,7 +986,8 @@ Status ModelManager::GetModelAttr(uint32_t model_id, std::vector &dynami } std::shared_ptr davinci_model = GetModel(model_id); - GE_CHECK_NOTNULL(davinci_model); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, + "GetModelAttr Failed, Invalid Model ID %u!", model_id); davinci_model->GetModelAttr(dynamic_output_shape_info); return SUCCESS; } @@ -994,9 +997,8 @@ Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id, std::vector &inputFormats, std::vector &outputFormats) { std::shared_ptr davinci_model = GetModel(model_id); - GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetInputOutputDescInfo Failed, Invalid model id %u!", - model_id); - + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, + "GetInputOutputDescInfo Failed, Invalid model id %u!", model_id); return davinci_model->GetInputOutputDescInfoForZeroCopy(input_desc, output_desc, inputFormats, outputFormats); } @@ -1011,18 +1013,14 @@ Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id, Status ModelManager::GetAIPPInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetAIPPInfo failed, invalid model_id is %u.", - model_id); - + "GetAIPPInfo failed, invalid model_id is %u.", model_id); return davinci_model->GetAIPPInfo(index, aipp_info); } Status ModelManager::GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index) { std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "GetAIPPInfo failed, invalid model_id is %u.", - model_id); - + "GetAIPPInfo failed, invalid model_id is %u.", model_id); return davinci_model->GetAippType(index, type, aipp_index); } @@ -1055,7 +1053,15 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model mmTimespec timespec = mmGetTickCount(); ModelHelper model_helper; - Status ret = model_helper.LoadModel(model); + Status ret = model_helper.LoadRootModel(model); + if (model_helper.GetModelType()) { + bool is_shape_unknown = false; + GE_CHK_STATUS_RET(model_helper.GetGeRootModel()->CheckIsUnknownShape(is_shape_unknown), + "CheckIsUnknownShape failed, model id:%u", model_id); + if (is_shape_unknown || GetContext().GetHostExecFlag()) { + return DoLoadHybridModelOnline(model_id, model_helper.GetGeRootModel(), listener); + } + } if (ret != SUCCESS) { GELOGE(ret, "load model failed."); return ret; @@ -1069,8 +1075,8 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed"); return ACL_ERROR_GE_MEMORY_ALLOCATION; } catch (...) { - GELOGE(INTERNAL_ERROR, "Make shared failed since other exception raise"); - return INTERNAL_ERROR; + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed since other exception raise"); + return ACL_ERROR_GE_MEMORY_ALLOCATION; } ret = davinci_model->Assign(ge_model); if (ret != SUCCESS) { @@ -1082,7 +1088,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model int32_t device_id = 0; rtError_t rt_ret = rtGetDevice(&device_id); if (rt_ret != RT_ERROR_NONE || device_id < 0) { - GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); + GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); return RT_ERROR_TO_GE_STATUS(rt_ret); } davinci_model->SetDeviceId(device_id); @@ -1214,7 +1220,7 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy std::shared_ptr davinci_model = GetModel(model_id); GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, - "Invalid model id %u, check weather model has been loaded or not.", model_id); + "Invalid model id %u, check whether model has been loaded or not.", model_id); if (davinci_model->NeedDestroyAicpuKernel()) { GELOGI("Start to destroy specified aicpu kernel."); @@ -1237,7 +1243,7 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy } Status ModelManager::CreateAicpuSession(uint64_t session_id) { - std::lock_guard lock(sess_ids_mutex_); + std::lock_guard lock(map_mutex_); auto it = sess_ids_.find(session_id); // never been created by any model if (it == sess_ids_.end()) { @@ -1456,8 +1462,7 @@ void ModelManager::GenModelId(uint32_t *id) { if (id == nullptr) { return; } - - std::lock_guard lock(map_mutex_); + std::lock_guard lock(map_mutex_); *id = ++max_model_id_; } diff --git a/ge/graph/load/new_model_manager/model_manager.h b/ge/graph/load/new_model_manager/model_manager.h index fc98d9c2..99af8415 100644 --- a/ge/graph/load/new_model_manager/model_manager.h +++ b/ge/graph/load/new_model_manager/model_manager.h @@ -353,8 +353,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::map> hybrid_model_map_; std::map> model_aicpu_kernel_; uint32_t max_model_id_; - std::mutex map_mutex_; - std::mutex sess_ids_mutex_; + std::recursive_mutex map_mutex_; std::mutex session_id_create_mutex_; static::std::mutex exeception_infos_mutex_; uint64_t session_id_bias_; diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 74faeb24..ce31ef30 100644 --- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -90,20 +90,18 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci fusion_op_info_.op_index = context.op_index(); fusion_op_info_.original_op_names = original_op_names; fusion_op_info_.op_name = op_desc_->GetName()); - string session_graph_model_id; - davinci_model_->GetUniqueId(op_desc_, session_graph_model_id); - // get bin_file_key - const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id); // new aicpu kernel(rtCpuKernelLaunch) no need to check function if (kernel_type_ == ccKernelType::CCE_AI_CORE) { - rtError_t rt_ret; - rt_ret = rtGetFunctionByName(const_cast(kernel_def.stub_func().c_str()), &stub_func_); + rtError_t rt_ret = rtGetFunctionByName(const_cast(kernel_def.stub_func().c_str()), &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. stub_func: %s", kernel_def.stub_func().c_str()); return RT_ERROR_TO_GE_STATUS(rt_ret);); } else if (kernel_type_ == ccKernelType::TE) { - rtError_t rt_ret; - rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_); + // get bin_file_key + string session_graph_model_id; + davinci_model_->GetUniqueId(op_desc_, session_graph_model_id); + const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id); + rtError_t rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. bin_file_key: %s", bin_file_key); return RT_ERROR_TO_GE_STATUS(rt_ret);); @@ -372,7 +370,11 @@ Status KernelTaskInfo::SuperKernelDistribute() { Status KernelTaskInfo::Distribute() { GELOGD("KernelTaskInfo Distribute Start."); if (davinci_model_->IsKnownNode()) { - args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); + if (kernel_type_ == ccKernelType::TE) { + args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { + args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_); + } GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_); } rtError_t rt_ret = RT_ERROR_NONE; @@ -428,36 +430,31 @@ Status KernelTaskInfo::UpdateArgs() { const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); vector input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_); vector output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_); - vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_); vector io_addrs; - if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) { - io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); - io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); + io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); + io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); + if (kernel_type_ == ccKernelType::TE) { + vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_); io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); - } else { - string peer_input_name; - if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) { - uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name); - if (output_index > output_data_addrs.size()) { - GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.", - output_data_addrs.size(), output_index); - return FAILED; - } - io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); - for (size_t i = 0; i < output_data_addrs.size(); ++i) { - if (i == output_index) { - void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_); - io_addrs.emplace_back(fixed_addr); - continue; - } - io_addrs.emplace_back(output_data_addrs[i]); - } - io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); + davinci_model_->SetTotalIOAddrs(io_addrs); + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { + davinci_model_->UpdateKnownZeroCopyAddr(io_addrs); + uintptr_t io_addr = reinterpret_cast(args_addr.get()) + sizeof(aicpu::AicpuParamHead); + auto addrs_size = sizeof(uint64_t) * io_addrs.size(); + errno_t sec_ret = memcpy_s(reinterpret_cast(io_addr), addrs_size, io_addrs.data(), addrs_size); + if (sec_ret != EOK) { + GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); + return FAILED; + } + // copy args to device + rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret); + return RT_ERROR_TO_GE_STATUS(rt_ret); } } - davinci_model_->SetTotalIOAddrs(io_addrs); GELOGI("KernelTaskInfo::UpdateArgs success."); return SUCCESS; } @@ -533,33 +530,18 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) { } Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { - domi::KernelDef kernel_def = task_def.kernel(); - uint32_t args_size = kernel_def.args_size(); - args_offset_ = davinci_model->GetTotalArgsSize(); - davinci_model->SetTotalArgsSize(args_size); - GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); - - // get opcontext stored in model + const domi::KernelDef &kernel_def = task_def.kernel(); const domi::KernelContext &context = kernel_def.context(); - // get opdesc - op_desc_ = davinci_model->GetOpByIndex(context.op_index()); - GE_CHECK_NOTNULL(op_desc_); - // alloc fixed addr - string peer_input_name; - if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) { - uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name); - if (output_index > op_desc_->GetOutputsSize()) { - GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", op_desc_->GetOutputsSize(), - output_index); - return FAILED; - } - fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name); - auto tensor_desc = op_desc_->GetOutputDesc(output_index); - int64_t tensor_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); - davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size); - GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr offset %ld", tensor_size, - fixed_addr_offset_); + kernel_type_ = static_cast(context.kernel_type()); + if (kernel_type_ == ccKernelType::TE) { + uint32_t args_size = kernel_def.args_size(); + args_offset_ = davinci_model->GetTotalArgsSize(); + davinci_model->SetTotalArgsSize(args_size); + GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); + } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { + hybrid_args_offset_ = davinci_model->GetHybridArgsSize(); + davinci_model->SetHybridArgsSize(kernel_def.args_size()); + GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_); } return SUCCESS; } @@ -888,7 +870,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } // copy args to new host memory - std::unique_ptr args_addr(new (std::nothrow) uint8_t[args_size_]); + args_addr = std::unique_ptr(new (std::nothrow) uint8_t[args_size_]); GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_) errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_); if (sec_ret != EOK) { @@ -896,8 +878,23 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k return FAILED; } - const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); + auto aicpu_param_head = reinterpret_cast(args_addr.get()); + const auto &ext_info = kernel_def.kernel_ext_info(); + auto init_ret = InitAicpuTaskExtInfo(ext_info); + if (init_ret != SUCCESS) { + GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size()); + return init_ret; + } + GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(), + op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_); + aicpu_param_head->extInfoAddr = reinterpret_cast(aicpu_ext_info_addr_); + aicpu_param_head->extInfoLength = static_cast(ext_info.size()); + + if (davinci_model_->IsKnownNode()) { + return SUCCESS; + } + const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); vector input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); vector output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); vector io_addrs; @@ -914,19 +911,6 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k } } - auto aicpu_param_head = reinterpret_cast(args_addr.get()); - const auto &ext_info = kernel_def.kernel_ext_info(); - auto init_ret = InitAicpuTaskExtInfo(ext_info); - if (init_ret != SUCCESS) { - GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size()); - return init_ret; - } - GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(), - op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_); - - aicpu_param_head->extInfoAddr = reinterpret_cast(aicpu_ext_info_addr_); - aicpu_param_head->extInfoLength = static_cast(ext_info.size()); - // malloc device memory for args rtError_t rt_ret = rtMalloc(static_cast(&args_), args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h index 1f90ede1..7717edd3 100644 --- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h +++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h @@ -159,7 +159,9 @@ class KernelTaskInfo : public TaskInfo { OpDescPtr op_desc_; DavinciModel *davinci_model_; uint32_t args_offset_ = 0; + uint32_t hybrid_args_offset_ = 0; int64_t fixed_addr_offset_ = 0; + std::unique_ptr args_addr = nullptr; bool call_save_dump_ = false; // aicpu ext_info device mem diff --git a/ge/graph/load/new_model_manager/zero_copy_offset.cc b/ge/graph/load/new_model_manager/zero_copy_offset.cc index 9cd3f30b..f27d862d 100644 --- a/ge/graph/load/new_model_manager/zero_copy_offset.cc +++ b/ge/graph/load/new_model_manager/zero_copy_offset.cc @@ -183,22 +183,18 @@ void ZeroCopyOffset::SetOutputOutsideAddrs(const int64_t &input_offset, const bo addr_count_ = out_count; } -bool ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset) { +void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset) { const auto addr_val = reinterpret_cast(outside_addr); - bool set_batch_label_flag = false; for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) { - auto &addrs_mapping_list = GetOutsideAddrs(); - auto args_addrs = addrs_mapping_list[out_count].find(outside_addr); - if (args_addrs != addrs_mapping_list[out_count].end()) { + auto args_addrs = outside_addrs_[out_count].find(outside_addr); + if (args_addrs != outside_addrs_[out_count].end()) { GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid."); void *args_val = static_cast(args) + offset; args_addrs->second.push_back(args_val); GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val, args, offset); - set_batch_label_flag = true; } } - return set_batch_label_flag; } } // namespace ge diff --git a/ge/graph/load/new_model_manager/zero_copy_offset.h b/ge/graph/load/new_model_manager/zero_copy_offset.h index fa80f28b..8ead742d 100644 --- a/ge/graph/load/new_model_manager/zero_copy_offset.h +++ b/ge/graph/load/new_model_manager/zero_copy_offset.h @@ -51,7 +51,7 @@ class ZeroCopyOffset { const OpDescPtr &op_desc, const size_t &idx, bool &fusion_flag); void SetOutputOutsideAddrs(const int64_t &input_offset, const bool &fusion_flag, void *addr, std::vector &tensor_addrs); - bool SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset); + void SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset); // basic_addr of l2-fusion void *GetBasicAddr() const { return basic_addr_; } diff --git a/ge/graph/load/new_model_manager/zero_copy_task.cc b/ge/graph/load/new_model_manager/zero_copy_task.cc index 2609cb4b..b938f14b 100644 --- a/ge/graph/load/new_model_manager/zero_copy_task.cc +++ b/ge/graph/load/new_model_manager/zero_copy_task.cc @@ -22,8 +22,6 @@ #include "common/ge_compiler_options.h" namespace ge { -const char *const kDefaultBatchLable = "Batch_default"; - ZeroCopyTask::ZeroCopyTask(const string &name, uint8_t *args, size_t size) : name_(name), args_addr_(args), args_size_(size), is_updated_(false) {} @@ -66,68 +64,23 @@ void ZeroCopyTask::SetOriginalArgs(const void *info, size_t size) { const uint8_t *data = static_cast(info); args_info_.assign(data, data + size); - GELOGI("[ZCPY] %s set info from virtual_addr: %p, args_addr: %p, args size: %zu, info size: %zu", name_.c_str(), info, + GELOGI("[ZCPY] %s set original args info: %p, args_addr: %p, args size: %zu, info size: %zu", name_.c_str(), info, args_addr_, args_size_, size); } -/** - * @ingroup ge - * @brief Check is dynamic batch node. - * @param [in] addr: virtual address value from Op. - * @param [in] data: data buffer from user. - * @param [in] batch_addrs: dynamic batch addr info. - * @param [in] batch_label: batch label. - * @return: true / false - */ -bool ZeroCopyTask::CheckDynamicBatch(const map> &batch_addrs, const string &batch_label, - uintptr_t addr) { - // Used for dynamic batch / resolution scene - set dynamic_input_addrs; - auto dynamic_input_iter = batch_addrs.find(batch_label); - if (dynamic_input_iter != batch_addrs.end()) { - dynamic_input_addrs = dynamic_input_iter->second; - } - - set fix_input_addrs; - auto fix_input_iter = batch_addrs.find(kDefaultBatchLable); - if (fix_input_iter != batch_addrs.end()) { - fix_input_addrs = fix_input_iter->second; - } - - if (fix_input_addrs.empty()) { - if (!dynamic_input_addrs.empty() && dynamic_input_addrs.find(addr) == dynamic_input_addrs.end()) { - return false; - } - } else { - if (!dynamic_input_addrs.empty() && dynamic_input_addrs.find(addr) == dynamic_input_addrs.end() && - fix_input_addrs.find(addr) == fix_input_addrs.end()) { - return false; - } - } - - return true; -} - /** * @ingroup ge * @brief Set user data addr to Task param. * @param [in] addr: virtual address value from Op. * @param [in] buffer_addr: real_data_buffer_addr from user. - * @param [in] batch_addrs: dynamic batch addr info. - * @param [in] batch_label: batch label. * @return: void */ -Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map> &batch_addrs, - const string &batch_label) { +Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr) { auto iter = task_addr_offset_.find(addr); if (iter != task_addr_offset_.end()) { auto &cur_pair = *iter; uint8_t *args_info = args_info_.data(); for (auto offset : cur_pair.second) { - if (!CheckDynamicBatch(batch_addrs, batch_label, reinterpret_cast(args_addr_ + offset))) { - continue; - } - auto dst_addr = static_cast(buffer_addr); GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx, user_data_addr: %p", name_.c_str(), args_addr_, args_size_, offset, addr, buffer_addr); diff --git a/ge/graph/load/new_model_manager/zero_copy_task.h b/ge/graph/load/new_model_manager/zero_copy_task.h index d0bb2b6d..efabc814 100644 --- a/ge/graph/load/new_model_manager/zero_copy_task.h +++ b/ge/graph/load/new_model_manager/zero_copy_task.h @@ -67,12 +67,9 @@ class ZeroCopyTask { * @brief Set user data addr to Task param. * @param [in] addr: virtual address value from Op. * @param [in] buffer_addr: data buffer_addr from user. - * @param [in] batch_addrs: dynamic batch addr info. - * @param [in] batch_label: batch label. * @return: 0 SUCCESS / others FAILED */ - ge::Status UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map> &batch_addrs, - const string &batch_label); + ge::Status UpdateTaskParam(uintptr_t addr, void *buffer_addr); /** * @ingroup ge @@ -91,9 +88,6 @@ class ZeroCopyTask { return batch_label_; } - protected: - bool CheckDynamicBatch(const map> &batch_addrs, const string &batch_label, uintptr_t addr); - private: const string name_; diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index bd476ad5..7564c57b 100644 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -23,25 +23,15 @@ #include #include #include -#include -#include "common/ge/ge_util.h" #include "common/math/math_util.h" #include "common/thread_pool.h" -#include "common/util.h" -#include "external/graph/types.h" -#include "framework/common/debug/ge_log.h" -#include "framework/common/ge_inner_error_codes.h" -#include "framework/common/ge_types.h" #include "analyzer/analyzer.h" #include "graph/common/ge_call_wrapper.h" #include "graph/common/local_context.h" #include "graph/common/transop_util.h" -#include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" #include "graph/ge_global_options.h" -#include "graph/ge_local_context.h" -#include "graph/manager/graph_mem_allocator.h" #include "graph/manager/util/rt_context_util.h" #include "graph/partition/dynamic_shape_partition.h" #include "graph/passes/enter_pass.h" @@ -61,8 +51,6 @@ #include "graph/passes/dimension_adjust_pass.h" #include "graph/passes/dimension_compute_pass.h" #include "graph/passes/flow_ctrl_pass.h" -#include "graph/passes/hccl_group_pass.h" -#include "graph/passes/hccl_memcpy_pass.h" #include "graph/passes/identity_pass.h" #include "graph/passes/input_output_connection_identify_pass.h" #include "graph/passes/iterator_op_pass.h" @@ -77,7 +65,6 @@ #include "graph/passes/permute_pass.h" #include "graph/passes/prune_pass.h" #include "graph/passes/ref_identity_delete_op_pass.h" -#include "graph/passes/replace_with_empty_const_pass.h" #include "graph/passes/reshape_recovery_pass.h" #include "graph/passes/reshape_remove_pass.h" #include "graph/passes/same_transdata_breadth_fusion_pass.h" @@ -87,13 +74,11 @@ #include "graph/passes/switch_logic_remove_pass.h" #include "graph/passes/switch_to_stream_switch_pass.h" #include "graph/passes/transop_breadth_fusion_pass.h" -#include "graph/passes/transop_depth_fusion_pass.h" #include "graph/passes/transop_nearby_allreduce_fusion_pass.h" #include "graph/passes/transop_symmetry_elimination_pass.h" #include "graph/passes/transop_without_reshape_fusion_pass.h" #include "graph/passes/transpose_transdata_pass.h" #include "graph/passes/variable_op_pass.h" -#include "graph/passes/variable_prepare_op_pass.h" #include "graph/passes/variable_ref_delete_op_pass.h" #include "graph/passes/variable_ref_useless_control_out_delete_pass.h" #include "graph/passes/end_of_sequence_add_control_pass.h" @@ -104,9 +89,6 @@ #include "graph/passes/memcpy_addr_async_pass.h" #include "graph/build/label_allocator.h" #include "graph/utils/tensor_adapter.h" -#include "graph/utils/type_utils.h" -#include "graph/graph_util.h" -#include "graph/types.h" #include "inc/pass_manager.h" #include "init/gelib.h" #include "ir_build/atc_ir_common.h" @@ -550,7 +532,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr (void) AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); } std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, - compute_graph->GetGraphID(), subgraph, compute_graph, session_id, + compute_graph->GetGraphID(), subgraph, compute_graph->GetName(), session_id, GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); @@ -565,7 +547,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr (void) AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy); } std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, - compute_graph->GetGraphID(), subgraph, compute_graph, session_id, + compute_graph->GetGraphID(), subgraph, compute_graph->GetName(), session_id, GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); @@ -2471,7 +2453,8 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, GraphId root_graph_id, const SubGraphInfoPtr &sub_graph_info_ptr, - const ComputeGraphPtr &compute_graph, uint64_t session_id, + const std::string &root_graph_name, + uint64_t session_id, const GEThreadLocalContext &ge_context) { if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { GetContext().SetSessionId(session_id); @@ -2488,9 +2471,13 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_ID for subgraph, graph_id: %u.", root_graph_id); return FAILED; } + if (!AttrUtils::SetStr(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_NAME, root_graph_name)) { + GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_NAME for subgraph, \ + root_graph_name: %s.", root_graph_name.c_str()); + return FAILED; + } compute_graph_tmp->SetSessionID(session_id); Status ret = graph_manager->GetCompilerStages(root_graph_id).optimizer.OptimizeSubGraph(compute_graph_tmp, - compute_graph, engine_name); if (ret != SUCCESS) { GELOGE(ret, "SubGraph optimize Failed %s", engine_name.c_str()); diff --git a/ge/graph/manager/graph_manager.h b/ge/graph/manager/graph_manager.h index feca02fc..d2887c4c 100644 --- a/ge/graph/manager/graph_manager.h +++ b/ge/graph/manager/graph_manager.h @@ -219,7 +219,8 @@ class GraphManager { static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, GraphId root_graph_id, const SubGraphInfoPtr &sub_graph_info_ptr, - const ComputeGraphPtr &compute_graph, uint64_t session_id, + const std::string &root_graph_name, + uint64_t session_id, const GEThreadLocalContext &ge_context); Status ParseInputsDims(const std::vector &input_tensor); void ParseInputsDimsForData(const std::vector &input_tensor); diff --git a/ge/graph/manager/graph_mem_allocator.cc b/ge/graph/manager/graph_mem_allocator.cc index b832986b..7ee7df20 100644 --- a/ge/graph/manager/graph_mem_allocator.cc +++ b/ge/graph/manager/graph_mem_allocator.cc @@ -16,10 +16,7 @@ #include "graph/manager/graph_mem_allocator.h" -#include #include - -#include "framework/common/debug/ge_log.h" #include "graph/manager/graph_caching_allocator.h" #include "graph/manager/rdma_pool_allocator.h" diff --git a/ge/graph/manager/memory_api.cc b/ge/graph/manager/memory_api.cc index 45e4bb65..0798eb51 100644 --- a/ge/graph/manager/memory_api.cc +++ b/ge/graph/manager/memory_api.cc @@ -63,7 +63,7 @@ Status RdmaRemoteRegister(const std::vector &var_info, rtMemType_t }); auto hcom_remote_mem_register = - (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "hcom_remote_access_mem_register"); + (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "HcomRegRemoteAccessMem"); if (hcom_remote_mem_register == nullptr) { GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function."); return FAILED; diff --git a/ge/graph/optimize/graph_optimize.cc b/ge/graph/optimize/graph_optimize.cc index c5ebfda6..1548a065 100644 --- a/ge/graph/optimize/graph_optimize.cc +++ b/ge/graph/optimize/graph_optimize.cc @@ -76,7 +76,7 @@ void AddNodeInputProperty(ComputeGraphPtr &compute_graph) { } } -Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const ComputeGraphPtr &parent_graph, +Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std::string &engine_name) { if (compute_graph == nullptr) { GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeSubGraph]: compute_graph is nullptr."); @@ -106,10 +106,6 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const Com for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { Status ret = (*iter)->OptimizeFusedGraphAfterGraphSlice(*(compute_graph)); if (ret != SUCCESS) { - auto root_graph = ge::GraphUtils::FindRootGraph(parent_graph); - if (root_graph != nullptr) { - ErrorManager::GetInstance().SaveMstuneCompileFailedMsg(root_graph->GetName()); - } GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphAfterGraphSlice]: graph optimize failed, ret:%d", ret); return ret; } diff --git a/ge/graph/optimize/graph_optimize.h b/ge/graph/optimize/graph_optimize.h index 969b4720..78d580b7 100644 --- a/ge/graph/optimize/graph_optimize.h +++ b/ge/graph/optimize/graph_optimize.h @@ -42,8 +42,7 @@ class GraphOptimize { ~GraphOptimize() = default; // subgraph optimize - Status OptimizeSubGraph(ComputeGraphPtr &compute_graph, const ComputeGraphPtr &parent_graph, - const std::string &engine_name); + Status OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std::string &engine_name); // original graph optimize Status OptimizeOriginalGraph(ComputeGraphPtr &compute_graph); diff --git a/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc b/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc index 3e6377c7..d50b6df9 100644 --- a/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc +++ b/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc @@ -113,6 +113,17 @@ Status DynamicSingleOpResetShapePass::ResetOpShape(OpDescPtr &op_desc) { GE_CHECK_NOTNULL(op_desc); std::vector dynamic_shape_dims = {kDynamicShapeDim}; GeShape dynamic_shape(dynamic_shape_dims); + bool reset_shape_flag = false; + if (ResetInputTensorShape(op_desc, dynamic_shape, reset_shape_flag) == SUCCESS && reset_shape_flag) { + (void)ResetOutputTensorShape(op_desc, dynamic_shape); + } + return SUCCESS; +} + +Status DynamicSingleOpResetShapePass::ResetInputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape, + bool &reset_shape_flag) { + reset_shape_flag = false; + GE_CHECK_NOTNULL(op_desc); for (size_t i = 0; i < op_desc->GetAllInputsDesc().size(); i++) { auto input_desc = op_desc->MutableInputDesc(static_cast(i)); GE_CHECK_NOTNULL(input_desc); @@ -125,8 +136,14 @@ Status DynamicSingleOpResetShapePass::ResetOpShape(OpDescPtr &op_desc) { if (CheckIfConstInput(input_desc)) { continue; } + reset_shape_flag = true; input_desc->SetShape(dynamic_shape); } + return SUCCESS; +} + +Status DynamicSingleOpResetShapePass::ResetOutputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape) { + GE_CHECK_NOTNULL(op_desc); for (size_t i = 0; i < op_desc->GetAllOutputsDesc().size(); i++) { auto output_desc = op_desc->MutableOutputDesc(static_cast(i)); GE_CHECK_NOTNULL(output_desc); diff --git a/ge/graph/passes/dynamic_single_op_reset_shape_pass.h b/ge/graph/passes/dynamic_single_op_reset_shape_pass.h index 659bed9c..897fcac6 100644 --- a/ge/graph/passes/dynamic_single_op_reset_shape_pass.h +++ b/ge/graph/passes/dynamic_single_op_reset_shape_pass.h @@ -27,6 +27,8 @@ class DynamicSingleOpResetShapePass : public GraphPass { private: Status ResetOpShape(OpDescPtr &op_desc); + Status ResetInputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape, bool &reset_shape_flag); + Status ResetOutputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape); Status CheckAllAicpuNodes(const ComputeGraphPtr &graph, bool &is_not_aicpu); bool CheckIfConstInput(const GeTensorDescPtr &input_tensor_desc); }; diff --git a/ge/graph/passes/switch_to_stream_switch_pass.cc b/ge/graph/passes/switch_to_stream_switch_pass.cc index f75a104f..a7b922e0 100644 --- a/ge/graph/passes/switch_to_stream_switch_pass.cc +++ b/ge/graph/passes/switch_to_stream_switch_pass.cc @@ -17,13 +17,8 @@ #include "graph/passes/switch_to_stream_switch_pass.h" #include #include "common/ge/ge_util.h" -#include "framework/common/debug/ge_log.h" -#include "framework/common/debug/log.h" -#include "framework/common/ge_inner_error_codes.h" -#include "framework/common/types.h" #include "ge/ge_api_types.h" #include "graph/common/omg_util.h" -#include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" #include "graph/utils/type_utils.h" @@ -125,12 +120,13 @@ void SwitchToStreamSwitchPass::MarkCycleDependence( if (visited.count(tmp_node) > 0) { continue; } - GELOGD("MarkCycleDependence: tmp_node=%s.", tmp_node->GetName().c_str()); for (const NodePtr &out_node : tmp_node->GetOutAllNodes()) { if (switch_nodes.find(out_node) == switch_nodes.end()) { out_nodes.push(out_node); continue; } + GELOGD("MarkCycleDependence: tmp_node=%s, switch_node=%s.", + tmp_node->GetName().c_str(), out_node->GetName().c_str()); GE_IF_BOOL_EXEC(SetCyclicDependenceFlag(out_node) != SUCCESS, GELOGW("set cyclic dependence attr failed."); return ); auto map_iter = switch_cyclic_map_.find(out_node); @@ -602,7 +598,7 @@ Status SwitchToStreamSwitchPass::AddConstNode(const ComputeGraphPtr &graph, cons /// Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_node, const NodePtr &cast_node, const std::set &same_cond_switch) { - GELOGI("ModifySwitchInCtlEdges: switch_node=%s, active_node=%s", switch_node->GetName().c_str(), + GELOGD("ModifySwitchInCtlEdges: switch_node=%s, active_node=%s", switch_node->GetName().c_str(), cast_node->GetName().c_str()); std::string orig_switch_name = switch_node->GetName(); OpDescPtr switch_desc = switch_node->GetOpDesc(); @@ -653,7 +649,7 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no /// Status SwitchToStreamSwitchPass::ModifySwitchOutCtlEdges(const NodePtr &switch_node, const NodePtr &stream_switch, const NodePtr &active_node) { - GELOGI("ModifySwitchOutCtlEdges: switch_node=%s, stream_switch=%s, active_node=%s", switch_node->GetName().c_str(), + GELOGD("ModifySwitchOutCtlEdges: switch_node=%s, stream_switch=%s, active_node=%s", switch_node->GetName().c_str(), stream_switch->GetName().c_str(), active_node->GetName().c_str()); auto find_res = switch_node_map_.find(switch_node); GE_IF_BOOL_EXEC(find_res == switch_node_map_.end(), { diff --git a/ge/graph/preprocess/graph_preprocess.cc b/ge/graph/preprocess/graph_preprocess.cc index 2ee5e330..da862836 100644 --- a/ge/graph/preprocess/graph_preprocess.cc +++ b/ge/graph/preprocess/graph_preprocess.cc @@ -18,7 +18,6 @@ #include #include #include -#include #include "common/formats/format_transfers/format_transfer_fractal_nz.h" #include "common/formats/format_transfers/format_transfer_fractal_z.h" #include "common/formats/format_transfers/format_transfer_nchw_nc1hwc0.h" @@ -28,13 +27,9 @@ #include "common/helper/model_helper.h" #include "common/math/math_util.h" #include "common/op/ge_op_utils.h" -#include "common/util/error_manager/error_manager.h" -#include "common/formats/utils/formats_trans_utils.h" -#include "framework/common/debug/ge_log.h" #include "graph/common/ge_call_wrapper.h" #include "graph/common/local_context.h" #include "graph/common/transop_util.h" -#include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" #include "graph/shape_refiner.h" #include "graph/manager/graph_var_manager.h" @@ -44,29 +39,21 @@ #include "graph/passes/aicpu_constant_folding_pass.h" #include "graph/passes/assert_pass.h" #include "graph/passes/assign_pass.h" -#include "graph/passes/base_pass.h" #include "graph/passes/common_subexpression_elimination_pass.h" #include "graph/passes/cond_pass.h" #include "graph/passes/cond_remove_pass.h" #include "graph/passes/constant_folding_pass.h" -#include "graph/passes/constant_fuse_same_pass.h" -#include "graph/passes/control_trigger_pass.h" #include "graph/passes/dimension_adjust_pass.h" #include "graph/passes/dimension_compute_pass.h" #include "graph/passes/dropout_pass.h" #include "graph/passes/enter_pass.h" -#include "graph/passes/flow_ctrl_pass.h" #include "graph/passes/for_pass.h" -#include "graph/passes/get_original_format_pass.h" #include "graph/passes/guarantee_const_pass.h" #include "graph/passes/hccl_group_pass.h" #include "graph/passes/hccl_memcpy_pass.h" #include "graph/passes/identity_pass.h" #include "graph/passes/infershape_pass.h" -#include "graph/passes/iterator_op_pass.h" -#include "graph/passes/merge_pass.h" #include "graph/passes/net_output_pass.h" -#include "graph/passes/next_iteration_pass.h" #include "graph/passes/no_use_reshape_remove_pass.h" #include "graph/passes/parallel_concat_start_op_pass.h" #include "graph/passes/placeholder_with_default_pass.h" @@ -81,45 +68,18 @@ #include "graph/passes/shape_operate_op_remove_pass.h" #include "graph/passes/snapshot_pass.h" #include "graph/passes/stop_gradient_pass.h" -#include "graph/passes/subgraph_pass.h" -#include "graph/passes/switch_data_edges_bypass.h" -#include "graph/passes/switch_dead_branch_elimination.h" -#include "graph/passes/switch_logic_remove_pass.h" -#include "graph/passes/merge_to_stream_merge_pass.h" -#include "graph/passes/switch_to_stream_switch_pass.h" -#include "graph/passes/attach_stream_label_pass.h" #include "graph/passes/unused_const_pass.h" -#include "graph/passes/unused_op_remove_pass.h" #include "graph/passes/var_is_initialized_op_pass.h" #include "graph/passes/variable_prepare_op_pass.h" #include "graph/preprocess/insert_op/util_insert_aipp_op.h" -#include "graph/types.h" -#include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "inc/pass_manager.h" #include "init/gelib.h" #include "multi_batch_copy_graph.h" -#include "runtime/dev.h" -#include "graph/passes/dimension_adjust_pass.h" -#include "graph/passes/link_gen_mask_nodes_pass.h" -#include "graph/passes/permute_pass.h" -#include "graph/passes/reshape_remove_pass.h" -#include "graph/passes/same_transdata_breadth_fusion_pass.h" -#include "graph/passes/transop_breadth_fusion_pass.h" -#include "graph/passes/transop_depth_fusion_pass.h" -#include "graph/passes/transop_nearby_allreduce_fusion_pass.h" - -#include "graph/passes/cast_remove_pass.h" #include "graph/passes/data_pass.h" -#include "graph/passes/transop_without_reshape_fusion_pass.h" -#include "graph/passes/transpose_transdata_pass.h" -#include "graph/passes/variable_op_pass.h" -#include "graph/passes/variable_prepare_op_pass.h" -#include "graph/passes/variable_ref_delete_op_pass.h" #include "graph/passes/mark_agnostic_pass.h" - namespace ge { namespace { static std::map output_type_str_to_datatype = { diff --git a/ge/graph/preprocess/multi_batch_copy_graph.cc b/ge/graph/preprocess/multi_batch_copy_graph.cc index 9ab74d70..a90f145e 100644 --- a/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -1407,11 +1407,13 @@ Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() { } Status ProcessMultiBatch(ComputeGraphPtr &graph) { - const char *multi_batch_with_case = std::getenv("MULTI_BATCH_WITH_CASE"); - if (multi_batch_with_case != nullptr) { - PassManager pass_manager; - GE_CHK_STATUS_RET(pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass)); - return pass_manager.Run(graph); + if (GetLocalOmgContext().dynamic_node_type.empty()) { + const char *multi_batch_with_switchn = std::getenv("MULTI_BATCH_WITH_SWITCHN"); + if (multi_batch_with_switchn == nullptr) { + PassManager pass_manager; + GE_CHK_STATUS_RET(pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass)); + return pass_manager.Run(graph); + } } if (!GetLocalOmgContext().need_multi_batch) { GELOGI("No need to process_multi for no_train graph."); diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc index 033c5304..ceed40b0 100644 --- a/ge/hybrid/executor/node_state.cc +++ b/ge/hybrid/executor/node_state.cc @@ -18,6 +18,7 @@ #include #include "framework/common/debug/log.h" #include "graph/compute_graph.h" +#include "graph/utils/tensor_utils.h" #include "hybrid_execution_context.h" #include "subgraph_context.h" @@ -35,29 +36,31 @@ ShapeInferenceState::ShapeInferenceState(const NodeItem &node_item) : node_item( this->num_pending_shapes_); } -Status ShapeInferenceState::UpdateInputShape(int idx, - const GeShape &ori_shape, - const GeShape &shape) { +Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target) { if (node_item.IsInputShapeStatic(idx)) { GELOGD("[%s] Trying to update static shape, idx = %d. old shape = [%s], new shape = [%s]", node_item.NodeName().c_str(), idx, node_item.MutableInputDesc(idx)->GetShape().ToString().c_str(), - shape.ToString().c_str()); + target.GetShape().ToString().c_str()); return SUCCESS; } - GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s]", + int64_t tensor_size = -1; + (void) TensorUtils::GetSize(target, tensor_size); + GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s], size = %ld", node_item.NodeName().c_str(), idx, - shape.ToString().c_str(), - ori_shape.ToString().c_str()); + target.GetShape().ToString().c_str(), + target.GetOriginShape().ToString().c_str(), + tensor_size); std::lock_guard lk(mu_); auto tensor_desc = node_item.MutableInputDesc(idx); GE_CHECK_NOTNULL(tensor_desc); - tensor_desc->SetShape(shape); - tensor_desc->SetOriginShape(ori_shape); + tensor_desc->SetShape(target.GetShape()); + tensor_desc->SetOriginShape(target.GetOriginShape()); + (void) TensorUtils::SetSize(*tensor_desc, tensor_size); if (--num_pending_shapes_ == 0) { ready_cv_.notify_all(); } @@ -110,24 +113,24 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex for (auto &p : shape_futures) { auto idx = p.first; auto &future = p.second; - GeShape shape; - GeShape ori_shape; RECORD_SHAPE_INFERENCE_EVENT(&context, node_item.NodeName().c_str(), "[AwaitShape] [idx = %u] Start", idx); - GE_CHK_STATUS_RET(future.Get(ori_shape, shape), - "[%s] Get shape failed. index = %u", - node_item.NodeName().c_str(), - idx); + auto src_tensor_desc = future.GetTensorDesc(); + GE_CHECK_NOTNULL(src_tensor_desc); RECORD_SHAPE_INFERENCE_EVENT(&context, node_item.NodeName().c_str(), "[AwaitShape] [idx = %u] End", idx); - GELOGD("[%s] Update input shape [%u] with shape: [%s] and ori_shape: [%s]", - node_item.NodeName().c_str(), - idx, - shape.ToString().c_str(), - ori_shape.ToString().c_str()); auto input_desc = node_item.MutableInputDesc(idx); GE_CHECK_NOTNULL(input_desc); - input_desc->SetShape(std::move(shape)); - input_desc->SetOriginShape(ori_shape); + int64_t tensor_size = -1; + (void) TensorUtils::GetSize(*src_tensor_desc, tensor_size); + GELOGD("[%s] Update input shape [%u] with shape: [%s] and ori_shape: [%s], index = %zu", + node_item.NodeName().c_str(), + idx, + src_tensor_desc->GetShape().ToString().c_str(), + src_tensor_desc->GetOriginShape().ToString().c_str(), + tensor_size); + input_desc->SetShape(src_tensor_desc->GetShape()); + input_desc->SetOriginShape(src_tensor_desc->GetOriginShape()); + (void) TensorUtils::SetSize(*input_desc, tensor_size); } return SUCCESS; @@ -190,5 +193,14 @@ Status ShapeFuture::Get(GeShape &ori_shape, GeShape &shape) { GELOGD("Get shape from %s:%u. shape = [%s]", src_node_->GetName().c_str(), src_index_, shape.ToString().c_str()); return SUCCESS; } + +GeTensorDescPtr ShapeFuture::GetTensorDesc() { + GELOGD("Start to wait node: %s for getting shape", src_node_->GetName().c_str()); + if (!subgraph_context_->Await(src_node_)) { + GELOGE(INTERNAL_ERROR, "cancelled"); + return nullptr; + } + return src_node_->GetOpDesc()->MutableOutputDesc(src_index_); +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/executor/node_state.h b/ge/hybrid/executor/node_state.h index 04f1ee4b..312e177f 100644 --- a/ge/hybrid/executor/node_state.h +++ b/ge/hybrid/executor/node_state.h @@ -35,6 +35,7 @@ class ShapeFuture { ShapeFuture(NodePtr src_node, uint32_t src_index, SubgraphContext *subgraph_context); ~ShapeFuture() = default; Status Get(GeShape &ori_shape, GeShape &shape); + GeTensorDescPtr GetTensorDesc(); private: NodePtr src_node_; @@ -45,7 +46,7 @@ class ShapeFuture { struct ShapeInferenceState { explicit ShapeInferenceState(const NodeItem &node_item); - Status UpdateInputShape(int idx, const GeShape &ori_shape, const GeShape &shape); + Status UpdateInputShape(int idx, const GeTensorDesc &tensor_desc); void UpdateInputShapeFuture(int idx, ShapeFuture &&future); diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc index 5a464f8e..4b6dddab 100644 --- a/ge/hybrid/executor/subgraph_executor.cc +++ b/ge/hybrid/executor/subgraph_executor.cc @@ -96,7 +96,7 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vectorGetOrCreateNodeState(input_node); GE_CHECK_NOTNULL(node_state); - node_state->GetShapeInferenceState().UpdateInputShape(0, tensor_desc->GetOriginShape(), tensor_desc->GetShape()); + node_state->GetShapeInferenceState().UpdateInputShape(0, *tensor_desc); } } @@ -268,13 +268,6 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta } else { node_state.SetKernelTask(node_item.kernel_task); } - - GELOGD("[%s] Start to invoke CalcOpRunningParam.", node_item.NodeName().c_str()); - RECORD_COMPILE_EVENT(ctx, node_item.NodeName().c_str(), "[CalcOpRunningParam] Start"); - GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().CalcOpRunningParam(*node_item.node), - "[%s] Failed to invoke CalcOpRunningParam.", node_item.NodeName().c_str()); - RECORD_COMPILE_EVENT(ctx, node_item.NodeName().c_str(), "[CalcOpRunningParam] End"); - GELOGD("[%s] Done invoking CalcOpRunningParam successfully.", node_item.NodeName().c_str()); return SUCCESS; } diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index b984eec3..819454db 100644 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -20,12 +20,9 @@ #include "graph/utils/tensor_adapter.h" #include "graph/debug/ge_attr_define.h" #include "hybrid/node_executor/node_executor.h" -#include "common/dump/dump_manager.h" +#include "hybrid/executor//worker//shape_inference_engine.h" #include "common/dump/dump_op.h" -#include "common/types.h" -#include "common/ge_types.h" #include "common/profiling/profiling_manager.h" -#include "runtime/base.h" namespace ge { namespace hybrid { @@ -348,6 +345,10 @@ Status NodeDoneCallback::OnNodeDone() { } GE_CHK_STATUS_RET_NOLOG(PrepareConstInputs(node_item)); + if (node_item.shape_inference_type == DEPEND_SHAPE_RANGE || node_item.shape_inference_type == DEPEND_COMPUTE) { + // update output tensor sizes + GE_CHK_STATUS_RET_NOLOG(ShapeInferenceEngine::CalcOutputTensorSizes(node_item)); + } // PropagateOutputs for type == DEPEND_COMPUTE if (node_item.shape_inference_type == DEPEND_COMPUTE) { if (graph_context_->trace_enabled) { diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc index 1d813526..66d0ede2 100644 --- a/ge/hybrid/executor/worker/shape_inference_engine.cc +++ b/ge/hybrid/executor/worker/shape_inference_engine.cc @@ -17,9 +17,15 @@ #include "hybrid/executor/worker/shape_inference_engine.h" #include "graph/shape_refiner.h" #include "graph/utils/node_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" +#include "common/math/math_util.h" #include "hybrid/node_executor/node_executor.h" namespace ge { +namespace { +const int kAlignment = 32; +} namespace hybrid { ShapeInferenceEngine::ShapeInferenceEngine(GraphExecutionContext *execution_context, SubgraphContext *subgraph_context) : execution_context_(execution_context), @@ -40,7 +46,9 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { } if (node_item.fused_subgraph != nullptr) { - return InferShapeForSubgraph(node_item, *node_item.fused_subgraph); + GE_CHK_STATUS_RET_NOLOG(InferShapeForSubgraph(node_item, *node_item.fused_subgraph)); + GE_CHK_STATUS_RET_NOLOG(CalcOutputTensorSizes(node_item)); + return SUCCESS; } // Skip shape inference for node of type DEPEND_COMPUTE @@ -63,21 +71,15 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) { std::lock_guard lk(mu_); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start"); GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), - "Invoke InferShapeAndType failed."); + "Invoke InferShapeAndType failed."); RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End"); } - // Check again to make sure shape is valid after shape inference - if (node_item.shape_inference_type != DEPEND_SHAPE_RANGE) { - bool is_unknown_shape = false; - GE_CHK_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node_item.node, is_unknown_shape), - "Failed to get shape status. node = %s", - node_item.NodeName().c_str()); - GE_CHK_BOOL_RET_STATUS(!is_unknown_shape, - INTERNAL_ERROR, - "[%s] Shape is still unknown after shape inference.", - node_item.NodeName().c_str()); - } + // update output tensor sizes after shape inference + // error if shape is still unknown and not of type DEPEND_SHAPE_RANGE + RECORD_COMPILE_EVENT(execution_context_, node_item.NodeName().c_str(), "[CalcOpRunningParam] Start"); + GE_CHK_STATUS_RET_NOLOG(CalcOutputTensorSizes(node_item, node_item.shape_inference_type == DEPEND_SHAPE_RANGE)); + RECORD_COMPILE_EVENT(execution_context_, node_item.NodeName().c_str(), "[CalcOpRunningParam] End"); GELOGD("[%s] [HybridTrace] After shape inference. Node = %s", node_item.NodeName().c_str(), @@ -127,8 +129,6 @@ Status ShapeInferenceEngine::PropagateOutputShapes(const NodeItem &node_item) { // propagate each output for (int i = 0; i < node_item.num_outputs; ++i) { auto output_desc = node_item.op_desc->MutableOutputDesc(i); - const auto &shape = output_desc->MutableShape(); - const auto &ori_shape = output_desc->GetOriginShape(); auto &output_nodes = node_item.outputs[i]; // propagate output to all sub-inputs @@ -149,9 +149,7 @@ Status ShapeInferenceEngine::PropagateOutputShapes(const NodeItem &node_item) { infer_state.UpdateInputShapeFuture(dst_input_index_and_node.first, std::move(future)); } else { - GE_CHK_STATUS_RET_NOLOG(infer_state.UpdateInputShape(dst_input_index_and_node.first, - ori_shape, - shape)); + GE_CHK_STATUS_RET_NOLOG(infer_state.UpdateInputShape(dst_input_index_and_node.first, *output_desc)); } } } @@ -230,5 +228,92 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) { } return SUCCESS; } + +Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc, + std::vector &shape, + bool fallback_with_range) { + const auto &tensor_shape = tensor_desc.MutableShape(); + if (tensor_shape.IsUnknownShape()) { + if (!fallback_with_range) { + GELOGE(INTERNAL_ERROR, "Output shape is still unknown after shape inference. shape = [%s]", + tensor_shape.ToString().c_str()); + return INTERNAL_ERROR; + } + + GELOGD("Calc output size by range"); + std::vector> shape_range; + GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range"); + if (shape_range.size() != shape.size()) { + GELOGE(INTERNAL_ERROR, "Number of shape ranges (%zu) mismatches that of dims (%zu)", + shape_range.size(), + shape.size()); + return INTERNAL_ERROR; + } + + for (size_t dim_index = 0; dim_index < shape.size(); ++dim_index) { + if (shape[dim_index] == ge::UNKNOWN_DIM) { + shape[dim_index] = shape_range[dim_index].second; + } + } + + GELOGD("After canonicalization, shape = [%s], before = [%s]", + GeShape(shape).ToString().c_str(), + tensor_shape.ToString().c_str()); + } + + return SUCCESS; +} + +Status ShapeInferenceEngine::CalcTensorSize(DataType data_type, + const std::vector &shape, + int64_t &tensor_size) { + GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str()); + uint32_t type_size; + if (!TypeUtils::GetDataTypeLength(data_type, type_size)) { + GELOGE(INTERNAL_ERROR, "Failed to get data type size"); + return INTERNAL_ERROR; + } + + tensor_size = type_size; + for (const auto &dim : shape) { + GE_CHECK_GE(dim, 0); + GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim), + "Shape size overflow, shape = [%s]", + GeShape(shape).ToString().c_str()); + tensor_size *= dim; + } + + GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1), + "Tensor size is too large: %ld, shape = [%s]", + tensor_size, + GeShape(shape).ToString().c_str()); + tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment; + return SUCCESS; +} + +Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bool fallback_with_range) { + auto op_desc = node_item.GetOpDesc(); + for (size_t output_index = 0; output_index < op_desc->GetOutputsSize(); ++output_index) { + auto tensor_desc = op_desc->MutableOutputDesc(output_index); + GE_CHECK_NOTNULL(tensor_desc); + const auto &shape = tensor_desc->MutableShape(); + // modify on copy + auto dims = shape.GetDims(); + GE_CHK_STATUS_RET(CanonicalizeShape(*tensor_desc, dims, fallback_with_range), + "[%s] Failed to canonicalize shape for output %zu", + node_item.NodeName().c_str(), + output_index); + + int64_t tensor_size; + GE_CHK_STATUS_RET(CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size), + "[%s] Failed to calc tensor size for output %zu", + node_item.NodeName().c_str(), + output_index); + GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size); + (void) TensorUtils::SetSize(*tensor_desc, tensor_size); + } + + return SUCCESS; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/executor/worker/shape_inference_engine.h b/ge/hybrid/executor/worker/shape_inference_engine.h index 7bb9269c..b946577f 100644 --- a/ge/hybrid/executor/worker/shape_inference_engine.h +++ b/ge/hybrid/executor/worker/shape_inference_engine.h @@ -34,7 +34,11 @@ class ShapeInferenceEngine { Status PropagateOutputShapes(const NodeItem &node_item); + static Status CalcOutputTensorSizes(const NodeItem &node_item, bool fallback_with_range = false); + private: + static Status CanonicalizeShape(GeTensorDesc &tensor_desc, std::vector &shape, bool fallback_with_range); + static Status CalcTensorSize(DataType data_type, const std::vector &shape, int64_t &tensor_size); static Status UpdatePeerNodeShape(const Node &node); Status AwaitDependentNodes(NodeState &node_state); diff --git a/ge/hybrid/model/node_item.cc b/ge/hybrid/model/node_item.cc index 69cf334d..eb00f509 100644 --- a/ge/hybrid/model/node_item.cc +++ b/ge/hybrid/model/node_item.cc @@ -22,6 +22,7 @@ #include "graph/debug/ge_attr_define.h" #include "graph/utils/node_utils.h" #include "hybrid/node_executor/node_executor.h" +#include "hybrid/executor/worker/shape_inference_engine.h" namespace ge { namespace hybrid { @@ -47,7 +48,7 @@ Status ParseInputMapping(Node &node, OpDesc &op_desc, FusedSubgraph &fused_subgr GE_CHECK_NOTNULL(dst_op_desc); auto in_idx = node_and_anchor.second->GetIdx(); auto tensor_desc = dst_op_desc->MutableInputDesc(in_idx); - fused_subgraph.input_mapping[parent_index].emplace_back(tensor_desc); + fused_subgraph.input_mapping[static_cast(parent_index)].emplace_back(tensor_desc); GELOGD("Input[%u] mapped to [%s:%u]", parent_index, dst_op_desc->GetName().c_str(), in_idx); } @@ -64,7 +65,7 @@ Status ParseOutputMapping(const OpDescPtr &op_desc, FusedSubgraph &fused_subgrap return FAILED; } - fused_subgraph.output_mapping.emplace(parent_index, op_desc); + fused_subgraph.output_mapping.emplace(static_cast(parent_index), op_desc); return SUCCESS; } @@ -126,12 +127,7 @@ Status NodeItem::Create(const NodePtr &node, std::unique_ptr &node_ite return SUCCESS; } -Status NodeItem::Init() { - GE_CHECK_LE(op_desc->GetInputsSize(), INT32_MAX); - GE_CHECK_LE(op_desc->GetOutputsSize(), INT32_MAX); - num_inputs = static_cast(op_desc->GetInputsSize()); - num_outputs = static_cast(op_desc->GetOutputsSize()); - +void NodeItem::ResolveOptionalInputs() { if (op_desc->GetAllInputsSize() != op_desc->GetInputsSize()) { has_optional_inputs = true; for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) { @@ -143,7 +139,18 @@ Status NodeItem::Init() { } } } +} +Status NodeItem::InitInputsAndOutputs() { + GE_CHECK_LE(op_desc->GetInputsSize(), INT32_MAX); + GE_CHECK_LE(op_desc->GetOutputsSize(), INT32_MAX); + num_inputs = static_cast(op_desc->GetInputsSize()); + num_outputs = static_cast(op_desc->GetOutputsSize()); + ResolveOptionalInputs(); + return SUCCESS; +} + +Status NodeItem::ResolveDynamicState() { (void) AttrUtils::GetBool(op_desc, ATTR_NAME_FORCE_UNKNOWN_SHAPE, is_dynamic); GELOGD("node name = %s, is_dynamic = %d.", this->node_name.c_str(), is_dynamic); if (!is_dynamic) { @@ -151,38 +158,54 @@ Status NodeItem::Init() { "[%s] Failed to get shape status.", node->GetName().c_str()); } + return SUCCESS; +} - if (is_dynamic) { - for (int i = 0; i < num_inputs; ++i) { - const auto &input_desc = MutableInputDesc(i); - GE_CHECK_NOTNULL(input_desc); - if (input_desc->MutableShape().IsUnknownShape()) { - is_input_shape_static_.push_back(false); - } else { - num_static_input_shapes++; - is_input_shape_static_.push_back(true); - GELOGD("[%s] The shape of input[%d] is static. shape = [%s]", - NodeName().c_str(), i, input_desc->MutableShape().ToString().c_str()); - } +Status NodeItem::ResolveStaticInputsAndOutputs() { + for (int i = 0; i < num_inputs; ++i) { + const auto &input_desc = MutableInputDesc(i); + GE_CHECK_NOTNULL(input_desc); + if (input_desc->MutableShape().IsUnknownShape()) { + is_input_shape_static_.push_back(false); + } else { + num_static_input_shapes++; + is_input_shape_static_.push_back(true); + GELOGD("[%s] The shape of input[%d] is static. shape = [%s]", + NodeName().c_str(), i, input_desc->MutableShape().ToString().c_str()); } + } - for (int i = 0; i < num_outputs; ++i) { - const auto &output_desc = op_desc->MutableOutputDesc(i); - GE_CHECK_NOTNULL(output_desc); - if (output_desc->MutableShape().IsUnknownShape()) { - is_output_shape_static = false; - break; - } + for (int i = 0; i < num_outputs; ++i) { + const auto &output_desc = op_desc->MutableOutputDesc(i); + GE_CHECK_NOTNULL(output_desc); + if (output_desc->MutableShape().IsUnknownShape()) { + is_output_shape_static = false; + break; } + } - if (IsControlOp() || node_type == PARTITIONEDCALL) { - shape_inference_type = DEPEND_COMPUTE; - } else { - int32_t unknown_shape_type_val = 0; - (void) AttrUtils::GetInt(op_desc, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val); - shape_inference_type = static_cast(unknown_shape_type_val); - } + if (is_output_shape_static) { + GE_CHK_STATUS_RET_NOLOG(ShapeInferenceEngine::CalcOutputTensorSizes(*this)); + } + return SUCCESS; +} + +void NodeItem::ResolveUnknownShapeType() { + if (IsControlOp() || node_type == PARTITIONEDCALL) { + shape_inference_type = DEPEND_COMPUTE; + } else { + int32_t unknown_shape_type_val = 0; + (void) AttrUtils::GetInt(op_desc, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val); + shape_inference_type = static_cast(unknown_shape_type_val); + } +} +Status NodeItem::Init() { + GE_CHK_STATUS_RET_NOLOG(InitInputsAndOutputs()); + GE_CHK_STATUS_RET_NOLOG(ResolveDynamicState()); + if (is_dynamic) { + ResolveUnknownShapeType(); + GE_CHK_STATUS_RET_NOLOG(ResolveStaticInputsAndOutputs()); GE_CHK_STATUS_RET(ParseFusedSubgraph(*this), "[%s] Failed to parse fused subgraph", node_name.c_str()); } diff --git a/ge/hybrid/model/node_item.h b/ge/hybrid/model/node_item.h index 8fbdc648..99f0d83c 100644 --- a/ge/hybrid/model/node_item.h +++ b/ge/hybrid/model/node_item.h @@ -103,6 +103,11 @@ struct NodeItem { private: explicit NodeItem(NodePtr node); Status Init(); + Status InitInputsAndOutputs(); + void ResolveOptionalInputs(); + Status ResolveDynamicState(); + Status ResolveStaticInputsAndOutputs(); + void ResolveUnknownShapeType(); std::vector is_input_shape_static_; std::vector input_desc_indices_; diff --git a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index 704cab77..eebe2a81 100644 --- a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -42,10 +42,10 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do GELOGE(FAILED, "hccl handle is nullptr! "); return FAILED; } - auto EnqueueHcomOpertion = (HcclResult(*)(HcomOpertion, std::function))dlsym( - context.handle_, "EnqueueHcomOpertion"); - if (EnqueueHcomOpertion == nullptr) { - GELOGE(FAILED, "Failed to invoke EnqueueHcomOpertion hcom unknown node function."); + auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function))dlsym( + context.handle_, "HcomExecEnqueueOperation"); + if (HcomExecEnqueueOperation == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function."); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -70,7 +70,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do const OpDescPtr op_desc = node_item.GetOpDesc(); GE_CHECK_NOTNULL(op_desc); - HcomOpertion op_info; + HcomOperation op_info; op_info.hcclType = op_desc->GetType(); op_info.inputPtr = inputs.empty() ? nullptr : inputs[0]; op_info.outputPtr = outputs.empty() ? nullptr : outputs[0]; @@ -96,7 +96,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do op_info.root = root_id; auto callback = [this, op_desc](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "node %s call EnqueueHcomOpertion failed, ret: 0x%X", op_desc->GetName().c_str(), status); + GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X", op_desc->GetName().c_str(), status); } std::lock_guard lock(this->hccl_mutex_); this->cond_.notify_all(); @@ -110,9 +110,9 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); op_info.count = count; - HcclResult hccl_ret = EnqueueHcomOpertion(op_info, callback); + HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; } @@ -213,11 +213,11 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector done_callback) { GELOGI("[%s] RdmaNodeTask::ExecuteAsync in.", context.GetNodeName()); - auto EnqueueRemoteAccess = + auto HcomExecEnqueueRemoteAccess = (HcclResult(*)(const string &, const vector &, - std::function))dlsym(context.handle_, "EnqueueRemoteAccess"); - if (EnqueueRemoteAccess == nullptr) { - GELOGE(FAILED, "Failed to invoke EnqueueRemoteAccess hcom unknown node function."); + std::function))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess"); + if (HcomExecEnqueueRemoteAccess == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function."); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -228,15 +228,15 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do auto callback = [this](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status); + GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", status); } std::lock_guard lock(this->hccl_mutex_); this->cond_.notify_all(); GELOGI("rdma callback success."); }; - HcclResult hccl_ret = EnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); + HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); return HCCL_E_INTERNAL; } @@ -307,32 +307,32 @@ Status HcclNodeExecutor::Initialize() { GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); return FAILED; } - auto HcomExcutorInitialize = (HcclResult(*)())dlsym(handle_, "HcomExcutorInitialize"); - if (HcomExcutorInitialize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExcutorInitialize hcom unknown node function."); + auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize"); + if (HcomExecInitialize == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function."); return FAILED; } - HcclResult hccl_ret = HcomExcutorInitialize(); + HcclResult hccl_ret = HcomExecInitialize(); if (hccl_ret == HCCL_E_PTR) { GELOGI("Hccl comm is null, hcom executor initialize is not required."); } else if (hccl_ret == HCCL_SUCCESS) { GELOGI("Hcom executor initialize success."); } else { - GELOGE(FAILED, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); return FAILED; } return SUCCESS; } Status HcclNodeExecutor::Finalize() { - auto HcomExcutorFinalize = (HcclResult(*)())dlsym(handle_, "HcomExcutorFinalize"); - if (HcomExcutorFinalize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExcutorFinalize hcom unknown node function."); + auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize"); + if (HcomExecFinalize == nullptr) { + GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function."); return FAILED; } - HcclResult hccl_ret = HcomExcutorFinalize(); + HcclResult hccl_ret = HcomExecFinalize(); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(FAILED, "Call HcomExcutorFinalize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret); return FAILED; } // dlclose file handle diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc index 77004f99..f16bfb2f 100644 --- a/ge/hybrid/node_executor/task_context.cc +++ b/ge/hybrid/node_executor/task_context.cc @@ -148,6 +148,10 @@ Status TaskContext::AllocateWorkspaces() { } Status TaskContext::RegisterCallback(const std::function &callback_fun) const { + if (callback_fun == nullptr) { + GELOGW("[%s] Callback is NULL", GetNodeName()); + return SUCCESS; + } auto ret = execution_context_->callback_manager->RegisterCallback(callback_fun); if (ret != SUCCESS) { GELOGE(ret, "[%s] Failed to register callback", GetNodeName()); @@ -384,6 +388,20 @@ const char *TaskContext::GetNodeName() const { return node_item_->NodeName().c_str(); } +void TaskContext::ReleaseInputsAndOutputs() { + for (int i = 0; i < node_item_->num_inputs; ++i) { + auto tensor = inputs_start_ + i; + tensor->Destroy(); + GELOGD("[%s] Tensor of input[%d] released", GetNodeName(), i); + } + + for (int i = 0; i < node_item_->num_outputs; ++i) { + auto tensor = outputs_start_ + i; + tensor->Destroy(); + GELOGD("[%s] Tensor of output[%d] released", GetNodeName(), i); + } +} + void TaskContext::ReleaseInput(int index) { auto input_tensor = MutableInput(index); if (input_tensor != nullptr) { @@ -456,5 +474,9 @@ Status TaskContext::TryExecuteCallback(const function &callback_fun) con const DumpProperties &TaskContext::GetDumpProperties() const { return execution_context_->dump_properties; } + +bool TaskContext::NeedCallback() { + return node_item_->has_observer || IsDumpEnabled() || execution_context_->profiling_level > 0; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h index 0549a1dc..34754a14 100644 --- a/ge/hybrid/node_executor/task_context.h +++ b/ge/hybrid/node_executor/task_context.h @@ -50,6 +50,8 @@ class TaskContext { ConstGeTensorDescPtr GetOutputDesc(int index) const; GeTensorDescPtr MutableInputDesc(int index) const; GeTensorDescPtr MutableOutputDesc(int index) const; + void ReleaseInputsAndOutputs(); + bool NeedCallback(); void ReleaseInput(int index); const TensorValue *GetInput(int index) const; const TensorValue *GetOutput(int index) const; diff --git a/ge/ir_build/atc_ir_common.cc b/ge/ir_build/atc_ir_common.cc index 2a77e386..77d749de 100644 --- a/ge/ir_build/atc_ir_common.cc +++ b/ge/ir_build/atc_ir_common.cc @@ -63,6 +63,19 @@ vector SplitInputShape(const std::string &input_shape) { } } // namespace +Status CheckInputFormat(const string &input_format) { + if (input_format.empty()) { + return ge::SUCCESS; + } + if (!ge::TypeUtils::IsFormatValid(input_format.c_str())) { + ErrorManager::GetInstance().ATCReportErrMessage( + "E10001", {"parameter", "value", "reason"}, {"--input_format", input_format, "input format is invalid!"}); + GELOGE(ge::PARAM_INVALID, "input format [%s] is invalid!", input_format.c_str()); + return ge::PARAM_INVALID; + } + return ge::SUCCESS; +} + bool CheckDynamicBatchSizeInputShapeValid(unordered_map> shape_map, std::string &dynamic_batch_size) { int32_t size = 0; diff --git a/ge/ir_build/atc_ir_common.h b/ge/ir_build/atc_ir_common.h index 47361167..b26c2f2b 100644 --- a/ge/ir_build/atc_ir_common.h +++ b/ge/ir_build/atc_ir_common.h @@ -75,6 +75,7 @@ Status CheckInsertOpConfParamValid(const std::string insert_op_conf); Status CheckDisableReuseMemoryParamValid(const std::string disable_reuse_memory); Status CheckEnableSingleStreamParamValid(const std::string enable_single_stream); Status CheckImplmodeParamValid(const std::string &optypelist_for_implmode, std::string &op_select_implmode); +Status CheckInputFormat(const string &input_format); void PrintOptionMap(std::map &options, std::string tips); void EraseEndSemicolon(std::string ¶m); } diff --git a/ge/ir_build/ge_ir_build.cc b/ge/ir_build/ge_ir_build.cc index f181170c..c7ef6c1a 100644 --- a/ge/ir_build/ge_ir_build.cc +++ b/ge/ir_build/ge_ir_build.cc @@ -227,7 +227,6 @@ class Impl { ~Impl() { (void)generator_.Finalize(); }; graphStatus CheckOptions(const std::map &options); graphStatus CreateInputsForIRBuild(const ge::Graph &graph, vector &inputs); - graphStatus GetDefaultInputShape(const Graph &graph, string &default_shape); graphStatus UpdateDataOpAttr(const Graph &graph); graphStatus Init(const Graph &graph, const std::map &options); graphStatus BuildModel(const Graph &graph, const std::map &options, @@ -318,42 +317,10 @@ graphStatus Impl::CheckOptions(const std::map &options if (it != options_.end() && (CheckDisableReuseMemoryParamValid(it->second) != GRAPH_SUCCESS)) { return GRAPH_PARAM_INVALID; } - return GRAPH_SUCCESS; -} - -graphStatus Impl::GetDefaultInputShape(const Graph &graph, string &default_shape) { - auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); - GE_CHECK_NOTNULL(compute_graph); - for (ge::NodePtr &input_node : compute_graph->GetDirectNode()) { - GE_CHECK_NOTNULL(input_node); - ge::OpDescPtr op = input_node->GetOpDesc(); - GE_CHECK_NOTNULL(op); - if (op->GetType() == DATA) { - string data_op_name = op->GetName(); - GELOGD("Data op name: %s, data op inputDesc size: %zu", data_op_name.c_str(), op->GetAllInputsDesc().size()); - ge::GeTensorDesc tensor = op->GetInputDesc(0); - ge::GeShape data_shape = tensor.GetShape(); - GELOGD("Data op get shape from InputDesc in ge ir graph."); - - string tmp_shape_str; - const std::vector &tmp_shape = data_shape.GetDims(); - if (tmp_shape.empty()) { - GELOGW("Data op: %s has zero shape dims!", data_op_name.c_str()); - } else { - tmp_shape_str += data_op_name + ":"; - for (auto tmp_dim : tmp_shape) { - tmp_shape_str += to_string((long)tmp_dim) + ","; - } - tmp_shape_str = tmp_shape_str.substr(0, tmp_shape_str.size() - 1); - tmp_shape_str += ";"; - default_shape += tmp_shape_str; - } - - GELOGD("Data op name: %s, data shape: %s.", data_op_name.c_str(), tmp_shape_str.c_str()); - } + // Check Input Format + if (options_.find(kInputFormat) != options_.end()) { + return CheckInputFormat(options_[kInputFormat]); } - default_shape = (default_shape.empty() ? default_shape : default_shape.substr(0, default_shape.size() - 1)); - GELOGI("Get default data op shape: %s from ge ir graph.", default_shape.c_str()); return GRAPH_SUCCESS; } @@ -378,13 +345,7 @@ graphStatus Impl::Init(const Graph &graph, const std::map &input_desc, const vector &output_desc) { return UNSUPPORTED; } -Status OpTask::UpdateArgTable(const SingleOpModelParam ¶m) { - auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); + +Status OpTask::DoUpdateArgTable(const SingleOpModelParam ¶m, bool keep_workspace) { + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, keep_workspace); auto all_addresses = BuildTaskUtils::JoinAddresses(addresses); uintptr_t *arg_base = nullptr; size_t arg_num = 0; @@ -132,6 +133,10 @@ Status OpTask::UpdateArgTable(const SingleOpModelParam ¶m) { return SUCCESS; } +Status OpTask::UpdateArgTable(const SingleOpModelParam ¶m) { + return DoUpdateArgTable(param, true); +} + Status OpTask::LaunchKernel(const vector &input_desc, const vector &input_buffers, vector &output_desc, @@ -792,10 +797,9 @@ Status AiCpuTask::LaunchKernel(const std::vector &input_desc, return SUCCESS; } -Status AiCpuTask::UpdateArgTable(const SingleOpModelParam ¶m) { - auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false); - io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses); - return SUCCESS; +Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam ¶m) { + // aicpu do not have workspace, for now + return DoUpdateArgTable(param, false); } void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { diff --git a/ge/single_op/task/op_task.h b/ge/single_op/task/op_task.h index 761697cb..bf78557c 100644 --- a/ge/single_op/task/op_task.h +++ b/ge/single_op/task/op_task.h @@ -54,6 +54,8 @@ class OpTask { rtStream_t stream); protected: + Status DoUpdateArgTable(const SingleOpModelParam ¶m, bool keep_workspace); + DumpProperties dump_properties_; DumpOp dump_op_; OpDescPtr op_desc_; @@ -110,7 +112,7 @@ class AiCpuBaseTask : public OpTask { AiCpuBaseTask() = default; ~AiCpuBaseTask() override; UnknowShapeOpType GetUnknownType() const { return unknown_type_; } - + Status UpdateArgTable(const SingleOpModelParam ¶m) override; protected: Status UpdateIoAddr(const std::vector &inputs, const std::vector &outputs); Status SetInputConst(); @@ -137,7 +139,6 @@ class AiCpuTask : public AiCpuBaseTask { ~AiCpuTask() override; Status LaunchKernel(rtStream_t stream) override; - Status UpdateArgTable(const SingleOpModelParam ¶m) override; void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; Status LaunchKernel(const std::vector &input_desc, diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index cce17f93..9b361b96 100644 --- a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -293,6 +293,7 @@ const std::string MDL_BANK_PATH_FLAG = "ge.mdl_bank_path"; // Configure op bank path const std::string OP_BANK_PATH_FLAG = "ge.op_bank_path"; +const std::string OP_BANK_UPDATE_FLAG = "ge.op_bank_update"; // Graph run mode enum GraphRunMode { PREDICTION = 0, TRAIN }; @@ -366,6 +367,7 @@ static const char *const OP_COMPILER_CACHE_DIR = ge::OP_COMPILER_CACHE_DIR; static const char *const OP_COMPILER_CACHE_MODE = ge::OP_COMPILER_CACHE_MODE; static const char *const MDL_BANK_PATH = ge::MDL_BANK_PATH_FLAG.c_str(); static const char *const OP_BANK_PATH = ge::OP_BANK_PATH_FLAG.c_str(); +static const char *const OP_BANK_UPDATE = ge::OP_BANK_UPDATE_FLAG.c_str(); static const char *const OP_DEBUG_LEVEL = ge::OP_DEBUG_LEVEL.c_str(); // for interface: aclgrphBuildModel @@ -389,22 +391,13 @@ const std::set ir_builder_suppported_options = {INPUT_FORMAT, OP_COMPILER_CACHE_DIR, OP_COMPILER_CACHE_MODE, MDL_BANK_PATH, - OP_BANK_PATH}; + OP_BANK_PATH, + OP_BANK_UPDATE}; // for interface: aclgrphParse -const std::set ir_parser_suppported_options = {INPUT_FORMAT, - INPUT_SHAPE, - OP_NAME_MAP, - IS_DYNAMIC_INPUT, - INPUT_FP16_NODES, - IS_INPUT_ADJUST_HW_LAYOUT, - IS_OUTPUT_ADJUST_HW_LAYOUT, - OUTPUT, - OUTPUT_TYPE, - OUT_NODES, - COMPRESS_WEIGHT_CONF, - ENABLE_SCOPE_FUSION_PASSES, - LOG_LEVEL}; +const std::set ir_parser_suppported_options = { + INPUT_FP16_NODES, IS_INPUT_ADJUST_HW_LAYOUT, IS_OUTPUT_ADJUST_HW_LAYOUT, OUTPUT, + OUT_NODES, COMPRESS_WEIGHT_CONF, ENABLE_SCOPE_FUSION_PASSES}; // for interface: aclgrphBuildInitialize const std::set global_options = {CORE_TYPE, diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h index 254dbada..eea9824b 100644 --- a/inc/framework/common/ge_types.h +++ b/inc/framework/common/ge_types.h @@ -37,7 +37,9 @@ enum FrameworkType { MINDSPORE = 1, TENSORFLOW = 3, ANDROID_NN, +#ifndef ONLY_COMPILE_OPEN_SRC ONNX, +#endif FRAMEWORK_RESERVED, }; diff --git a/inc/framework/common/profiling/ge_profiling.h b/inc/framework/common/profiling/ge_profiling.h index c51f837f..83699754 100644 --- a/inc/framework/common/profiling/ge_profiling.h +++ b/inc/framework/common/profiling/ge_profiling.h @@ -20,7 +20,8 @@ #include "ge/ge_api_error_codes.h" #include "toolchain/prof_callback.h" -#define MAX_DEV_NUM (64) +const int MAX_DEV_NUM = 64; + enum ProfCommandHandleType { kProfCommandhandleInit = 0, kProfCommandhandleStart, diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h index 5a73126f..1b78860d 100644 --- a/inc/framework/executor/ge_executor.h +++ b/inc/framework/executor/ge_executor.h @@ -30,8 +30,6 @@ #include "runtime/base.h" namespace ge { -class ModelListenerAdapter; - class SingleOp; class DynamicSingleOp; @@ -55,14 +53,8 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { ge::Status Initialize(); ge::Status Finalize(); - // Load model - ge::Status LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key, int32_t priority, - std::shared_ptr listener); - ge::Status UnloadModel(uint32_t modelId); - ge::Status RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data); - // Get input and output descriptor ge::Status GetModelDescInfo(uint32_t model_id, std::vector &input_desc, std::vector &output_desc, bool new_model_desc = false); @@ -168,9 +160,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { ge::Status GetModelDescInfoForZeroCopy(uint32_t model_id, std::vector &input_desc, std::vector &output_desc); - ge::Status LoadModel(uint32_t &model_id, const ge::ModelData &model_data, - std::shared_ptr listener); - ge::Status CommandHandle(const ge::Command &command); ge::Status SetDump(const DumpConfig &dump_config); @@ -297,8 +286,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { private: static bool isInit_; }; - -ge::Status ModelInfoParser(const ge::ModelData &model, ge::ModelInfo &model_info); } // namespace ge #endif // INC_FRAMEWORK_EXECUTOR_GE_EXECUTOR_H_ diff --git a/inc/framework/omg/parser/model_parser.h b/inc/framework/omg/parser/model_parser.h index 20bfcef4..57cff9a7 100644 --- a/inc/framework/omg/parser/model_parser.h +++ b/inc/framework/omg/parser/model_parser.h @@ -36,7 +36,7 @@ using Status = domi::Status; namespace domi { using GetGraphCallback = std::function( - const google::protobuf::Message *root_proto, const std::string &graph)>; + const google::protobuf::Message *root_proto, const std::string &graph)>; class ModelParser { public: ModelParser() {} @@ -44,19 +44,20 @@ class ModelParser { virtual ~ModelParser() {} /** - * @ingroup domi_omg - * @brief Analyze network model data - * @param [in] file Network model file path - * @param [in|out] graph Save the network information after analysis - * @return SUCCESS - * @return Others failed - */ + * @ingroup domi_omg + * @brief Analyze network model data + * @param [in] file Network model file path + * @param [in|out] graph Save the network information after analysis + * @return SUCCESS + * @return Others failed + */ virtual Status Parse(const char *file, ge::Graph &graph) = 0; /** * @ingroup domi_omg * @brief Parse relevant data from memory and save it to graph * @param [in] input Model file memory data + * @param [in] input Model file memory size * @param [in|out] graph A graph for saving the model information after analysis * @return SUCCESS * @return FAILED @@ -64,36 +65,49 @@ class ModelParser { */ virtual Status ParseFromMemory(const char *data, uint32_t size, ge::ComputeGraphPtr &graph) = 0; +#ifndef ONLY_COMPILE_OPEN_SRC + /** + * @ingroup domi_omg + * @brief Parse relevant data from memory and save it to graph + * @param [in] input Model file memory data + * @param [in] input Model file memory size + * @param [in|out] graph A graph for saving the model information after analysis + * @return SUCCESS + * @return FAILED + * @author + */ + virtual Status ParseFromMemory(const char *data, uint32_t size, ge::Graph &graph) = 0; +#endif + /** - * @ingroup domi_omg - * @brief Analyze network model data - * @param [in] proto network model - * @param [in|out] graph Save the network information after analysis - * @return SUCCESS - * @return Others failed - */ + * @ingroup domi_omg + * @brief Analyze network model data + * @param [in] proto network model + * @param [in|out] graph Save the network information after analysis + * @return SUCCESS + * @return Others failed + */ virtual Status ParseProto(const google::protobuf::Message *proto, ge::ComputeGraphPtr &graph) = 0; /** - * @ingroup domi_omg - * @brief Analyze callback model data in subgraph - * @param [in] proto network model - * @param [in] callback callback of subgraph - * @param [in|out] graph Save the network information after analysis - * @return SUCCESS - * @return Others failed - */ - virtual Status ParseProtoWithSubgraph(const google::protobuf::Message *proto, - GetGraphCallback callback, + * @ingroup domi_omg + * @brief Analyze callback model data in subgraph + * @param [in] proto network model + * @param [in] callback callback of subgraph + * @param [in|out] graph Save the network information after analysis + * @return SUCCESS + * @return Others failed + */ + virtual Status ParseProtoWithSubgraph(const google::protobuf::Message *proto, GetGraphCallback callback, ge::ComputeGraphPtr &graph) = 0; /** - * @ingroup domi_omg - * @brief Convert model files to JSON format - * @param [in] model_file Model file path to be converted - * @param [out] json_file Converted JSON file path - * @return SUCCESS - * @return Others failed - */ + * @ingroup domi_omg + * @brief Convert model files to JSON format + * @param [in] model_file Model file path to be converted + * @param [out] json_file Converted JSON file path + * @return SUCCESS + * @return Others failed + */ virtual Status ToJson(const char *model_file, const char *json_file) { return domi::SUCCESS; } /* diff --git a/inc/framework/omg/parser/parser_inner_ctx.h b/inc/framework/omg/parser/parser_inner_ctx.h index f24e2639..5d91bd46 100644 --- a/inc/framework/omg/parser/parser_inner_ctx.h +++ b/inc/framework/omg/parser/parser_inner_ctx.h @@ -59,7 +59,7 @@ struct ParserContext { bool train_flag = false; domi::domiTensorFormat_t format = domi::DOMI_TENSOR_ND; domi::FrameworkType type = domi::FRAMEWORK_RESERVED; - RunMode run_mode = ONLY_PRE_CHECK; + RunMode run_mode = GEN_OM_MODEL; // save caffe custom proto path, used by caffe parse std::string custom_proto_path; // save caffe proto path, used by caffe parse diff --git a/metadef/graph/ge_attr_define.cc b/metadef/graph/ge_attr_define.cc index e2886c49..c942286d 100644 --- a/metadef/graph/ge_attr_define.cc +++ b/metadef/graph/ge_attr_define.cc @@ -167,6 +167,7 @@ const std::string ATTR_NAME_DYNAMIC_OUTPUT_DIMS = "_dynamic_output_dims"; const std::string ATTR_NAME_INPUT_ORIGIN_SIZE = "input_origin_size"; const std::string ATTR_NAME_ROOT_GRAPH_ID = "_root_graph_id"; +const std::string ATTR_NAME_ROOT_GRAPH_NAME = "_root_graph_name"; // Identify node connecting to input and output const std::string ATTR_NAME_NODE_CONNECT_INPUT = "_is_connected_to_data"; diff --git a/metadef/graph/proto/op_mapping_info.proto b/metadef/graph/proto/op_mapping_info.proto index e23b7ebe..7fb6f84b 100644 --- a/metadef/graph/proto/op_mapping_info.proto +++ b/metadef/graph/proto/op_mapping_info.proto @@ -15,6 +15,7 @@ message Output { int32 original_output_data_type = 7; int32 original_output_format = 8; uint64 size = 9; + Shape origin_shape = 10; } message Input { @@ -23,6 +24,7 @@ message Input { Shape shape = 3; uint64 address = 4; uint64 size = 5; + Shape origin_shape = 6; } enum BufferType { diff --git a/metadef/graph/utils/type_utils.cc b/metadef/graph/utils/type_utils.cc index e724b606..9d12c56f 100644 --- a/metadef/graph/utils/type_utils.cc +++ b/metadef/graph/utils/type_utils.cc @@ -118,8 +118,7 @@ const std::map kDataFormatMap = { {"NCDHW", FORMAT_NCDHW}, {"ND", FORMAT_ND}}; -const std::map kStringToFormatMap = - { +const std::map kStringToFormatMap = { {"NCHW", FORMAT_NCHW}, {"NHWC", FORMAT_NHWC}, {"ND", FORMAT_ND}, @@ -164,7 +163,7 @@ const std::map kStringToFormatMap = {"NULL", FORMAT_NULL}, // add for json input {"RESERVED", FORMAT_RESERVED}, - {"UNDEFINED", FORMAT_RESERVED}, + {"UNDEFINED", FORMAT_RESERVED} }; const std::map kDataTypeToStringMap = { diff --git a/metadef/inc/common/proto/op_mapping_info.proto b/metadef/inc/common/proto/op_mapping_info.proto index e23b7ebe..7fb6f84b 100644 --- a/metadef/inc/common/proto/op_mapping_info.proto +++ b/metadef/inc/common/proto/op_mapping_info.proto @@ -15,6 +15,7 @@ message Output { int32 original_output_data_type = 7; int32 original_output_format = 8; uint64 size = 9; + Shape origin_shape = 10; } message Input { @@ -23,6 +24,7 @@ message Input { Shape shape = 3; uint64 address = 4; uint64 size = 5; + Shape origin_shape = 6; } enum BufferType { diff --git a/metadef/inc/common/util/platform_info.h b/metadef/inc/common/util/platform_info.h index ab80f830..af12fc85 100644 --- a/metadef/inc/common/util/platform_info.h +++ b/metadef/inc/common/util/platform_info.h @@ -19,12 +19,8 @@ #include #include -#include #include "platform_info_def.h" - -using std::map; -using std::vector; -using std::string; +#include "platform_infos_def.h" namespace fe { class PlatformInfoManager { @@ -36,66 +32,143 @@ class PlatformInfoManager { uint32_t InitializePlatformInfo(); uint32_t Finalize(); - uint32_t GetPlatformInfo(const string SoCVersion, PlatformInfo &platform_info, OptionalInfo &opti_compilation_info); + uint32_t GetPlatformInfo(const std::string SoCVersion, + PlatformInfo &platform_info, + OptionalInfo &opti_compilation_info); uint32_t GetPlatformInfoWithOutSocVersion(PlatformInfo &platform_info, OptionalInfo &opti_compilation_info); void SetOptionalCompilationInfo(OptionalInfo &opti_compilation_info); + uint32_t GetPlatformInfos(const std::string SoCVersion, + PlatFormInfos &platform_info, + OptionalInfos &opti_compilation_info); + + uint32_t GetPlatformInfoWithOutSocVersion(PlatFormInfos &platform_info, OptionalInfos &opti_compilation_info); + + void SetOptionalCompilationInfo(OptionalInfos &opti_compilation_info); + private: PlatformInfoManager(); ~PlatformInfoManager(); - uint32_t LoadIniFile(string ini_file_real_path); + uint32_t LoadIniFile(std::string ini_file_real_path); - void Trim(string &str); + void Trim(std::string &str); - uint32_t LoadConfigFile(string real_path); + uint32_t LoadConfigFile(std::string real_path); - string RealPath(const std::string &path); + std::string RealPath(const std::string &path); - string GetSoFilePath(); + std::string GetSoFilePath(); - void ParseVersion(map &version_map, string &soc_version, PlatformInfo &platform_info_temp); + void ParseVersion(std::map &version_map, + std::string &soc_version, + PlatformInfo &platform_info_temp); - void ParseSocInfo(map &soc_info_map, PlatformInfo &platform_info_temp); + void ParseSocInfo(std::map &soc_info_map, + PlatformInfo &platform_info_temp); - void ParseCubeOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); + void ParseCubeOfAICoreSpec(std::map &ai_core_spec_map, + PlatformInfo &platform_info_temp); - void ParseBufferOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); + void ParseBufferOfAICoreSpec(std::map &ai_core_spec_map, + PlatformInfo &platform_info_temp); - void ParseUBOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); + void ParseUBOfAICoreSpec(std::map &ai_core_spec_map, + PlatformInfo &platform_info_temp); - void ParseUnzipOfAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); + void ParseUnzipOfAICoreSpec(std::map &ai_core_spec_map, + PlatformInfo &platform_info_temp); - void ParseAICoreSpec(map &ai_core_spec_map, PlatformInfo &platform_info_temp); + void ParseAICoreSpec(std::map &ai_core_spec_map, + PlatformInfo &platform_info_temp); - void ParseBufferOfAICoreMemoryRates(map &ai_core_memory_rates_map, PlatformInfo &platform_info_temp); + void ParseBufferOfAICoreMemoryRates(std::map &ai_core_memory_rates_map, + PlatformInfo &platform_info_temp); - void ParseAICoreMemoryRates(map &ai_core_memory_rates_map, PlatformInfo &platform_info_temp); + void ParseAICoreMemoryRates(std::map &ai_core_memory_rates_map, + PlatformInfo &platform_info_temp); - void ParseUBOfAICoreMemoryRates(map &ai_core_memory_rates_map, PlatformInfo &platform_info_temp); + void ParseUBOfAICoreMemoryRates(std::map &ai_core_memory_rates_map, + PlatformInfo &platform_info_temp); - void ParseAICoreintrinsicDtypeMap(map &ai_coreintrinsic_dtype_map, PlatformInfo &platform_info_temp); + void ParseAICoreintrinsicDtypeMap(std::map &ai_coreintrinsic_dtype_map, + PlatformInfo &platform_info_temp); - void ParseVectorCoreSpec(map &vector_core_spec_map, PlatformInfo &platform_info_temp); + void ParseVectorCoreSpec(std::map &vector_core_spec_map, + PlatformInfo &platform_info_temp); - void ParseVectorCoreMemoryRates(map &vector_core_memory_rates_map, PlatformInfo &platform_info_temp); + void ParseVectorCoreMemoryRates(std::map &vector_core_memory_rates_map, + PlatformInfo &platform_info_temp); - void ParseCPUCache(map &CPUCacheMap, PlatformInfo &platform_info_temp); + void ParseCPUCache(std::map &CPUCacheMap, + PlatformInfo &platform_info_temp); - void ParseVectorCoreintrinsicDtypeMap(map &vector_coreintrinsic_dtype_map, + void ParseVectorCoreintrinsicDtypeMap(std::map &vector_coreintrinsic_dtype_map, PlatformInfo &platform_info_temp); - uint32_t ParsePlatformInfoFromStrToStruct(map> &content_info_map, string &soc_version, + uint32_t ParsePlatformInfoFromStrToStruct(std::map> &content_info_map, + std::string &soc_version, PlatformInfo &platform_info_temp); - uint32_t AssemblePlatformInfoVector(map> &content_info_map); + void ParseVersion(std::map &version_map, + std::string &soc_version, + PlatFormInfos &platform_info_temp); + + void ParseSocInfo(std::map &soc_info_map, PlatFormInfos &platform_info_temp); + + void ParseCubeOfAICoreSpec(std::map &ai_core_spec_map, + PlatFormInfos &platform_info_temp); + + void ParseBufferOfAICoreSpec(std::map &ai_core_spec_map, + PlatFormInfos &platform_info_temp); + + void ParseUBOfAICoreSpec(std::map &ai_core_spec_map, + PlatFormInfos &platform_info_temp); + + void ParseUnzipOfAICoreSpec(std::map &ai_core_spec_map, + PlatFormInfos &platform_info_temp); + + void ParseAICoreSpec(std::map &ai_core_spec_map, + PlatFormInfos &platform_info_temp); + + void ParseBufferOfAICoreMemoryRates(std::map &ai_core_memory_rates_map, + PlatFormInfos &platform_info_temp); + + void ParseAICoreMemoryRates(std::map &ai_core_memory_rates_map, + PlatFormInfos &platform_info_temp); + + void ParseUBOfAICoreMemoryRates(std::map &ai_core_memory_rates_map, + PlatFormInfos &platform_info_temp); + + void ParseAICoreintrinsicDtypeMap(std::map &ai_coreintrinsic_dtype_map, + PlatFormInfos &platform_info_temp); + + void ParseVectorCoreSpec(std::map &vector_core_spec_map, + PlatFormInfos &platform_info_temp); + + void ParseVectorCoreMemoryRates(std::map &vector_core_memory_rates_map, + PlatFormInfos &platform_info_temp); + + void ParseCPUCache(std::map &CPUCacheMap, + PlatFormInfos &platform_info_temp); + + void ParseVectorCoreintrinsicDtypeMap(std::map &vector_coreintrinsic_dtype_map, + PlatFormInfos &platform_info_temp); + + uint32_t ParsePlatformInfo(std::map> &content_info_map, + std::string &soc_version, + PlatFormInfos &platform_info_temp); + + uint32_t AssemblePlatformInfoVector(std::map> &content_info_map); private: bool init_flag_; - map platform_info_map_; + std::map platform_info_map_; OptionalInfo opti_compilation_info_; + std::map platform_infos_map_; + OptionalInfos opti_compilation_infos_; }; } // namespace fe #endif diff --git a/metadef/inc/common/util/platform_infos_def.h b/metadef/inc/common/util/platform_infos_def.h new file mode 100644 index 00000000..be72061c --- /dev/null +++ b/metadef/inc/common/util/platform_infos_def.h @@ -0,0 +1,283 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PLATFORM_INFOS_DEF_H +#define PLATFORM_INFOS_DEF_H + +#include +#include +#include +#include +#include "platform_info_def.h" + +namespace fe { +class StrInfoImpl; +using StrInfoImplPtr = std::shared_ptr; +class StrInfos { + public: + bool Init(); + std::string GetAIcVersion(); + std::string GetCcecAIcVersion(); + std::string GetCcecAIvVersion(); + std::string IsSupportAICpuCompiler(); + + void SetAIcVersion(std::string &aic_version); + void SetCcecAIcVersion(std::string &ccec_aic_version); + void SetCcecAIvVersion(std::string &ccec_aiv_version); + void SetIsSupportAICpuCompiler(std::string &is_support_ai_cpu_compiler); + private: + StrInfoImplPtr str_info_impl_{nullptr}; +}; + +class SoCInfoImpl; +using SoCInfoImplPtr = std::shared_ptr; +class SoCInfos { + public: + bool Init(); + uint32_t GetAICoreCnt(); + uint32_t GetVectorCoreCnt(); + uint32_t GetAICpuCnt(); + MemoryType GetMemType(); + uint64_t GetMemSize(); + L2Type GetL2Type(); + uint64_t GetL2Size(); + uint32_t GetL2PageNum(); + + void SetAICoreCnt(uint32_t ai_core_cnt); + void SetVectorCoreCnt(uint32_t vector_core_cnt); + void SetAICpuCnt(uint32_t ai_cpu_cnt); + void SetMemType(MemoryType memory_type); + void SetMemSize(uint64_t memory_size); + void SetL2Type(L2Type l2_type); + void SetL2Size(uint64_t l2_size); + void SetL2PageNum(uint32_t l2_page_num); + private: + SoCInfoImplPtr soc_info_impl_{nullptr}; +}; + +class AICoreSpecImpl; +using AICoreSpecImplPtr = std::shared_ptr; +class AICoreSpecs { + public: + bool Init(); + double GetCubeFreq(); + uint64_t GetCubeMSize(); + uint64_t GetCubeNSize(); + uint64_t GetCubeKSize(); + uint64_t GetVecCalcSize(); + uint64_t GetL0aSize(); + uint64_t GetL0bSize(); + uint64_t GetL0cSize(); + uint64_t GetL1Size(); + uint64_t GetSmaskBuffer(); + uint64_t GetUBSize(); + uint64_t GetUBBlockSize(); + uint64_t GetUBBankSize(); + uint64_t GetUBBankNum(); + uint64_t GetUBBurstInOneBlock(); + uint64_t GetUBBankGroupNum(); + uint32_t GetUnzipEngines(); + uint32_t GetUnzipMaxRatios(); + uint32_t GetUnzipChannels(); + uint8_t GetUnzipIsTight(); + uint8_t GetCubeVectorSplit(); + + void SetCubeFreq(double cube_freq); + void SetCubeMSize(uint64_t cube_m_size); + void SetCubeNSize(uint64_t cube_n_size); + void SetCubeKSize(uint64_t cube_k_size); + void SetVecCalcSize(uint64_t vec_calc_size); + void SetL0aSize(uint64_t l0_a_size); + void SetL0bSize(uint64_t l0_b_size); + void SetL0cSize(uint64_t l0_c_size); + void SetL1Size(uint64_t l1_size); + void SetSmaskBuffer(uint64_t smask_buffer); + void SetUBSize(uint64_t ub_size); + void SetUBBlockSize(uint64_t ubblock_size); + void SetUBBankSize(uint64_t ubbank_size); + void SetUBBankNum(uint64_t ubbank_num); + void SetUBBurstInOneBlock(uint64_t ubburst_in_one_block); + void SetUBBankGroupNum(uint64_t ubbank_group_num); + void SetUnzipEngines(uint32_t unzip_engines); + void SetUnzipMaxRatios(uint32_t unzip_max_ratios); + void SetUnzipChannels(uint32_t unzip_channels); + void SetUnzipIsTight(uint8_t unzip_is_tight); + void SetCubeVectorSplit(uint8_t cube_vector_split); + private: + AICoreSpecImplPtr aicore_spec_impl_{nullptr}; +}; + +class AICoreMemRateImpl; +using AICoreMemRateImplPtr = std::shared_ptr; +class AICoreMemRates { + public: + bool Init(); + double GetDdrRate(); + double GetDdrReadRate(); + double GetDdrWriteRate(); + double GetL2Rate(); + double GetL2ReadRate(); + double GetL2WriteRate(); + double GetL1ToL0aRate(); + double GetL1ToL0bRate(); + double GetL1ToUBRate(); + double GetL0cToUBRate(); + double GetUBToL2Rate(); + double GetUBToDdrRate(); + double GetUBToL1Rate(); + + void SetDdrRate(double ddr_rate); + void SetDdrReadRate(double ddr_read_rate); + void SetDdrWriteRate(double ddr_write_rate); + void SetL2Rate(double l2_rate); + void SetL2ReadRate(double l2_read_rate); + void SetL2WriteRate(double l2_write_rate); + void SetL1ToL0aRate(double l1_to_l0_a_rate); + void SetL1ToL0bRate(double l1_to_l0_b_rate); + void SetL1ToUBRate(double l1_to_ub_rate); + void SetL0cToUBRate(double l0_c_to_ub_rate); + void SetUBToL2Rate(double ub_to_l2_rate); + void SetUBToDdrRate(double ub_to_ddr_rate); + void SetUBToL1Rate(double ub_to_l1_rate); + private: + AICoreMemRateImplPtr aicore_mem_rate_impl_{nullptr}; +}; + +class VectorCoreSpecImpl; +using VectorCoreSpecImplPtr = std::shared_ptr; +class VectorCoreSpecs { + public: + bool Init(); + double GetVecFreq(); + uint64_t GetVecCalcSize(); + uint64_t GetSmaskBuffer(); + uint64_t GetUBSize(); + uint64_t GetUBBlockSize(); + uint64_t GetUBBankSize(); + uint64_t GetUBBankNum(); + uint64_t GetUBBurstInOneBlock(); + uint64_t GetUBBankGroupNum(); + uint64_t GetVectorRegSize(); + uint64_t GetPredicateRegSize(); + uint64_t GetAddressRegSize(); + uint64_t GetAlignmentRegSize(); + + void SetVecFreq(double vec_freq); + void SetVecCalcSize(uint64_t vec_calc_size); + void SetSmaskBuffer(uint64_t smask_buffer); + void SetUBSize(uint64_t ub_size); + void SetUBBlockSize(uint64_t ubblock_size); + void SetUBBankSize(uint64_t ubbank_size); + void SetUBBankNum(uint64_t ubbank_num); + void SetUBBurstInOneBlock(uint64_t ubburst_in_one_block); + void SetUBBankGroupNum(uint64_t ubbank_group_num); + void SetVectorRegSize(uint64_t vector_reg_size); + void SetPredicateRegSize(uint64_t predicate_reg_size); + void SetAddressRegSize(uint64_t address_reg_size); + void SetAlignmentRegSize(uint64_t alignment_reg_size); + private: + VectorCoreSpecImplPtr vector_core_spec_impl_{nullptr}; +}; + +class VectorCoreMemRateImpl; +using VectorCoreMemRateImplPtr = std::shared_ptr; +class VectorCoreMemRates { + public: + bool Init(); + double GetDdrRate(); + double GetDdrReadRate(); + double GetDdrWriteRate(); + double GetL2Rate(); + double GetL2ReadRate(); + double GetL2WriteRate(); + double GetUBToL2Rate(); + double GetUBToDdrRate(); + + void SetDdrRate(double ddr_rate); + void SetDdrReadRate(double ddr_read_rate); + void SetDdrWriteRate(double ddr_write_rate); + void SetL2Rate(double l2_rate); + void SetL2ReadRate(double l2_read_rate); + void SetL2WriteRate(double l2_write_rate); + void SetUBToL2Rate(double ub_to_l2_rate); + void SetUBToDdrRate(double ub_to_ddr_rate); + private: + VectorCoreMemRateImplPtr vector_core_mem_rate_impl_{nullptr}; +}; + +class CPUCacheImpl; +using CPUCacheImplPtr = std::shared_ptr; +class CPUCaches { + public: + bool Init(); + uint32_t GetAICPUSyncBySW(); + uint32_t GetTSCPUSyncBySW(); + + void SetAICPUSyncBySW(uint32_t AICPUSyncBySW); + void SetTSCPUSyncBySW(uint32_t TSCPUSyncBySW); + private: + CPUCacheImplPtr cpu_cache_impl_{nullptr}; +}; + +class PlatFormInfosImpl; +using PlatFormInfosImplPtr = std::shared_ptr; +class PlatFormInfos { + public: + bool Init(); + StrInfos GetStrInfo(); + SoCInfos GetSocInfo(); + AICoreSpecs GetAICoreSpec(); + AICoreMemRates GetAICoreMemRates(); + std::map> GetAICoreIntrinsicDtype(); + VectorCoreSpecs GetVectorCoreSpec(); + VectorCoreMemRates GetVectorCoreMemRates(); + CPUCaches GetCPUCache(); + std::map> GetVectorCoreIntrinsicDtype(); + + void SetStrInfo(StrInfos &str_infos); + void SetSocInfo(SoCInfos &SoC_infos); + void SetAICoreSpec(AICoreSpecs &AICore_specs); + void SetAICoreMemRates(AICoreMemRates &AICore_mem_rates); + void SetAICoreIntrinsicDtype(std::map> &intrinsic_dtypes); + void SetVectorCoreSpec(VectorCoreSpecs &vector_core_specs); + void SetVectorCoreMemRates(VectorCoreMemRates &vectorcore_mem_rates); + void SetCPUCache(CPUCaches &CPU_caches); + void SetVectorCoreIntrinsicDtype(std::map> &intrinsic_dtypes); + + private: + PlatFormInfosImplPtr platform_infos_impl_{nullptr}; +}; + +class OptionalInfosImpl; +using OptionalInfosImplPtr = std::shared_ptr; +class OptionalInfos { + public: + bool Init(); + std::string GetSocVersion(); + std::string GetCoreType(); + uint32_t GetAICoreNum(); + std::string GetL1FusionFlag(); + + void SetSocVersion(std::string soc_version); + void SetCoreType(std::string core_type); + void SetAICoreNum(uint32_t ai_core_num); + void SetL1FusionFlag(std::string l1_fusion_flag); + private: + OptionalInfosImplPtr optional_infos_impl_{nullptr}; +}; + +} +#endif diff --git a/metadef/inc/graph/debug/ge_attr_define.h b/metadef/inc/graph/debug/ge_attr_define.h index 47c8c93b..6796604a 100644 --- a/metadef/inc/graph/debug/ge_attr_define.h +++ b/metadef/inc/graph/debug/ge_attr_define.h @@ -188,6 +188,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_ORIGIN_SIZE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ROOT_GRAPH_ID; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ROOT_GRAPH_NAME; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NODE_CONNECT_INPUT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NODE_CONNECT_OUTPUT; diff --git a/metadef/inc/register/proto/op_mapping_info.proto b/metadef/inc/register/proto/op_mapping_info.proto index e23b7ebe..7fb6f84b 100644 --- a/metadef/inc/register/proto/op_mapping_info.proto +++ b/metadef/inc/register/proto/op_mapping_info.proto @@ -15,6 +15,7 @@ message Output { int32 original_output_data_type = 7; int32 original_output_format = 8; uint64 size = 9; + Shape origin_shape = 10; } message Input { @@ -23,6 +24,7 @@ message Input { Shape shape = 3; uint64 address = 4; uint64 size = 5; + Shape origin_shape = 6; } enum BufferType {