diff --git a/ge/graph/build/graph_builder.cc b/ge/graph/build/graph_builder.cc
index 74b884de..d1357bc6 100644
--- a/ge/graph/build/graph_builder.cc
+++ b/ge/graph/build/graph_builder.cc
@@ -399,41 +399,6 @@ static Status InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDataAnchor
   return SUCCESS;
 }
 
-static Status GenerateTaskForConstant(const std::shared_ptr<ComputeGraph> &graph) {
-  if (graph->GetGraphUnknownFlag()) {
-    GELOGI("Graph %s is unknown graph, ignore gen_task for constant.", graph->GetName().c_str());
-    return SUCCESS;
-  }
-  for (auto &node : graph->GetDirectNode()) {
-    // CONSTANT not generate task, so insert IDENTITY between CONSTANT and NETOUTPUT
-    auto op_desc = node->GetOpDesc();
-    if (op_desc == nullptr) {
-      continue;
-    }
-    auto op_type = op_desc->GetType();
-    if (op_type == NETOUTPUT) {
-      for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) {
-        const OutDataAnchorPtr &peer_out_anchor = in_data_anchor->GetPeerOutAnchor();
-        GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue);
-        NodePtr in_node = peer_out_anchor->GetOwnerNode();
-        GE_CHECK_NOTNULL(in_node);
-
-        std::string in_node_op_type = in_node->GetType();
-        if (in_node_op_type == CONSTANT) {
-          GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str());
-          std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy";
-          if (InsertMemcpyNode(graph, peer_out_anchor, {in_data_anchor}, name) != SUCCESS) {
-            GELOGE(FAILED, "Insert memcpy between %s and %s failed.",
-                   in_node->GetName().c_str(), node->GetName().c_str());
-            return FAILED;
-          }
-        }
-      }
-    }
-  }
-  return SUCCESS;
-}
-
 Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) {
   bool original_unknown_shape_flag = com_graph->GetGraphUnknownFlag();
   com_graph->SetGraphUnknownFlag(false);
@@ -516,9 +481,6 @@ Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph,
         !sub_graph->GetParentGraph()->GetGraphUnknownFlag()) {
       continue;
     }
-
-    GE_CHK_STATUS_RET(GenerateTaskForConstant(sub_graph), "Generate task For constant node in subgraph failed.");
-
     if (sub_graph->GetGraphUnknownFlag()) {
       // unknown shape build flow
       GE_CHK_STATUS_RET(BuildForUnknownShapeGraph(sub_graph, ge_model_ptr, session_id),
diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h
index 003e8010..54840c6a 100644
--- a/ge/hybrid/executor/hybrid_execution_context.h
+++ b/ge/hybrid/executor/hybrid_execution_context.h
@@ -68,7 +68,7 @@ struct GraphExecutionContext {
   DumpProperties dump_properties;
   bool trace_enabled = false;
   bool dump_enabled = false;
-  std::atomic_bool is_eos_;
+  std::atomic_bool is_eos_{false};
   long profiling_level = 0;
   long iteration = 0;
   void *global_step = nullptr;
diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc
index 4b589a03..ceffa203 100644
--- a/ge/hybrid/executor/hybrid_model_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_executor.cc
@@ -33,9 +33,6 @@ HybridModelExecutor::~HybridModelExecutor() {
   if (context_.rt_gen_context != nullptr) {
     (void) rtCtxDestroy(context_.rt_gen_context);
   }
-  if (context_.global_step != nullptr) {
-    (void) rtFree(context_.global_step);
-  }
 }
 
 Status HybridModelExecutor::Init() {
@@ -49,9 +46,10 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) {
   GELOGD("Start to execute model.");
   auto root_graph_item = model_->GetRootGraphItem();
   GE_CHECK_NOTNULL(root_graph_item);
-
-  GE_CHK_RT_RET(rtMemcpyAsync(context_.global_step, sizeof(uint64_t), &context_.iteration,
-                              sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE_EX, context_.stream));
+  if (context_.global_step != nullptr) {
+    GE_CHK_RT_RET(rtMemcpyAsync(context_.global_step, sizeof(uint64_t), &context_.iteration,
+                                sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE_EX, context_.stream));
+  }
   SubgraphExecutor executor(model_->GetRootGraphItem(), &context_);
   auto ret = ExecuteGraphInternal(executor, args);
   Cleanup();
@@ -102,8 +100,8 @@ Status HybridModelExecutor::InitExecutionContext() {
   GE_CHK_RT_RET(rtCtxGetCurrent(&context_.rt_context));
   GE_CHK_RT_RET(rtCtxCreate(&context_.rt_gen_context, RT_CTX_GEN_MODE, 0));
   GE_CHK_RT_RET(rtCtxSetCurrent(context_.rt_context));
-  GE_CHK_RT_RET(rtMalloc(&context_.global_step, sizeof(uint64_t), RT_MEMORY_HBM));
+  context_.global_step = model_->GetGlobalStep();
   context_.stream = stream_;
   context_.model = model_;
   context_.is_eos_ = false;
@@ -136,6 +134,16 @@ Status HybridModelExecutor::ResetExecutionContext(GraphExecutionContext &context
   string ctx_id = std::to_string(context.context_id);
   RuntimeInferenceContext::DestroyContext(ctx_id);
   GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::CreateContext(ctx_id), "Failed to Destroy RuntimeInferenceContext");
+  RuntimeInferenceContext *ctx = nullptr;
+  GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(ctx_id, &ctx), "Failed to get context");
+  for (auto &host_tensor : context.model->GetHostTensors()) {
+    auto node_id = host_tensor.first;
+    for (const auto &output_idx_and_tensor : host_tensor.second) {
+      auto output_idx = output_idx_and_tensor.first;
+      GELOGD("Preload const host tensor, node_id = %ld, output id = %d", node_id, output_idx);
+      ctx->SetTensor(node_id, output_idx, output_idx_and_tensor.second.Clone());
+    }
+  }
   return SUCCESS;
 }
 }  // namespace hybrid
diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc
index 4706fa97..97b4e1aa 100644
--- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc
@@ -38,6 +38,16 @@ Status StageExecutor::ResetExecutionContext(GraphExecutionContext &context) {
   string ctx_id = std::to_string(context.context_id);
   RuntimeInferenceContext::DestroyContext(ctx_id);
   GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::CreateContext(ctx_id), "Failed to Destroy RuntimeInferenceContext");
+  RuntimeInferenceContext *ctx = nullptr;
+  GE_CHK_GRAPH_STATUS_RET(RuntimeInferenceContext::GetContext(ctx_id, &ctx), "Failed to get context");
+  for (auto &host_tensor : context.model->GetHostTensors()) {
+    auto node_id = host_tensor.first;
+    for (const auto &output_idx_and_tensor : host_tensor.second) {
+      auto output_idx = output_idx_and_tensor.first;
+      GELOGD("Preload const host tensor, node_id = %ld, output id = %d", node_id, output_idx);
+      ctx->SetTensor(node_id, output_idx, output_idx_and_tensor.second.Clone());
+    }
+  }
   return SUCCESS;
 }
 
diff --git a/ge/hybrid/model/hybrid_model.cc b/ge/hybrid/model/hybrid_model.cc
index a0217d52..6acbd6cf 100644
--- a/ge/hybrid/model/hybrid_model.cc
+++ b/ge/hybrid/model/hybrid_model.cc
@@ -357,5 +357,25 @@ TensorValue *HybridModel::GetTensor(const NodePtr &node) const {
   return GetVariable(node->GetName());
 }
 
+
+const map<int64_t, std::vector<std::pair<int, Tensor>>> &HybridModel::GetHostTensors() const {
+  return host_tensors_;
+}
+
+void *HybridModel::GetGlobalStep() const {
+  if (global_step_ == nullptr) {
+    return nullptr;
+  }
+  return global_step_->GetData();
+}
+
+TensorBuffer *HybridModel::GetModelWeight(const string &subgraph_name) const {
+  auto it = weight_buffer_map_.find(subgraph_name);
+  if (it == weight_buffer_map_.end()) {
+    GELOGD("Model weight not found, subgraph name = %s", subgraph_name.c_str());
+    return nullptr;
+  }
+  return it->second.get();
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h
index fae53679..5d772b98 100644
--- a/ge/hybrid/model/hybrid_model.h
+++ b/ge/hybrid/model/hybrid_model.h
@@ -45,6 +45,8 @@ class HybridModel {
     return root_runtime_param_.session_id;
   }
 
+  void *GetGlobalStep() const;
+
   GeModelPtr GetGeModel(const NodePtr &node) const;
 
   NodeItem *MutableNodeItem(const NodePtr &node);
@@ -91,6 +93,10 @@ class HybridModel {
 
   TensorValue* GetTensor(const NodePtr &node) const;
 
+  TensorBuffer* GetModelWeight(const std::string &subgraph_name) const;
+
+  const std::map<int64_t, std::vector<std::pair<int, Tensor>>> &GetHostTensors() const;
+
   const std::vector<domi::TaskDef>* GetTaskDefs(const NodePtr &node) const;
 
   const GraphItem *GetRootGraphItem() const;
@@ -145,6 +151,7 @@ class HybridModel {
   std::unique_ptr<GraphItem> root_graph_item_;
   std::map<std::string, std::unique_ptr<GraphItem>> subgraph_items_;
   std::map<NodePtr, std::unique_ptr<NodeItem>> node_items_;
+  std::map<int64_t, std::vector<std::pair<int, Tensor>>> host_tensors_;
 
   bool is_new_model_desc_ = false;    // support aipp
   bool is_single_op_ = false;
@@ -153,10 +160,10 @@ class HybridModel {
   uint32_t device_id_ = 0;
   uint32_t model_id_ = 0;
   uint8_t *var_mem_base_ = nullptr;
-  std::unique_ptr<TensorBuffer> weight_buffer_;
   std::map<std::string, std::unique_ptr<TensorBuffer>> weight_buffer_map_;
   RuntimeParam root_runtime_param_;
   string om_name_;
+  std::unique_ptr<TensorBuffer> global_step_;
 };
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc
index 25dabd78..b463cd1f 100755
--- a/ge/hybrid/model/hybrid_model_builder.cc
+++ b/ge/hybrid/model/hybrid_model_builder.cc
@@ -145,6 +145,9 @@ Status HybridModelBuilder::Build() {
   GE_CHK_STATUS_RET(InitConstantOps(), "[%s] Failed to init constant op", GetGraphName());
   GE_CHK_STATUS_RET(InitVariableTensors(), "[%s] Failed to init variables", GetGraphName());
   GE_CHK_STATUS_RET(LoadTasks(), "[%s] Failed to load tasks", GetGraphName());
+  GE_CHK_STATUS_RET(OptimizeDependenciesForConstantInputs(),
+                    "[%s] Failed to optimize dependencies for constant inputs",
+                    GetGraphName());
   GELOGI("[%s] Done building hybrid model successfully.", GetGraphName());
   return SUCCESS;
 }
@@ -346,6 +349,7 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s
         auto src_node_item = MutableNodeItem(src_node);
         src_node_item->to_const_output_id_list.emplace(peer_out_anchor->GetIdx());
         dependent_for_shape_inference.emplace(src_node);
+        host_input_value_dependencies_[&node_item].emplace_back(peer_out_anchor->GetIdx(), src_node_item);
         GELOGD("[%s] Dependent added from output of [%s:%d]",
                node_item.NodeName().c_str(),
                src_node_item->NodeName().c_str(),
@@ -1480,7 +1484,7 @@ Status HybridModelBuilder::IdentifyVariableOutputs(NodeItem &node_item) {
           src_node->GetName().c_str(),
           src_op_type.c_str());
 
-    if (src_op_type != CONSTANTOP && src_op_type != VARIABLE) {
+    if (src_op_type != CONSTANTOP && src_op_type != CONSTANT && src_op_type != VARIABLE) {
       continue;
     }
 
@@ -1489,6 +1493,9 @@ Status HybridModelBuilder::IdentifyVariableOutputs(NodeItem &node_item) {
     GELOGD("Got parent output index = %u", parent_index);
     GE_CHECK_LE(parent_index, INT32_MAX);
     node_item.ref_outputs.emplace(static_cast<int>(parent_index), src_node);
+    if (src_op_type == CONSTANTOP || src_op_type == CONSTANT) {
+      known_subgraph_constant_output_refs_[&node_item].emplace(parent_index, src_node);
+    }
   }
 
   // Data nodes marked with REF_VAR_SRC_VAR_NAME
@@ -1554,6 +1561,10 @@ Status HybridModelBuilder::InitModelMem() {
   }
 
   runtime_param_.var_base = hybrid_model_.var_mem_base_;
+  auto allocator = NpuMemoryAllocator::GetAllocator();
+  GE_CHECK_NOTNULL(allocator);
+  hybrid_model_.global_step_ = TensorBuffer::Create(allocator, sizeof(int64_t));
+  GE_CHECK_NOTNULL(hybrid_model_.global_step_);
   return SUCCESS;
 }
 
@@ -2113,5 +2124,88 @@ Status HybridModelBuilder::ParseDependentByParallelGroup() {
   }
   return SUCCESS;
 }
+
+Status HybridModelBuilder::OptimizeDependenciesForConstantInputs() {
+  std::map<NodePtr, std::set<uint32_t>> converted;
+  for (auto &it : host_input_value_dependencies_) {
+    auto node_item = it.first;
+    std::map<NodeItem *, int> ref_counts;
+    bool changed = false;
+    for (auto output_idx_and_node : it.second) {
+      auto output_idx = output_idx_and_node.first;
+      auto src_node_item = output_idx_and_node.second;
+      ++ref_counts[src_node_item];
+      NodePtr constant_node;
+      if (src_node_item->node_type == CONSTANT || src_node_item->node_type == CONSTANTOP) {
+        constant_node = src_node_item->node;
+        GELOGD("src node [%s] is a constant", src_node_item->NodeName().c_str());
+      } else {
+        auto iter = known_subgraph_constant_output_refs_.find(src_node_item);
+        if (iter != known_subgraph_constant_output_refs_.end()) {
+          constant_node = iter->second[output_idx];
+          if (constant_node != nullptr) {
+            GELOGD("Output[%u] of subgraph [%s] is a constant", output_idx, src_node_item->NodeName().c_str());
+          }
+        }
+      }
+
+      if (constant_node == nullptr) {
+        GELOGD("Output[%u] of [%s] is not a constant", output_idx, src_node_item->NodeName().c_str());
+        continue;
+      }
+
+      if (converted[constant_node].count(output_idx) == 0) {
+        GE_CHK_STATUS_RET(Convert2HostTensor(constant_node, src_node_item->node_id, output_idx),
+                          "[%s] Failed to convert constant to host tensor", constant_node->GetName().c_str());
+        converted[constant_node].emplace(output_idx);
+      }
+
+      src_node_item->to_const_output_id_list.erase(output_idx);
+      --ref_counts[src_node_item];
+      changed = true;
+    }
+
+    if (changed) {
+      std::vector<NodePtr> depends_to_keep;
+      for (auto &ref_count_it : ref_counts) {
+        if (ref_count_it.second == 0) {
+          GELOGD("[%s] no longer depends on [%s] for shape inference",
+                 node_item->NodeName().c_str(),
+                 ref_count_it.first->NodeName().c_str());
+        } else {
+          depends_to_keep.emplace_back(ref_count_it.first->node);
+        }
+      }
+      node_item->dependents_for_shape_inference.swap(depends_to_keep);
+    }
+  }
+
+  return SUCCESS;
+}
+Status HybridModelBuilder::Convert2HostTensor(const NodePtr &node, int node_id, uint32_t output_idx) {
+  auto tensor_value = hybrid_model_.GetTensor(node);
+  GE_CHECK_NOTNULL(tensor_value);
+  auto tensor_desc = node->GetOpDesc()->MutableOutputDesc(0);
+  GE_CHECK_NOTNULL(tensor_desc);
+  Tensor tensor(TensorAdapter::GeTensorDesc2TensorDesc(*tensor_desc));
+  int64_t tensor_size = -1;
+  GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorSizeInBytes(*tensor_desc, tensor_size),
+                          "[%s] Failed to get tensor size", node->GetName().c_str());
+  if (tensor_size > 0) {
+    auto copy_size = static_cast<size_t>(tensor_size);
+    GE_CHECK_GE(tensor_value->GetSize(), copy_size);
+    std::vector<uint8_t> buffer(copy_size);
+    GE_CHK_RT_RET(rtMemcpy(buffer.data(),
+                           copy_size,
+                           tensor_value->GetData(),
+                           copy_size,
+                           RT_MEMCPY_DEVICE_TO_HOST));
+    tensor.SetData(std::move(buffer));
+    GELOGD("[%s] Copy constant tensor to host successfully, size = %zu", node->GetName().c_str(), copy_size);
+  }
+
+  hybrid_model_.host_tensors_[node_id].emplace_back(output_idx, std::move(tensor));
+  return SUCCESS;
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h
index a59a282a..d383953b 100644
--- a/ge/hybrid/model/hybrid_model_builder.h
+++ b/ge/hybrid/model/hybrid_model_builder.h
@@ -91,6 +91,8 @@ class HybridModelBuilder {
   Status GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
   Status GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
   Status GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id, vector<domi::TaskDef> &task_def_list);
+  Status OptimizeDependenciesForConstantInputs();
+  Status Convert2HostTensor(const NodePtr &node, int node_id, uint32_t output_idx);
 
   const char* GetGraphName() const {
     return hybrid_model_.model_name_.c_str();
@@ -111,6 +113,12 @@ class HybridModelBuilder {
 
   RuntimeParam &runtime_param_;
   VarManager *var_manager_ = nullptr;
+
+  // map<known_node_item, map<output_idx, constant_node>>
+  std::map<NodeItem *, std::map<uint32_t, NodePtr>> known_subgraph_constant_output_refs_;
+
+  // map<node_item, vector<pair<output_idx, src_node_item>>>
+  std::map<NodeItem *, std::vector<std::pair<int, NodeItem *>>> host_input_value_dependencies_;
 };
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
index 1c46db20..dafa8201 100644
--- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
+++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
@@ -18,6 +18,7 @@
 #include "cce/aicpu_engine_struct.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/fmk_error_codes.h"
+#include "common/dump/dump_manager.h"
 #include "common/ge/ge_util.h"
 #include "graph/attr_value.h"
 #include "graph/debug/ge_attr_define.h"
@@ -110,15 +111,6 @@ Status KnownNodeTask::Init(TaskContext &context) {
     GELOGI("KnownNodeTask::Init mem base is %p, size %lu.", davinci_model_->GetRuntimeParam().mem_base,
            davinci_model_->GetRuntimeParam().mem_size);
   }
-  if (!load_flag_) {
-    auto dump_properties = context.GetDumpProperties();
-    if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) {
-      davinci_model_->SetDumpProperties(dump_properties);
-      void *global_step = context.GetExecutionContext()->global_step;
-      davinci_model_->SetKnownShapeGlobalStep(global_step);
-    }
-    load_flag_ = true;
-  }
   GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(),
                     davinci_model_->Id(), davinci_model_->SubModelId()),
                     "KnownNodeTask::Init destroy aicpu kernel failed.");
@@ -126,20 +118,35 @@ Status KnownNodeTask::Init(TaskContext &context) {
   return SUCCESS;
 }
 
-Status KnownNodeTask::InitDavinciModel() {
-  GELOGD("[Init][Model] start");
+Status KnownNodeTask::InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer) {
+  GELOGD("[Init][DavinciModel] start");
   davinci_model_->InitRuntimeParams();
   GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed");
   int32_t device_id = 0;
   GE_CHK_RT_RET(rtGetDevice(&device_id));
   davinci_model_->SetDeviceId(static_cast<uint32_t>(device_id));
-  GE_CHK_STATUS_RET(DoInitDavinciModel(), "[Init][Model] Failed to init davinci model.");
+
+  auto dump_properties = DumpManager::GetInstance().GetDumpProperties(model.GetSessionId());
+  if (dump_properties.IsDumpOpen() || dump_properties.IsOpDebugOpen()) {
+    davinci_model_->SetDumpProperties(dump_properties);
+    void *global_step = model.GetGlobalStep();
+    davinci_model_->SetKnownShapeGlobalStep(global_step);
+  }
+
+  void *weight = nullptr;
+  size_t weight_size = 0;
+  if (weight_buffer != nullptr) {
+    weight = weight_buffer->GetData();
+    weight_size = weight_buffer->GetSize();
+  }
+  GELOGD("Start to init davinci model, weight size = %zu", weight_size);
+  GE_CHK_STATUS_RET(DoInitDavinciModel(weight, weight_size), "[Init][Model] Failed to init davinci model.");
   GELOGD("[Init][Model] success");
   return SUCCESS;
 }
 
-Status KnownNodeTask::DoInitDavinciModel() {
-  return davinci_model_->Init();
+Status KnownNodeTask::DoInitDavinciModel(void *weight, size_t weight_size) {
+  return davinci_model_->Init(nullptr, 0, weight, weight_size);
 }
 
 Status KnownNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
@@ -165,6 +172,10 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
   const GeModelPtr ge_model = model.GetGeModel(node);
   GE_CHECK_NOTNULL(ge_model);
 
+  AscendString graph_name;
+  GE_CHK_GRAPH_STATUS_RET(ge_model->GetGraph().GetName(graph_name), "Failed to get graph name");
+  auto weight_buffer = model.GetModelWeight(graph_name.GetString());
+
   std::shared_ptr<DavinciModel> davinci_model = MakeShared<DavinciModel>(0, nullptr);
   GE_CHECK_NOTNULL(davinci_model);
 
@@ -180,7 +191,7 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
   auto known_node_task = MakeShared<KnownNodeTask>(davinci_model);
   GE_CHECK_NOTNULL(known_node_task);
 
-  GE_CHK_STATUS_RET_NOLOG(known_node_task->InitDavinciModel());
+  GE_CHK_STATUS_RET_NOLOG(known_node_task->InitDavinciModel(model, weight_buffer));
   GELOGI("[%s] KnownNodeExecutor::LoadTask success.", node->GetName().c_str());
   task = std::move(known_node_task);
   return SUCCESS;
diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h
index 5eed528a..26141b5a 100644
--- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h
+++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h
@@ -36,13 +36,12 @@ class KnownNodeTask : public NodeTask {
   Status UpdateArgs(TaskContext &context) override;
   Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
   Status Init(TaskContext &context) override;
-  Status InitDavinciModel();
+  Status InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer);
 
 protected:
-  virtual Status DoInitDavinciModel();
+  virtual Status DoInitDavinciModel(void *weight, size_t weight_size);
 
 private:
   std::shared_ptr<DavinciModel> davinci_model_ = nullptr;
-  bool load_flag_ = false;
 };
 
 class KnownNodeExecutor : public NodeExecutor {
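
Note (not part of the patch): below is a minimal, self-contained sketch of the data layout behind the new HybridModel::host_tensors_ member and the preload loop added to both ResetExecutionContext implementations above. It substitutes plain STL types for ge::Tensor and RuntimeInferenceContext so it compiles on its own; the stand-in struct and variable names are assumptions made purely for illustration. The point it demonstrates: a constant copied from device to host once at build time (Convert2HostTensor) can be re-registered on every execution reset with a cheap host-side clone, instead of a per-iteration device read during value-dependent shape inference.

// Illustrative stand-ins only; ge::Tensor and RuntimeInferenceContext are replaced by STL types.
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using HostTensor = std::vector<uint8_t>;  // host-side copy of a constant's device buffer
// node_id -> [(output_idx, tensor)], mirroring the declared type of HybridModel::host_tensors_
using HostTensorMap = std::map<int64_t, std::vector<std::pair<int, HostTensor>>>;

struct FakeRuntimeInferenceContext {
  std::map<std::pair<int64_t, int>, HostTensor> tensors;
  void SetTensor(int64_t node_id, int output_idx, HostTensor t) {
    tensors[{node_id, output_idx}] = std::move(t);
  }
};

int main() {
  // Filled once at build time by Convert2HostTensor (rtMemcpy DEVICE_TO_HOST).
  HostTensorMap host_tensors;
  host_tensors[42].emplace_back(0, HostTensor{1, 0, 0, 0});

  // Repeated on every ResetExecutionContext: only host memory is touched here,
  // which is what removes the per-iteration device dependency for shape inference.
  FakeRuntimeInferenceContext ctx;
  for (const auto &host_tensor : host_tensors) {
    for (const auto &idx_and_tensor : host_tensor.second) {
      ctx.SetTensor(host_tensor.first, idx_and_tensor.first, idx_and_tensor.second);  // Clone() in GE
    }
  }
  return 0;
}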