You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1063 lines
32 KiB
1063 lines
32 KiB
/**
|
|
* Copyright 2020 Huawei Technologies Co., Ltd
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_
|
|
#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_
|
|
|
|
#include <map>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "common/ge_types.h"
|
|
#include "common/helper/model_helper.h"
|
|
#include "common/helper/om_file_helper.h"
|
|
#include "common/opskernel/ge_task_info.h"
|
|
#include "common/properties_manager.h"
|
|
#include "common/types.h"
|
|
#include "framework/common/util.h"
|
|
#include "graph/debug/ge_attr_define.h"
|
|
#include "graph/load/model_manager/aipp_utils.h"
|
|
#include "graph/load/model_manager/data_dumper.h"
|
|
#include "graph/load/model_manager/data_inputer.h"
|
|
#include "graph/load/model_manager/model_utils.h"
|
|
#include "graph/load/model_manager/zero_copy_offset.h"
|
|
#include "graph/load/model_manager/zero_copy_task.h"
|
|
#include "graph/model.h"
|
|
#include "graph/node.h"
|
|
#include "graph/op_desc.h"
|
|
#include "graph/operator.h"
|
|
#include "graph/utils/attr_utils.h"
|
|
#include "graph/utils/tensor_utils.h"
|
|
#include "mmpa/mmpa_api.h"
|
|
#include "proto/task.pb.h"
|
|
#include "task_info/task_info.h"
|
|
#include "graph/common/local_context.h"
|
|
|
|
using std::mutex;
|
|
using std::thread;
|
|
using std::multimap;
|
|
|
|
namespace ge {
|
|
// op debug need 2048 bits buffer
|
|
const size_t kOpDebugMemorySize = 2048UL;
|
|
const size_t kDebugP2pSize = 8UL;
|
|
|
|
typedef enum tagModelProcStage {
|
|
MODEL_LOAD_START = 1,
|
|
MODEL_LOAD_END,
|
|
MODEL_PRE_PROC_START,
|
|
MODEL_PRE_PROC_END,
|
|
MODEL_INFER_START,
|
|
MODEL_INFER_END,
|
|
MODEL_AFTER_PROC_START,
|
|
MODEL_AFTER_PROC_END,
|
|
MODEL_PROC_INVALID,
|
|
} ModelProcStage;
|
|
|
|
struct timeInfo {
|
|
uint32_t modelId;
|
|
int64_t processBeginTime;
|
|
int64_t processEndTime;
|
|
int64_t inferenceBeginTime;
|
|
int64_t inferenceEndTime;
|
|
int64_t dumpBeginTime;
|
|
int64_t dumpEndTime;
|
|
};
|
|
|
|
// For super kernel
|
|
struct SuperKernelTaskInfo {
|
|
uint32_t last_block_dim;
|
|
uint32_t last_args_size;
|
|
uint32_t last_task_id;
|
|
uint32_t last_stream_id;
|
|
void *last_stream;
|
|
void *last_sm_desc;
|
|
vector<void *> kernel_list;
|
|
vector<void *> arg_list;
|
|
vector<uint32_t> dump_flag_list;
|
|
vector<OpDescPtr> op_desc_list;
|
|
vector<uintptr_t> dump_args_list;
|
|
uint32_t last_dump_flag;
|
|
int64_t last_group_key;
|
|
uintptr_t last_dump_args;
|
|
OpDescPtr last_op;
|
|
};
|
|
|
|
struct TaskMemInfo {
|
|
int64_t input_size{0};
|
|
int64_t output_size{0};
|
|
int64_t weight_size{0};
|
|
int64_t workspace_size{0};
|
|
int64_t total_size{0};
|
|
};
|
|
|
|
struct ProfileInfo {
|
|
FusionOpInfo fusion_info;
|
|
TaskMemInfo memory_info;
|
|
uint32_t task_count{0};
|
|
};
|
|
|
|
enum ExecuteMode {
|
|
INITIALIZATION,
|
|
SYNCHRONIZATION,
|
|
ASYNCHRONIZATION,
|
|
};
|
|
|
|
// comments
|
|
class DavinciModel {
|
|
public:
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief DavinciModel constructor
|
|
/// @author
|
|
///
|
|
DavinciModel(int32_t priority, const shared_ptr<ModelListener> &listener);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief DavinciModel desctructor, free Parse and Init resources
|
|
/// @author
|
|
///
|
|
~DavinciModel();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief apply model to model_def_
|
|
///
|
|
Status Assign(const GeModelPtr &ge_model);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief DavinciModel initialization, including Stream, ccHandle, Event, DataInputer, etc
|
|
/// @return execute result
|
|
/// @author
|
|
///
|
|
Status Init(void *dev_ptr = nullptr, size_t memsize = 0, void *weight_ptr = nullptr, size_t weightsize = 0);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief ACL case, Load task list with queue.
|
|
/// @param [in] input_que_ids: input queue ids from user, nums equal Data Op.
|
|
/// @param [in] output_que_ids: input queue ids from user, nums equal NetOutput Op.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status SetQueIds(const vector<uint32_t> &input_queue_ids, const vector<uint32_t> &output_queue_ids);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get DataInputer
|
|
/// @return model ID
|
|
///
|
|
uint32_t Id() const { return model_id_; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get DataInputer
|
|
/// @return model ID
|
|
///
|
|
void SetId(uint32_t model_id) { model_id_ = model_id; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get SubModelId
|
|
/// @return sub model ID
|
|
///
|
|
uint32_t SubModelId() const { return sub_model_id_; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get SubModelId
|
|
/// @return sub model ID
|
|
///
|
|
void SetSubModelId(uint32_t sub_model_id) { sub_model_id_ = sub_model_id; }
|
|
|
|
static void *Run(DavinciModel *model_pointer);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief NnExecute
|
|
/// @param [in] stream execute stream
|
|
/// @param [in] async_mode is asynchronize mode.
|
|
/// @param [in] input_data model input data
|
|
/// @param [out] output_data model output data
|
|
///
|
|
Status NnExecute(rtStream_t stream, bool async_mode, const InputData &input_data, OutputData &output_data);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief lock mutex run flag
|
|
/// @author
|
|
///
|
|
void LockRunFlg() { mux_run_flg_.lock(); }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief unlock mutex run flag
|
|
/// @author
|
|
///
|
|
void UnlockRunFlg() { mux_run_flg_.unlock(); }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief get DataInputer
|
|
/// @return DataInputer pointer
|
|
///
|
|
DataInputer *const GetDataInputer() const { return data_inputer_; }
|
|
|
|
// get Stream number
|
|
uint32_t StreamNum() const { return runtime_param_.stream_num; }
|
|
|
|
// get Event number
|
|
uint32_t EventNum() const { return runtime_param_.event_num; }
|
|
|
|
// get Lable number
|
|
uint32_t LabelNum() const { return runtime_param_.label_num; }
|
|
|
|
// get batch number
|
|
uint32_t BatchNum() const { return runtime_param_.batch_num; }
|
|
|
|
// get session id
|
|
uint64_t SessionId() const { return runtime_param_.session_id; }
|
|
|
|
// get model priority
|
|
int32_t Priority() const { return priority_; }
|
|
|
|
// get total mem size
|
|
size_t TotalMemSize() const { return runtime_param_.mem_size; }
|
|
|
|
const map<uint32_t, MemInfo> &P2PMemInfos() const { return runtime_param_.memory_infos; }
|
|
|
|
// model name
|
|
string Name() const { return name_; }
|
|
|
|
// om_name
|
|
string OmName() const { return om_name_; }
|
|
|
|
// version
|
|
uint32_t Version() const { return version_; }
|
|
|
|
// get total weights mem size
|
|
size_t TotalWeightsMemSize() const { return runtime_param_.weight_size; }
|
|
|
|
size_t TotalVarMemSize() const { return runtime_param_.var_size; }
|
|
|
|
// get base memory address
|
|
uint8_t *MemBase() { return mem_base_; }
|
|
|
|
// get weight base memory address
|
|
uint8_t *WeightsMemBase() { return weights_mem_base_; }
|
|
|
|
uint8_t *VarMemBase() { return var_mem_base_; }
|
|
|
|
// get Event list
|
|
const vector<rtEvent_t> &GetEventList() const { return event_list_; }
|
|
|
|
const vector<rtStream_t> &GetStreamList() const { return stream_list_; }
|
|
|
|
const vector<rtLabel_t> &GetLabelList() const { return label_list_; }
|
|
|
|
Status DestroyThread();
|
|
|
|
// get Op
|
|
OpDescPtr GetOpByIndex(uint32_t index) const {
|
|
if (op_list_.find(index) == op_list_.end()) {
|
|
return nullptr;
|
|
}
|
|
return op_list_.at(index);
|
|
}
|
|
|
|
void *GetGlobalStep() const { return global_step_addr_; }
|
|
|
|
// get task info for profiling
|
|
const vector<TaskDescInfo> &GetTaskDescInfo() const { return task_desc_info_; }
|
|
|
|
// get updated task info list
|
|
vector<TaskInfoPtr> GetTaskList() { return task_list_; }
|
|
|
|
// Modified from KernelTaskInfo.
|
|
SuperKernelTaskInfo &GetSuperKernelTaskInfo() { return skt_info_; }
|
|
|
|
rtModel_t GetRtModelHandle() const { return rt_model_handle_; }
|
|
|
|
rtStream_t GetRtModelStream() const { return rt_model_stream_; }
|
|
|
|
uint64_t GetRtBaseAddr() const { return runtime_param_.logic_mem_base; }
|
|
|
|
uint64_t GetRtWeightAddr() const { return runtime_param_.logic_weight_base; }
|
|
|
|
uint64_t GetRtVarAddr() const { return runtime_param_.logic_var_base; }
|
|
|
|
uint32_t GetFlowctrlIndex(uint32_t op_index);
|
|
|
|
void PushHcclStream(rtStream_t value);
|
|
|
|
bool IsBroadCastOpData(const NodePtr &var_node);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief For TVM Op, avoid Addr Reuse.
|
|
/// @return void*
|
|
///
|
|
const char *GetRegisterStub(const string &tvm_binfile_key, const string &session_graph_model_id = "");
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief get model input and output desc info
|
|
/// @param [out] input_shape model input size
|
|
/// @param [out] output_shape model output size
|
|
/// @return execute result
|
|
///
|
|
Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc);
|
|
|
|
Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc,
|
|
vector<uint32_t> &input_formats, vector<uint32_t> &output_formats, bool by_dims);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get dynamic batch_info
|
|
/// @param [out] batch_info
|
|
/// @param [out] dynamic_type
|
|
/// @return execute result
|
|
///
|
|
Status GetDynamicBatchInfo(vector<vector<int64_t>> &batch_info, int32_t &dynamic_type) const;
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get combined dynamic dims info
|
|
/// @param [out] batch_info
|
|
/// @return None
|
|
///
|
|
void GetCombinedDynamicDims(vector<vector<int64_t>> &batch_info) const;
|
|
|
|
void GetUserDesignateShapeOrder(vector<string> &user_input_shape_order) const;
|
|
|
|
void GetCurShape(vector<int64_t> &batch_info, int32_t &dynamic_type) const;
|
|
|
|
void GetModelAttr(vector<string> &dynamic_output_shape_info) const;
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get AIPP input info
|
|
/// @param [in] index
|
|
/// @param [out] aipp_info
|
|
/// @return execute result
|
|
///
|
|
Status GetAippInfo(uint32_t index, AippConfigInfo &aipp_info) const;
|
|
|
|
Status GetAippType(uint32_t index, InputAippType &type, size_t &aipp_index) const;
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get model_id.
|
|
/// @return model_id
|
|
///
|
|
uint32_t GetModelId() const { return model_id_; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief get unique identification for op when load two or more models
|
|
/// @param [in] op_desc : current op.
|
|
/// @param [in] string identification: unique identification for current op.
|
|
/// @return None
|
|
///
|
|
void GetUniqueId(const OpDescPtr &op_desc, string &unique_identification);
|
|
|
|
Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data);
|
|
|
|
Status ReturnNoOutput(uint32_t data_id);
|
|
|
|
Status ModelRunStart();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief stop run model
|
|
/// @return Status
|
|
///
|
|
Status ModelRunStop();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief model run flag
|
|
/// @return Status
|
|
///
|
|
bool RunFlag() const { return run_flg_; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Set Session Id
|
|
/// @return void
|
|
///
|
|
void SetSessionId(uint64_t session_id) { session_id_ = session_id; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get Session Id
|
|
/// @return sessionID
|
|
///
|
|
uint64_t GetSessionId() const { return session_id_; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief SetDeviceId
|
|
/// @return void
|
|
///
|
|
void SetDeviceId(uint32_t device_id) { device_id_ = device_id; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Get device Id
|
|
/// @return device id
|
|
///
|
|
uint32_t GetDeviceId() const { return device_id_; }
|
|
|
|
bool NeedDestroyAicpuKernel() const { return need_destroy_aicpu_kernel_; }
|
|
|
|
Status UpdateSessionId(uint64_t session_id);
|
|
|
|
const RuntimeParam &GetRuntimeParam() { return runtime_param_; }
|
|
|
|
int32_t GetDataInputTid() const { return dataInputTid; }
|
|
void SetDataInputTid(int32_t data_input_tid) { dataInputTid = data_input_tid; }
|
|
|
|
void DisableZeroCopy(const void *addr);
|
|
|
|
bool GetOpDugReg() const { return is_op_debug_reg_; }
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Save outside address of Data or NetOutput used info for ZeroCopy.
|
|
/// @param [in] const OpDescPtr &op_desc: current op desc
|
|
/// @param [in] const vector<void *> &outside_addrs: address of task
|
|
/// @param [in] const void *args_offset: arguments address save the address.
|
|
/// @return None.
|
|
///
|
|
void SetZeroCopyAddr(const OpDescPtr &op_desc, const vector<void *> &outside_addrs, const void *info, void *args,
|
|
size_t size, size_t offset);
|
|
|
|
void SetDynamicSize(const vector<uint64_t> &batch_num, int32_t dynamic_type);
|
|
|
|
bool GetL1FusionEnableOption() { return is_l1_fusion_enable_; }
|
|
|
|
void SetProfileTime(ModelProcStage stage, int64_t endTime = 0);
|
|
|
|
int64_t GetLoadBeginTime() { return load_begin_time_; }
|
|
|
|
int64_t GetLoadEndTime() { return load_end_time_; }
|
|
|
|
Status ReportProfilingData();
|
|
|
|
void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) {
|
|
data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id);
|
|
}
|
|
|
|
void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr<OpDesc> &op_desc, uintptr_t args) {
|
|
data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args);
|
|
}
|
|
|
|
void DumperShrink() {
|
|
data_dumper_.DumpShrink();
|
|
}
|
|
|
|
void SetEndGraphId(uint32_t task_id, uint32_t stream_id);
|
|
DavinciModel &operator=(const DavinciModel &model) = delete;
|
|
|
|
DavinciModel(const DavinciModel &model) = delete;
|
|
|
|
const map<int64_t, vector<rtStream_t>> &GetHcclFolowStream() {
|
|
return main_follow_stream_mapping_;
|
|
}
|
|
void SaveHcclFollowStream(int64_t main_stream_id, rtStream_t stream);
|
|
|
|
void InitRuntimeParams();
|
|
Status InitVariableMem();
|
|
|
|
void UpdateMemBase(uint8_t *mem_base) {
|
|
runtime_param_.mem_base = mem_base;
|
|
mem_base_ = mem_base;
|
|
}
|
|
void SetTotalArgsSize(uint32_t args_size) { total_args_size_ += args_size; }
|
|
uint32_t GetTotalArgsSize() { return total_args_size_; }
|
|
void *GetCurrentArgsAddr(uint32_t offset) {
|
|
void *cur_args = static_cast<char *>(args_) + offset;
|
|
return cur_args;
|
|
}
|
|
void SetTotalIOAddrs(const vector<void *> &io_addrs);
|
|
void SetHybridArgsSize(uint32_t args_size) { total_hybrid_args_size_ += args_size; }
|
|
uint32_t GetHybridArgsSize() {
|
|
return total_hybrid_args_size_;
|
|
}
|
|
void *GetCurrentHybridArgsAddr(uint32_t offset) {
|
|
void *cur_args = static_cast<char *>(hybrid_addrs_) + offset;
|
|
return cur_args;
|
|
}
|
|
void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size);
|
|
int64_t GetFixedAddrsSize(string tensor_name);
|
|
void *GetCurrentFixedAddr(int64_t offset) const {
|
|
void *cur_addr = static_cast<char *>(fixed_addrs_) + offset;
|
|
return cur_addr;
|
|
}
|
|
|
|
uint32_t GetFixedAddrOutputIndex(string tensor_name) {
|
|
if (tensor_name_to_peer_output_index_.find(tensor_name) != tensor_name_to_peer_output_index_.end()) {
|
|
return tensor_name_to_peer_output_index_[tensor_name];
|
|
}
|
|
return UINT32_MAX;
|
|
}
|
|
void SetKnownNode(bool known_node) { known_node_ = known_node; }
|
|
bool IsKnownNode() { return known_node_; }
|
|
Status MallocKnownArgs();
|
|
Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
|
|
Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
|
|
Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs, bool update_args = true);
|
|
void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }
|
|
|
|
Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const;
|
|
Status GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims,
|
|
vector<InputOutputDims> &output_dims) const;
|
|
|
|
// om file name
|
|
void SetOmName(string om_name) { om_name_ = om_name; }
|
|
|
|
void SetDumpProperties(const DumpProperties &dump_properties) { data_dumper_.SetDumpProperties(dump_properties); }
|
|
const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); }
|
|
|
|
bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
|
|
return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info);
|
|
}
|
|
|
|
private:
|
|
// memory address of weights
|
|
uint8_t *weights_mem_base_;
|
|
uint8_t *var_mem_base_;
|
|
// memory address of model
|
|
uintptr_t fixed_mem_base_; // Initial of mem_base_, keep forever.
|
|
uint8_t *mem_base_;
|
|
uint8_t *p2p_mem_base_;
|
|
bool is_inner_mem_base_;
|
|
bool is_inner_weight_base_;
|
|
bool is_inner_p2p_mem_base_;
|
|
// input data manager
|
|
DataInputer *data_inputer_;
|
|
int64_t load_begin_time_;
|
|
int64_t load_end_time_;
|
|
struct timeInfo time_info_;
|
|
int32_t dataInputTid;
|
|
|
|
void *GetRunAddress(void *addr) const;
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Copy Check input size and model op size.
|
|
/// @param [in] const int64_t &input_size: input size.
|
|
/// @param [in] const int64_t &op_size: model op size.
|
|
/// @param [in] is_dynamic: dynamic batch input flag.
|
|
/// @return true if success
|
|
///
|
|
bool CheckInputAndModelSize(const int64_t &input_size, const int64_t &op_size, bool is_dynamic);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Set copy only for No task feed NetOutput address.
|
|
/// @return None.
|
|
///
|
|
void SetCopyOnlyOutput();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Copy Input/Output to model for direct use.
|
|
/// @param [in] const InputData &input_data: user input data info.
|
|
/// @param [in/out] OutputData &output_data: user output data info.
|
|
/// @param [in] bool is_dynamic: whether is dynamic input, true: is dynamic input; false: not is dynamic input
|
|
/// @return SUCCESS handle successfully / others handle failed
|
|
///
|
|
Status CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Copy Data addr to model for direct use.
|
|
/// @param [in] data_info: model memory addr/size map { data_index, { tensor_size, tensor_addr } }.
|
|
/// @param [in] is_input: input data or output data
|
|
/// @param [in] blobs: user input/output data list.
|
|
/// @param [in] is_dynamic: whether is dynamic input, true: is dynamic input; false: not is dynamic input
|
|
/// @param [in] batch_label: batch label for multi-batch scenes
|
|
/// @return SUCCESS handle successfully / others handle failed
|
|
///
|
|
Status UpdateIoTaskArgs(const map<uint32_t, ZeroCopyOffset> &data_info, bool is_input,
|
|
const vector<DataBuffer> &blobs, bool is_dynamic, const string &batch_label);
|
|
|
|
Status CopyInputData(const InputData &input_data, bool device_data = false);
|
|
|
|
Status CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind);
|
|
|
|
Status SyncVarData();
|
|
|
|
Status InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weight_size);
|
|
Status InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size);
|
|
|
|
void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, ShapeDescription &shape1, ShapeDescription &shape2);
|
|
|
|
void SetInputDimsInfo(const vector<int64_t> &input_dims, Format &format, ShapeDescription &shape_info);
|
|
|
|
Status GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<uint32_t> &input_formats, bool by_dims) const;
|
|
Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, vector<uint32_t> &output_formats) const;
|
|
|
|
Status InitTaskInfo(domi::ModelTaskDef &modelTaskInfo);
|
|
|
|
void UnbindHcomStream();
|
|
|
|
Status DistributeTask();
|
|
|
|
void SaveProfilingTaskDescInfo(const OpDescPtr &op, const TaskInfoPtr &task,
|
|
const domi::TaskDef &task_def, size_t task_index);
|
|
|
|
uint8_t *MallocFeatureMapMem(size_t data_size);
|
|
|
|
uint8_t *MallocWeightsMem(size_t weights_size);
|
|
|
|
uint8_t *MallocP2PMem(size_t p2p_data_size);
|
|
|
|
void FreeFeatureMapMem();
|
|
|
|
void FreeWeightsMem();
|
|
|
|
void FreeP2PMem();
|
|
|
|
void ReleaseTask();
|
|
|
|
void ClearTaskAddrs();
|
|
|
|
void UnbindTaskSinkStream();
|
|
|
|
bool IsAicpuKernelConnectSpecifiedLayer();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Reduce memory usage after task sink.
|
|
/// @return: void
|
|
///
|
|
void Shrink();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Travel all nodes and do some init.
|
|
/// @param [in] compute_graph: ComputeGraph to load.
|
|
/// @return Status
|
|
///
|
|
Status InitNodes(const ComputeGraphPtr &compute_graph);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Data Op Initialize.
|
|
/// @param [in] ComputeGraphPtr: root graph of the model.
|
|
/// @param [in] NodePtr: Data Op.
|
|
/// @param [in/out] data_op_index: index of courrent count.
|
|
/// @param [in/out] data_by_index: Data ordered by index.
|
|
/// @return Status
|
|
///
|
|
Status InitDataOp(const ComputeGraphPtr &graph, const NodePtr &node, uint32_t &data_op_index,
|
|
map<uint32_t, OpDescPtr> &data_by_index);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Sort Data op list by index.
|
|
/// @param [in] data_by_index: map of Data Op.
|
|
/// @param [in] output_op_list: list of NetOutput op.
|
|
/// @return Status
|
|
///
|
|
Status GenInputOutputInfo(const map<uint32_t, OpDescPtr> &data_by_index, const vector<OpDescPtr> &output_op_list);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief NetOutput Op Initialize.
|
|
/// @param [in] ComputeGraphPtr: root graph of the model.
|
|
/// @param [in] NodePtr: NetOutput Op.
|
|
/// @param [in/out] vector<OpDescPtr>: All NetOutput node in model.
|
|
/// @return Status
|
|
///
|
|
Status InitNetOutput(const ComputeGraphPtr &graph, const NodePtr &node, vector<OpDescPtr> &output_op_list);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Constant Op Init.
|
|
/// @return Status
|
|
///
|
|
Status InitConstant(const OpDescPtr &op_desc);
|
|
|
|
Status InitVariable(const OpDescPtr &op_desc, map<string, OpDescPtr> &variable_by_name);
|
|
|
|
/// @ingroup ge
|
|
/// @brief LabelSet Op Initialize.
|
|
/// @param [in] op_desc: LabelSet Op descriptor.
|
|
/// @return Status
|
|
Status InitLabelSet(const OpDescPtr &op_desc);
|
|
|
|
Status InitStreamSwitch(const OpDescPtr &op_desc);
|
|
|
|
Status InitStreamActive(const OpDescPtr &op_desc);
|
|
|
|
Status InitStreamSwitchN(const OpDescPtr &op_desc);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Case Op Init.
|
|
/// @return Status
|
|
///
|
|
Status InitCase(const OpDescPtr &op_desc);
|
|
|
|
Status SetDynamicBatchInfo(const OpDescPtr &op_desc, uint32_t batch_num);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief TVM Op Init.
|
|
/// @return Status
|
|
///
|
|
Status InitTbeHandle(const OpDescPtr &op_desc);
|
|
|
|
void StoreTbeHandle(const string &handle_key);
|
|
void CleanTbeHandle();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Make active stream list and bind to model.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status BindModelStream();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief Init model stream for NN model.
|
|
/// @return Status
|
|
///
|
|
Status InitModelStream(rtStream_t stream);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief ACL, Load task list with queue entrance.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status LoadWithQueue();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief ACL, Bind Data Op addr to input queue.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status BindInputQueue();
|
|
|
|
Status CpuTaskModelZeroCopy(vector<uintptr_t> &mbuf_list, map<const void *, ZeroCopyOffset> &outside_addrs);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief ACL, Bind NetOutput Op addr to output queue.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status BindOutputQueue();
|
|
Status CpuModelPrepareOutput(uintptr_t addr, uint32_t size);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief definiteness queue schedule, bind input queue to task.
|
|
/// @param [in] queue_id: input queue id from user.
|
|
/// @param [in] addr: Data Op output tensor address.
|
|
/// @param [in] size: Data Op output tensor size.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status CpuModelDequeue(uint32_t queue_id);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief definiteness queue schedule, bind output queue to task.
|
|
/// @param [in] queue_id: output queue id from user.
|
|
/// @param [in] addr: NetOutput Op input tensor address.
|
|
/// @param [in] size: NetOutput Op input tensor size.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t size);
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief definiteness queue schedule, active original model stream.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status CpuActiveStream();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief definiteness queue schedule, wait for end graph.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status CpuWaitEndGraph();
|
|
|
|
Status BindEnqueue();
|
|
Status CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf);
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief definiteness queue schedule, repeat run model.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status CpuModelRepeat();
|
|
|
|
Status InitEntryTask();
|
|
Status AddHeadStream();
|
|
|
|
///
|
|
/// @ingroup ge
|
|
/// @brief set ts device.
|
|
/// @return: 0 for success / others for fail
|
|
///
|
|
Status SetTSDevice();
|
|
|
|
Status OpDebugRegister();
|
|
|
|
void OpDebugUnRegister();
|
|
|
|
void CheckHasHcomOp(const ComputeGraphPtr &graph);
|
|
|
|
Status DoTaskSink();
|
|
|
|
void CreateOutput(uint32_t index, const OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result);
|
|
|
|
Status TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id);
|
|
|
|
// get desc info of graph for profiling
|
|
Status GetComputeGraphInfo(vector<ComputeGraphDescInfo> &graph_desc_info);
|
|
|
|
void SetDataDumperArgs(const ComputeGraphPtr &graph, const map<string, OpDescPtr> &variable_by_name);
|
|
|
|
Status InitL1DataDumperArgs();
|
|
|
|
Status InitModelProfile();
|
|
Status SinkModelProfile();
|
|
|
|
Status SinkTimeProfile(const InputData ¤t_data);
|
|
|
|
Status InitOutputTensorInfo(const OpDescPtr &op_desc);
|
|
Status GenOutputTensorInfo(OutputData *output_data, vector<OutputTensorInfo> &outputs);
|
|
|
|
Status InitInputDescInfo(const OpDescPtr &op_desc);
|
|
Status InitOutputDescInfo(const OpDescPtr &op_desc, const vector<string> &out_node_name);
|
|
|
|
Status InitOrigInputInfo(uint32_t index, const OpDescPtr &op_desc);
|
|
Status InitAippInfo(uint32_t index, const OpDescPtr &op_desc);
|
|
Status InitAippType(uint32_t index, const OpDescPtr &op_desc, const map<uint32_t, OpDescPtr> &data_list);
|
|
Status InitAippInputOutputDims(uint32_t index, const OpDescPtr &op_desc);
|
|
|
|
void ParseAIPPInfo(string in_out_info, InputOutputDims &dims_info);
|
|
void SetLabelForDynamic(const NodePtr &node);
|
|
|
|
void ParseDynamicOutShape(const vector<string> &str_info, vector<vector<int64_t>> &vec_info);
|
|
bool IsGetNextSinkDynamic(const OpDescPtr &op_desc);
|
|
|
|
Status InitRealSizeAndShapeInfo(const ComputeGraphPtr &compute_graph, const NodePtr &node);
|
|
void GetAllGearsInfo(const NodePtr &node);
|
|
Status GetGetDynamicDimsNodeInfo(const NodePtr &node);
|
|
Status GetGearAndRealOutSizeInfo(const ComputeGraphPtr &graph, const NodePtr &node);
|
|
Status GetRealOutputSizeOfCase(const ComputeGraphPtr &graph, size_t input_index, const NodePtr &case_node);
|
|
Status GetGearAndRealOutShapeInfo(const ComputeGraphPtr &graph, const NodePtr &node);
|
|
|
|
bool is_weight_mem_has_inited_;
|
|
bool is_feature_map_mem_has_inited_;
|
|
|
|
uint32_t model_id_;
|
|
uint32_t runtime_model_id_;
|
|
uint32_t sub_model_id_ = 0;
|
|
string name_;
|
|
|
|
// used for inference data dump
|
|
string om_name_;
|
|
|
|
uint32_t version_;
|
|
GeModelPtr ge_model_; // release after DavinciModel::Init
|
|
|
|
bool need_destroy_aicpu_kernel_{false};
|
|
|
|
map<uint32_t, OpDescPtr> op_list_; // release after DavinciModel::Init
|
|
|
|
map<string, GeTensorDesc> broadcast_variable_;
|
|
void *global_step_addr_{nullptr};
|
|
uint64_t global_step_size_{0};
|
|
|
|
map<uint32_t, ZeroCopyOffset> new_input_data_info_;
|
|
map<uint32_t, ZeroCopyOffset> new_output_data_info_;
|
|
map<const void *, ZeroCopyOffset> new_input_outside_addrs_;
|
|
map<const void *, ZeroCopyOffset> new_output_outside_addrs_;
|
|
|
|
set<const void *> real_virtual_addrs_;
|
|
|
|
// output op: save cce op actual needed memory size
|
|
vector<int64_t> output_memory_size_list_;
|
|
|
|
thread thread_id_;
|
|
|
|
shared_ptr<ModelListener> listener_;
|
|
|
|
bool run_flg_;
|
|
|
|
mutex mux_run_flg_;
|
|
|
|
int32_t priority_;
|
|
|
|
vector<rtStream_t> stream_list_;
|
|
|
|
mutex all_hccl_stream_list_mutex_;
|
|
vector<rtStream_t> all_hccl_stream_list_;
|
|
|
|
// for reuse hccl_follow_stream
|
|
mutex capacity_of_stream_mutex_;
|
|
map<int64_t, vector<rtStream_t>> main_follow_stream_mapping_;
|
|
|
|
vector<rtEvent_t> event_list_;
|
|
|
|
vector<rtLabel_t> label_list_;
|
|
set<uint32_t> label_id_indication_;
|
|
|
|
mutex outside_addrs_mutex_;
|
|
vector<ZeroCopyTask> zero_copy_tasks_; // Task used Data or NetOutput addr.
|
|
set<const void *> copy_only_addrs_; // Address need copy to original place.
|
|
|
|
vector<TaskInfoPtr> task_list_;
|
|
// rt_moodel_handle
|
|
rtModel_t rt_model_handle_;
|
|
|
|
rtStream_t rt_model_stream_;
|
|
|
|
bool is_inner_model_stream_;
|
|
|
|
bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_.
|
|
ExecuteMode last_execute_mode_;
|
|
|
|
bool is_stream_list_bind_{false};
|
|
bool is_pure_head_stream_{false};
|
|
rtStream_t rt_head_stream_{nullptr};
|
|
rtStream_t rt_entry_stream_{nullptr};
|
|
rtAicpuDeployType_t deploy_type_{AICPU_DEPLOY_RESERVED};
|
|
|
|
// ACL queue schedule, save queue ids for Init.
|
|
vector<TaskInfoPtr> cpu_task_list_;
|
|
vector<uint32_t> input_queue_ids_; // input queue ids created by caller.
|
|
vector<uint32_t> output_queue_ids_; // output queue ids created by caller.
|
|
vector<uintptr_t> input_mbuf_list_; // input mbuf created by dequeue task.
|
|
vector<uintptr_t> output_mbuf_list_; // output mbuf created by dequeue task.
|
|
|
|
uint64_t session_id_;
|
|
|
|
uint32_t device_id_;
|
|
|
|
mutex flowctrl_op_index_internal_map_mutex_;
|
|
map<uint32_t, uint32_t> flowctrl_op_index_internal_map_;
|
|
|
|
vector<rtStream_t> active_stream_list_;
|
|
set<uint32_t> active_stream_indication_;
|
|
|
|
set<uint32_t> hcom_streams_;
|
|
RuntimeParam runtime_param_;
|
|
|
|
static mutex tvm_bin_mutex_;
|
|
set<string> tvm_bin_kernel_;
|
|
|
|
map<string, uint32_t> used_tbe_handle_map_;
|
|
|
|
// for profiling task and graph info
|
|
vector<TaskDescInfo> task_desc_info_;
|
|
|
|
std::map<std::string, std::pair<uint32_t, uint32_t>> profiler_report_op_info_;
|
|
|
|
int64_t maxDumpOpNum_;
|
|
// for data dump
|
|
DataDumper data_dumper_;
|
|
uint64_t iterator_count_;
|
|
bool is_l1_fusion_enable_;
|
|
map<OpDescPtr, void *> saved_task_addrs_; // release after DavinciModel::Init
|
|
void *l1_fusion_addr_ = nullptr;
|
|
|
|
bool known_node_ = false;
|
|
uint32_t total_args_size_ = 0;
|
|
void *args_ = nullptr;
|
|
void *args_host_ = nullptr;
|
|
void *fixed_addrs_ = nullptr;
|
|
void *hybrid_addrs_ = nullptr;
|
|
uint32_t total_hybrid_args_size_ = 0;
|
|
int64_t total_fixed_addr_size_ = 0;
|
|
map<const void *, void *> known_input_data_info_;
|
|
map<const void *, void *> known_output_data_info_;
|
|
vector<void *> total_io_addrs_;
|
|
vector<void *> orig_total_io_addrs_;
|
|
bool base_addr_not_changed_ = false;
|
|
|
|
vector<vector<int64_t>> batch_info_;
|
|
vector<vector<int64_t>> combined_batch_info_;
|
|
vector<string> user_designate_shape_order_;
|
|
int32_t dynamic_type_ = 0;
|
|
bool is_dynamic_ = false;
|
|
|
|
vector<uint64_t> batch_size_;
|
|
// key: input tensor name, generally rts op;
|
|
// value: the fixed addr of input anchor, same as the peer output anchor addr of the peer op
|
|
map<string, int64_t> tensor_name_to_fixed_addr_size_;
|
|
|
|
// key: input tensor name, generally rts op; value: the peer output anchor of the peer op
|
|
map<string, int64_t> tensor_name_to_peer_output_index_;
|
|
// if model is first execute
|
|
bool is_first_execute_;
|
|
// for op debug
|
|
mutex debug_reg_mutex_;
|
|
bool is_op_debug_reg_ = false;
|
|
void *op_debug_addr_ = nullptr;
|
|
void *p2p_debug_addr_ = nullptr;
|
|
bool is_online_infer_dynamic_ = false;
|
|
bool is_getnext_sink_dynamic_ = false;
|
|
vector<int32_t> cur_dynamic_dims_;
|
|
void *netoutput_last_input_addr_ = nullptr;
|
|
int64_t netoutput_last_input_size_ = 0;
|
|
size_t shape_of_cur_dynamic_dims_ = 0;
|
|
// key: input_index: input is merge node; value: each gear info and each output size
|
|
map<size_t, map<vector<int32_t>, int64_t>> merge_nodes_gear_and_real_out_size_info_;
|
|
// key: input_index: input is merge node; value: each gear info and each output shape
|
|
map<size_t, map<vector<int32_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
|
|
vector<vector<int32_t>> all_gears_info_;
|
|
|
|
multimap<uint32_t, uint32_t> op_id_map_;
|
|
vector<ProfileInfo> profile_list_;
|
|
|
|
// For super kernel.
|
|
SuperKernelTaskInfo skt_info_;
|
|
|
|
bool is_dynamic_aipp_ = false;
|
|
vector<string> dynamic_output_shape_info_;
|
|
|
|
vector<vector<void *>> input_addrs_list_;
|
|
vector<vector<void *>> output_addrs_list_;
|
|
|
|
vector<int64_t> output_buffer_size_;
|
|
vector<GeShape> output_shape_info_;
|
|
|
|
map<uint32_t, OriginInputInfo> orig_input_info_;
|
|
map<uint32_t, AippConfigInfo> aipp_info_list_;
|
|
map<uint32_t, pair<InputAippType, size_t>> aipp_type_list_;
|
|
map<uint32_t, pair<vector<InputOutputDims>, vector<InputOutputDims>>> aipp_dims_info_;
|
|
|
|
vector<InputOutputDescInfo> input_descs_;
|
|
vector<InputOutputDescInfo> input_descs_dims_;
|
|
vector<uint32_t> input_formats_;
|
|
vector<InputOutputDescInfo> output_descs_;
|
|
vector<uint32_t> output_formats_;
|
|
};
|
|
} // namespace ge
|
|
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_
|