pull/1402/head
yangwei 4 years ago
parent f19cd2fca9
commit 7ec6e4fe61

@ -2875,23 +2875,16 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector<void *> &inputs, const vec
GELOGI("DavinciModel::UpdateKnownNodeArgs in"); GELOGI("DavinciModel::UpdateKnownNodeArgs in");
GE_CHK_STATUS_RET(CreateKnownZeroCopyMap(inputs, outputs), GE_CHK_STATUS_RET(CreateKnownZeroCopyMap(inputs, outputs),
"DavinciModel::UpdateKnownNodeArgs create map for input/output zero copy."); "DavinciModel::UpdateKnownNodeArgs create map for input/output zero copy.");
if (!base_addr_not_changed_) { total_io_addrs_.clear();
total_io_addrs_.clear(); for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) {
orig_total_io_addrs_.clear(); auto &task = task_list_[task_index];
for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { if (task != nullptr) {
auto &task = task_list_[task_index]; Status ret = task->UpdateArgs();
if (task != nullptr) { if (ret != SUCCESS) {
Status ret = task->UpdateArgs(); GELOGE(FAILED, "task %zu created by davinci model is nullptr.", task_index);
if (ret != SUCCESS) { return FAILED;
GELOGE(FAILED, "task %zu created by davinci model is nullptr.", task_index);
return FAILED;
}
} }
} }
// cache latest iterator io addr
orig_total_io_addrs_ = total_io_addrs_;
} else {
total_io_addrs_ = orig_total_io_addrs_;
} }
GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_, false), "DavinciModel::UpdateKnownZeroCopyAddr failed."); GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_, false), "DavinciModel::UpdateKnownZeroCopyAddr failed.");
@ -2949,16 +2942,14 @@ Status DavinciModel::MallocKnownArgs() {
return ret; return ret;
} }
} }
rtError_t rt_ret;
// malloc args memory // malloc args memory
if (total_args_size_ == 0) { if (total_args_size_ != 0) {
GELOGW("DavinciModel::MallocKnownArgs total_args_size_ equals to zero."); rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM);
return SUCCESS; if (rt_ret != RT_ERROR_NONE) {
} GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
rtError_t rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM); }
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
} }
// malloc dynamic and static hybrid memory // malloc dynamic and static hybrid memory
if (total_hybrid_args_size_ != 0) { if (total_hybrid_args_size_ != 0) {

@ -534,7 +534,6 @@ class DavinciModel {
Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs); Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs); Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs, bool update_args = true); Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs, bool update_args = true);
void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }
Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const; Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const;
Status GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims, Status GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims,
@ -1007,8 +1006,6 @@ class DavinciModel {
map<const void *, void *> known_input_data_info_; map<const void *, void *> known_input_data_info_;
map<const void *, void *> known_output_data_info_; map<const void *, void *> known_output_data_info_;
vector<void *> total_io_addrs_; vector<void *> total_io_addrs_;
vector<void *> orig_total_io_addrs_;
bool base_addr_not_changed_ = false;
vector<vector<int64_t>> batch_info_; vector<vector<int64_t>> batch_info_;
vector<vector<int64_t>> combined_batch_info_; vector<vector<int64_t>> combined_batch_info_;

@ -124,7 +124,8 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
return FAILED; return FAILED;
} }
ret = InitTVMTask(args_offset_tmp[0], kernel_def); io_addr_offset_ = args_offset_tmp[0];
ret = InitTVMTask(io_addr_offset_, kernel_def);
} else if (kernel_type_ == ccKernelType::CUSTOMIZED) { } else if (kernel_type_ == ccKernelType::CUSTOMIZED) {
ret = InitAICPUCustomTask(context.op_index(), kernel_def); ret = InitAICPUCustomTask(context.op_index(), kernel_def);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
@ -380,7 +381,8 @@ Status KernelTaskInfo::Distribute() {
GELOGD("KernelTaskInfo Distribute Start."); GELOGD("KernelTaskInfo Distribute Start.");
if (davinci_model_->IsKnownNode()) { if (davinci_model_->IsKnownNode()) {
if (kernel_type_ == ccKernelType::TE) { if (kernel_type_ == ccKernelType::TE) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); args_ = l2_buffer_on_ ? davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_)
: davinci_model_->GetCurrentArgsAddr(args_offset_);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_); args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_);
} }
@ -449,29 +451,41 @@ void KernelTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) {
} }
} }
// Rewrites the IO address table inside the host args buffer and copies the whole buffer to the device.
//
// A copy of io_addrs_ is handed to DavinciModel::UpdateKnownZeroCopyAddr, written into args_addr starting
// `offset` bytes from its beginning, and then args_size_ bytes of args_addr are copied to args_ with
// RT_MEMCPY_HOST_TO_DEVICE.
//
// offset: byte position of the IO address table within the args buffer.
// Returns SUCCESS when both copies succeed. Fails if davinci_model_ or args_addr is null, if
// UpdateKnownZeroCopyAddr reports an error, if the table does not fit in the args_size_ bytes that follow
// `offset`, or if memcpy_s / rtMemcpy fails (the latter mapped through RT_ERROR_TO_GE_STATUS).
Status KernelTaskInfo::CopyNoncontinuousArgs(uint16_t offset) {
  GE_CHECK_NOTNULL(davinci_model_);
  // The host buffer is allocated with new (std::nothrow), so a null pointer has to be ruled out before use.
  if (args_addr == nullptr) {
    GELOGE(FAILED, "args_addr is nullptr.");
    return FAILED;
  }

  // UpdateKnownZeroCopyAddr takes the vector by non-const reference; work on a copy so io_addrs_ is unchanged.
  vector<void *> io_addrs = io_addrs_;
  GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs),
                    "DavinciModel::UpdateKnownZeroCopyAddr failed.");

  // The address table must lie entirely inside the args buffer; same guard InitTVMTask applies to this layout.
  const auto addr_size = kAddrLen * io_addrs.size();
  if ((args_size_ <= offset) || (args_size_ - offset < addr_size)) {
    GELOGE(FAILED, "offset >= kernelInfo.argsSize or copy content beyond applied memory.");
    return FAILED;
  }

  // copy io addr; the destination capacity is what really remains in the buffer after `offset`
  errno_t sec_ret = memcpy_s(args_addr.get() + offset, args_size_ - offset, io_addrs.data(), addr_size);
  if (sec_ret != EOK) {
    GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
    return FAILED;
  }

  // copy args to device
  rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  // Cast explicitly so the value handed to the %d conversion is an int.
  GELOGD("Copy noncontinuous args success, kernel type %d.", static_cast<int>(kernel_type_));
  return SUCCESS;
}
Status KernelTaskInfo::UpdateArgs() { Status KernelTaskInfo::UpdateArgs() {
GELOGI("KernelTaskInfo::UpdateArgs in."); GELOGI("KernelTaskInfo::UpdateArgs in.");
GE_CHECK_NOTNULL(davinci_model_);
if (kernel_type_ == ccKernelType::TE) { if (kernel_type_ == ccKernelType::TE) {
if (l2_buffer_on_) {
return CopyNoncontinuousArgs(io_addr_offset_);
}
davinci_model_->SetTotalIOAddrs(io_addrs_); davinci_model_->SetTotalIOAddrs(io_addrs_);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
vector<void *> io_addrs = io_addrs_; return CopyNoncontinuousArgs(sizeof(aicpu::AicpuParamHead));
davinci_model_->UpdateKnownZeroCopyAddr(io_addrs);
uintptr_t io_addr = reinterpret_cast<uintptr_t>(args_addr.get()) + sizeof(aicpu::AicpuParamHead);
auto addrs_size = sizeof(uint64_t) * io_addrs.size();
errno_t sec_ret = memcpy_s(reinterpret_cast<void *>(io_addr), addrs_size, io_addrs.data(), addrs_size);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
// copy args to device
rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
} }
GELOGI("KernelTaskInfo::UpdateArgs success.");
return SUCCESS; return SUCCESS;
} }
@ -516,8 +530,8 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
return SUCCESS; return SUCCESS;
} }
char *sm_contrl = const_cast<char *>(sm_desc.data()); char *sm_control = const_cast<char *>(sm_desc.data());
rtL2Ctrl_t *l2_ctrl_info = reinterpret_cast<rtL2Ctrl_t *>(sm_contrl); rtL2Ctrl_t *l2_ctrl_info = reinterpret_cast<rtL2Ctrl_t *>(sm_control);
uint64_t gen_base_addr = davinci_model_->GetRtBaseAddr(); uint64_t gen_base_addr = davinci_model_->GetRtBaseAddr();
// There is no weight for te op now. Update L2_mirror_addr by data memory base. // There is no weight for te op now. Update L2_mirror_addr by data memory base.
@ -545,19 +559,31 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
return SUCCESS; return SUCCESS;
} }
// Stores the model's current total args size in args_offset_, then passes args_size to
// DavinciModel::SetTotalArgsSize.
// NOTE(review): presumably SetTotalArgsSize adds args_size to a running total, making args_offset_ this
// task's start position in the shared args area — confirm against DavinciModel.
// davinci_model is dereferenced without a null check; the caller must pass a valid pointer.
void KernelTaskInfo::SetContinuousArgs(uint32_t args_size, DavinciModel *davinci_model) {
  const auto total_before = davinci_model->GetTotalArgsSize();
  args_offset_ = total_before;
  davinci_model->SetTotalArgsSize(args_size);
}
// Stores the model's current hybrid args size in hybrid_args_offset_, then passes args_size to
// DavinciModel::SetHybridArgsSize.
// NOTE(review): presumably SetHybridArgsSize adds args_size to a running total, making hybrid_args_offset_
// this task's start position in the hybrid args area — confirm against DavinciModel.
// davinci_model is dereferenced without a null check; the caller must pass a valid pointer.
void KernelTaskInfo::SetNoncontinuousArgs(uint32_t args_size, DavinciModel *davinci_model) {
  const auto hybrid_total_before = davinci_model->GetHybridArgsSize();
  hybrid_args_offset_ = hybrid_total_before;
  davinci_model->SetHybridArgsSize(args_size);
}
Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
GE_CHECK_NOTNULL(davinci_model);
const domi::KernelDef &kernel_def = task_def.kernel(); const domi::KernelDef &kernel_def = task_def.kernel();
const domi::KernelContext &context = kernel_def.context(); const domi::KernelContext &context = kernel_def.context();
kernel_type_ = static_cast<ccKernelType>(context.kernel_type()); kernel_type_ = static_cast<ccKernelType>(context.kernel_type());
uint32_t args_size = kernel_def.args_size();
if (kernel_type_ == ccKernelType::TE) { if (kernel_type_ == ccKernelType::TE) {
uint32_t args_size = kernel_def.args_size(); if (kernel_def.sm_desc().empty()) {
args_offset_ = davinci_model->GetTotalArgsSize(); SetContinuousArgs(args_size, davinci_model);
davinci_model->SetTotalArgsSize(args_size); return SUCCESS;
GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_); }
l2_buffer_on_ = true;
SetNoncontinuousArgs(args_size, davinci_model);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
hybrid_args_offset_ = davinci_model->GetHybridArgsSize(); SetNoncontinuousArgs(args_size, davinci_model);
davinci_model->SetHybridArgsSize(kernel_def.args_size());
GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_);
} }
return SUCCESS; return SUCCESS;
} }
@ -568,8 +594,23 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
// get tvm op desc // get tvm op desc
OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex); OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex);
GE_CHECK_NOTNULL(op_desc); GE_CHECK_NOTNULL(op_desc);
args_addr = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[args_size_]);
errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
Status ge_ret = UpdateL2Data(kernel_def);
// update origin l2 data
if (ge_ret != SUCCESS) {
return ge_ret;
}
if (davinci_model_->IsKnownNode()) { if (davinci_model_->IsKnownNode()) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_); args_ = l2_buffer_on_ ? davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_)
: davinci_model_->GetCurrentArgsAddr(args_offset_);
InitDumpTask(offset); InitDumpTask(offset);
return SUCCESS; return SUCCESS;
} }
@ -609,12 +650,6 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);
} }
vector<uint8_t> args_info(args_size_);
errno_t sec_ret = memcpy_s(args_info.data(), args_size_, kernel_def.args().data(), args_size_);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
if ((args_size_ <= offset) || (args_size_ - offset < kAddrLen * tensor_device_addrs.size())) { if ((args_size_ <= offset) || (args_size_ - offset < kAddrLen * tensor_device_addrs.size())) {
GELOGE(FAILED, "offset >= kernelInfo.argsSize or copy content beyond applied memory."); GELOGE(FAILED, "offset >= kernelInfo.argsSize or copy content beyond applied memory.");
@ -628,7 +663,7 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret);
} }
sec_ret = memcpy_s(args_info.data() + offset, args_size_ - offset, tensor_device_addrs.data(), sec_ret = memcpy_s(args_addr.get() + offset, args_size_ - offset, tensor_device_addrs.data(),
kAddrLen * tensor_device_addrs.size()); kAddrLen * tensor_device_addrs.size());
if (sec_ret != EOK) { if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret); GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
@ -640,19 +675,13 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GE_CHK_BOOL_TRUE_EXEC_INFO(davinci_model_->GetOpDugReg(), dump_args_ = static_cast<char *>(args_) + offset, GE_CHK_BOOL_TRUE_EXEC_INFO(davinci_model_->GetOpDugReg(), dump_args_ = static_cast<char *>(args_) + offset,
"Op debug is open in TVM task info"); "Op debug is open in TVM task info");
Status ge_ret = UpdateL2Data(kernel_def);
// update origin l2 data
if (ge_ret != SUCCESS) {
return ge_ret;
}
vector<void *> virtual_io_addrs; // use virtual address for zero copy key. vector<void *> virtual_io_addrs; // use virtual address for zero copy key.
virtual_io_addrs.insert(virtual_io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); virtual_io_addrs.insert(virtual_io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
virtual_io_addrs.insert(virtual_io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); virtual_io_addrs.insert(virtual_io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
if (op_desc->GetType() == ATOMICADDRCLEAN) { if (op_desc->GetType() == ATOMICADDRCLEAN) {
virtual_io_addrs.insert(virtual_io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); virtual_io_addrs.insert(virtual_io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
} }
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_info.data(), args_, args_size_, offset); davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_addr.get(), args_, args_size_, offset);
GELOGD("Do InitTVMTask end"); GELOGD("Do InitTVMTask end");
return SUCCESS; return SUCCESS;

@ -129,6 +129,9 @@ class KernelTaskInfo : public TaskInfo {
bool IsL1FusionOp(const OpDescPtr &op_desc); bool IsL1FusionOp(const OpDescPtr &op_desc);
void SetIoAddrs(const OpDescPtr &op_desc); void SetIoAddrs(const OpDescPtr &op_desc);
void InitDumpTask(uint32_t offset); void InitDumpTask(uint32_t offset);
void SetContinuousArgs(uint32_t args_size, DavinciModel *davinci_model);
void SetNoncontinuousArgs(uint32_t args_size, DavinciModel *davinci_model);
Status CopyNoncontinuousArgs(uint16_t offset);
// For super kernel // For super kernel
Status SaveSKTDumpInfo(); Status SaveSKTDumpInfo();
@ -163,6 +166,8 @@ class KernelTaskInfo : public TaskInfo {
uint32_t hybrid_args_offset_ = 0; uint32_t hybrid_args_offset_ = 0;
int64_t fixed_addr_offset_ = 0; int64_t fixed_addr_offset_ = 0;
std::unique_ptr<uint8_t[]> args_addr = nullptr; std::unique_ptr<uint8_t[]> args_addr = nullptr;
uint16_t io_addr_offset_ = 0;
bool l2_buffer_on_ = false;
bool call_save_dump_ = false; bool call_save_dump_ = false;
// aicpu ext_info device mem // aicpu ext_info device mem

@ -105,11 +105,6 @@ Status KnownNodeTask::Init(TaskContext &context) {
"known node task allocate workspace failed."); "known node task allocate workspace failed.");
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
"[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize()); "[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize());
bool addr_not_changed = false;
if (davinci_model_->GetRuntimeParam().mem_base == buffer) {
addr_not_changed = true;
}
davinci_model_->SetKnownNodeAddrNotChanged(addr_not_changed);
// update mem base // update mem base
davinci_model_->UpdateMemBase(static_cast<uint8_t *>(buffer)); davinci_model_->UpdateMemBase(static_cast<uint8_t *>(buffer));
GELOGI("KnownNodeTask::Init mem base is %p, size %lu.", GELOGI("KnownNodeTask::Init mem base is %p, size %lu.",

Loading…
Cancel
Save