!1402 l2 buffer for f1.3.0

From: @youui
Reviewed-by: @ji_chen,@xchu42
Signed-off-by: @liyihan123,@ji_chen
pull/1402/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 1735e1b1f3

@ -2875,23 +2875,16 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector<void *> &inputs, const vec
GELOGI("DavinciModel::UpdateKnownNodeArgs in");
GE_CHK_STATUS_RET(CreateKnownZeroCopyMap(inputs, outputs),
"DavinciModel::UpdateKnownNodeArgs create map for input/output zero copy.");
if (!base_addr_not_changed_) {
total_io_addrs_.clear();
orig_total_io_addrs_.clear();
for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) {
auto &task = task_list_[task_index];
if (task != nullptr) {
Status ret = task->UpdateArgs();
if (ret != SUCCESS) {
GELOGE(FAILED, "task %zu created by davinci model is nullptr.", task_index);
return FAILED;
}
total_io_addrs_.clear();
for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) {
auto &task = task_list_[task_index];
if (task != nullptr) {
Status ret = task->UpdateArgs();
if (ret != SUCCESS) {
GELOGE(FAILED, "task %zu created by davinci model is nullptr.", task_index);
return FAILED;
}
}
// cache latest iterator io addr
orig_total_io_addrs_ = total_io_addrs_;
} else {
total_io_addrs_ = orig_total_io_addrs_;
}
GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_, false), "DavinciModel::UpdateKnownZeroCopyAddr failed.");
@ -2949,16 +2942,14 @@ Status DavinciModel::MallocKnownArgs() {
return ret;
}
}
rtError_t rt_ret;
// malloc args memory
if (total_args_size_ == 0) {
GELOGW("DavinciModel::MallocKnownArgs total_args_size_ equals to zero.");
return SUCCESS;
}
rtError_t rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
if (total_args_size_ != 0) {
rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
}
// malloc dynamic and static hybrid memory
if (total_hybrid_args_size_ != 0) {

@ -534,7 +534,6 @@ class DavinciModel {
Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs, bool update_args = true);
void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }
Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const;
Status GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims,
@ -1007,8 +1006,6 @@ class DavinciModel {
map<const void *, void *> known_input_data_info_;
map<const void *, void *> known_output_data_info_;
vector<void *> total_io_addrs_;
vector<void *> orig_total_io_addrs_;
bool base_addr_not_changed_ = false;
vector<vector<int64_t>> batch_info_;
vector<vector<int64_t>> combined_batch_info_;

@ -124,7 +124,8 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
return FAILED;
}
ret = InitTVMTask(args_offset_tmp[0], kernel_def);
io_addr_offset_ = args_offset_tmp[0];
ret = InitTVMTask(io_addr_offset_, kernel_def);
} else if (kernel_type_ == ccKernelType::CUSTOMIZED) {
ret = InitAICPUCustomTask(context.op_index(), kernel_def);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
@ -380,7 +381,8 @@ Status KernelTaskInfo::Distribute() {
GELOGD("KernelTaskInfo Distribute Start.");
if (davinci_model_->IsKnownNode()) {
if (kernel_type_ == ccKernelType::TE) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
args_ = l2_buffer_on_ ? davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_)
: davinci_model_->GetCurrentArgsAddr(args_offset_);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_);
}
@ -449,29 +451,41 @@ void KernelTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) {
}
}
Status KernelTaskInfo::CopyNoncontinuousArgs(uint16_t offset) {
GE_CHECK_NOTNULL(davinci_model_);
// copy new io addrs
vector<void *> io_addrs = io_addrs_;
davinci_model_->UpdateKnownZeroCopyAddr(io_addrs);
auto addr_size = kAddrLen * io_addrs.size();
// copy io addr
errno_t sec_ret = memcpy_s(args_addr.get() + offset, addr_size, io_addrs.data(), addr_size);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
// copy args to device
rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
GELOGD("Copy noncontinuous args success, kernel type %d.", kernel_type_);
return SUCCESS;
}
Status KernelTaskInfo::UpdateArgs() {
GELOGI("KernelTaskInfo::UpdateArgs in.");
GE_CHECK_NOTNULL(davinci_model_);
if (kernel_type_ == ccKernelType::TE) {
if (l2_buffer_on_) {
return CopyNoncontinuousArgs(io_addr_offset_);
}
davinci_model_->SetTotalIOAddrs(io_addrs_);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
vector<void *> io_addrs = io_addrs_;
davinci_model_->UpdateKnownZeroCopyAddr(io_addrs);
uintptr_t io_addr = reinterpret_cast<uintptr_t>(args_addr.get()) + sizeof(aicpu::AicpuParamHead);
auto addrs_size = sizeof(uint64_t) * io_addrs.size();
errno_t sec_ret = memcpy_s(reinterpret_cast<void *>(io_addr), addrs_size, io_addrs.data(), addrs_size);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
// copy args to device
rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
return CopyNoncontinuousArgs(sizeof(aicpu::AicpuParamHead));
}
GELOGI("KernelTaskInfo::UpdateArgs success.");
return SUCCESS;
}
@ -516,8 +530,8 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
return SUCCESS;
}
char *sm_contrl = const_cast<char *>(sm_desc.data());
rtL2Ctrl_t *l2_ctrl_info = reinterpret_cast<rtL2Ctrl_t *>(sm_contrl);
char *sm_control = const_cast<char *>(sm_desc.data());
rtL2Ctrl_t *l2_ctrl_info = reinterpret_cast<rtL2Ctrl_t *>(sm_control);
uint64_t gen_base_addr = davinci_model_->GetRtBaseAddr();
// There is no weight for te op now. Update L2_mirror_addr by data memory base.
@ -545,19 +559,31 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
return SUCCESS;
}
void KernelTaskInfo::SetContinuousArgs(uint32_t args_size, DavinciModel *davinci_model) {
args_offset_ = davinci_model->GetTotalArgsSize();
davinci_model->SetTotalArgsSize(args_size);
}
void KernelTaskInfo::SetNoncontinuousArgs(uint32_t args_size, DavinciModel *davinci_model) {
hybrid_args_offset_ = davinci_model->GetHybridArgsSize();
davinci_model->SetHybridArgsSize(args_size);
}
Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
GE_CHECK_NOTNULL(davinci_model);
const domi::KernelDef &kernel_def = task_def.kernel();
const domi::KernelContext &context = kernel_def.context();
kernel_type_ = static_cast<ccKernelType>(context.kernel_type());
uint32_t args_size = kernel_def.args_size();
if (kernel_type_ == ccKernelType::TE) {
uint32_t args_size = kernel_def.args_size();
args_offset_ = davinci_model->GetTotalArgsSize();
davinci_model->SetTotalArgsSize(args_size);
GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_);
if (kernel_def.sm_desc().empty()) {
SetContinuousArgs(args_size, davinci_model);
return SUCCESS;
}
l2_buffer_on_ = true;
SetNoncontinuousArgs(args_size, davinci_model);
} else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
hybrid_args_offset_ = davinci_model->GetHybridArgsSize();
davinci_model->SetHybridArgsSize(kernel_def.args_size());
GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_);
SetNoncontinuousArgs(args_size, davinci_model);
}
return SUCCESS;
}
@ -568,8 +594,23 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
// get tvm op desc
OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex);
GE_CHECK_NOTNULL(op_desc);
args_addr = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[args_size_]);
errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
Status ge_ret = UpdateL2Data(kernel_def);
// update origin l2 data
if (ge_ret != SUCCESS) {
return ge_ret;
}
if (davinci_model_->IsKnownNode()) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
args_ = l2_buffer_on_ ? davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_)
: davinci_model_->GetCurrentArgsAddr(args_offset_);
InitDumpTask(offset);
return SUCCESS;
}
@ -609,12 +650,6 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
vector<uint8_t> args_info(args_size_);
errno_t sec_ret = memcpy_s(args_info.data(), args_size_, kernel_def.args().data(), args_size_);
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
return FAILED;
}
if ((args_size_ <= offset) || (args_size_ - offset < kAddrLen * tensor_device_addrs.size())) {
GELOGE(FAILED, "offset >= kernelInfo.argsSize or copy content beyond applied memory.");
@ -628,7 +663,7 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return RT_ERROR_TO_GE_STATUS(rt_ret);
}
sec_ret = memcpy_s(args_info.data() + offset, args_size_ - offset, tensor_device_addrs.data(),
sec_ret = memcpy_s(args_addr.get() + offset, args_size_ - offset, tensor_device_addrs.data(),
kAddrLen * tensor_device_addrs.size());
if (sec_ret != EOK) {
GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
@ -640,19 +675,13 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GE_CHK_BOOL_TRUE_EXEC_INFO(davinci_model_->GetOpDugReg(), dump_args_ = static_cast<char *>(args_) + offset,
"Op debug is open in TVM task info");
Status ge_ret = UpdateL2Data(kernel_def);
// update origin l2 data
if (ge_ret != SUCCESS) {
return ge_ret;
}
vector<void *> virtual_io_addrs; // use virtual address for zero copy key.
virtual_io_addrs.insert(virtual_io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
virtual_io_addrs.insert(virtual_io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
if (op_desc->GetType() == ATOMICADDRCLEAN) {
virtual_io_addrs.insert(virtual_io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
}
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_info.data(), args_, args_size_, offset);
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, args_addr.get(), args_, args_size_, offset);
GELOGD("Do InitTVMTask end");
return SUCCESS;

@ -129,6 +129,9 @@ class KernelTaskInfo : public TaskInfo {
bool IsL1FusionOp(const OpDescPtr &op_desc);
void SetIoAddrs(const OpDescPtr &op_desc);
void InitDumpTask(uint32_t offset);
void SetContinuousArgs(uint32_t args_size, DavinciModel *davinci_model);
void SetNoncontinuousArgs(uint32_t args_size, DavinciModel *davinci_model);
Status CopyNoncontinuousArgs(uint16_t offset);
// For super kernel
Status SaveSKTDumpInfo();
@ -163,6 +166,8 @@ class KernelTaskInfo : public TaskInfo {
uint32_t hybrid_args_offset_ = 0;
int64_t fixed_addr_offset_ = 0;
std::unique_ptr<uint8_t[]> args_addr = nullptr;
uint16_t io_addr_offset_ = 0;
bool l2_buffer_on_ = false;
bool call_save_dump_ = false;
// aicpu ext_info device mem

@ -105,11 +105,6 @@ Status KnownNodeTask::Init(TaskContext &context) {
"known node task allocate workspace failed.");
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
"[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize());
bool addr_not_changed = false;
if (davinci_model_->GetRuntimeParam().mem_base == buffer) {
addr_not_changed = true;
}
davinci_model_->SetKnownNodeAddrNotChanged(addr_not_changed);
// update mem base
davinci_model_->UpdateMemBase(static_cast<uint8_t *>(buffer));
GELOGI("KnownNodeTask::Init mem base is %p, size %lu.",

Loading…
Cancel
Save