!606 ACL single op refactor

From: @xchu42
Reviewed-by: @wqtshg, @ji_chen
Signed-off-by: @ji_chen
pull/606/MERGE
Committed by mindspore-ci-bot via Gitee, 4 years ago
commit ee621b8e67

@@ -25,6 +25,7 @@
#include "graph/load/new_model_manager/model_utils.h"
#include "runtime/mem.h"
#include "single_op/single_op_manager.h"
#include "single_op/task/build_task_utils.h"
#include "graph/load/new_model_manager/model_manager.h"
namespace ge {
@@ -77,7 +78,8 @@ Status ProfilingTaskInfo(OpTask *op_task) {
}
} // namespace
SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) {
SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream)
: stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) {
}
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() {
@@ -159,37 +161,6 @@ Status SingleOp::UpdateArgs(const std::vector<DataBuffer> &inputs, const std::ve
*arg_addr = args_[i];
}
}
// update aicpu_TF or aicpu_CC args
for (auto &task : tasks_) {
size_t io_addr_num = args_.size();
if (task->GetOpTaskType() == OP_TASK_AICPU) {
GELOGD("Update aicpu_TF task args");
task->SetIoAddrsForDump(args_);
auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetIOAddr()));
GE_CHECK_NOTNULL(dst_io_addr);
auto rt_ret = rtMemcpyAsync(dst_io_addr,
sizeof(uint64_t) * args_.size(),
&args_[0],
sizeof(uint64_t) * args_.size(),
RT_MEMCPY_HOST_TO_DEVICE_EX,
stream_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret);
return rt_ret;
}
} else if (task->GetOpTaskType() == OP_TASK_AICPUCC) {
GELOGD("Update aicpu_CC task args");
const uintptr_t *task_io_addr = reinterpret_cast<const uintptr_t *>(task->GetIOAddr());
GE_CHECK_NOTNULL(task_io_addr);
auto io_addr = reinterpret_cast<uint64_t *>(const_cast<uintptr_t *>(task_io_addr));
for (size_t i = 0; i < io_addr_num; ++i) {
io_addr[i] = static_cast<uintptr_t>(args_[i]);
}
} else {
GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType());
continue;
}
}
return SUCCESS;
}
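With the per-task-type branches above deleted, the arg refresh presumably collapses onto the new virtual GetIoAddr interface (see the op_task.h diff below). A minimal sketch of the unified loop, assuming args_ holds the user buffers' device addresses as uintptr_t:

    for (auto &task : tasks_) {
      uintptr_t *arg_base = nullptr;
      size_t arg_num = 0;
      task->GetIoAddr(arg_base, arg_num);              // uniform across TBE / aicpu_TF / aicpu_CC
      size_t count = std::min(arg_num, args_.size());  // needs <algorithm>
      for (size_t i = 0; i < count; ++i) {
        arg_base[i] = args_[i];                        // patch each device arg slot in place
      }
    }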
@@ -200,7 +171,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
return ret;
}
GE_CHECK_NOTNULL(stream_resource_);
std::lock_guard<std::mutex> lk(*stream_mutex_);
auto current_mem_base = stream_resource_->GetMemoryBase();
if (running_param_->mem_base != current_mem_base) {
running_param_->mem_base = const_cast<uint8_t *>(current_mem_base);
GELOGD("Memory base changed, new memory base = %p", current_mem_base);
for (auto &task : tasks_) {
auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_);
GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_),
"[%s] Failed to update arg table",
task->GetOpdesc()->GetName().c_str());
}
}
ret = UpdateArgs(inputs, outputs);
if (ret != SUCCESS) {
return ret;
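UpdateArgTable is the new hook that rebases a task's baked-in addresses when the stream's feature-map base moves. Its body is not visible on this page (the large suppressed diff below is likely op_task.cc), so the following is only a plausible sketch composed from BuildTaskUtils and GetIoAddr:

    Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
      // Recompute absolute device addresses against the (possibly new) mem_base.
      auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
      auto io_addrs = BuildTaskUtils::JoinAddresses(addresses);
      uintptr_t *arg_base = nullptr;
      size_t arg_num = 0;
      GetIoAddr(arg_base, arg_num);
      if (arg_num < io_addrs.size()) {
        GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "arg table too small: %zu < %zu",
               arg_num, io_addrs.size());
        return ACL_ERROR_GE_INTERNAL_ERROR;
      }
      for (size_t i = 0; i < io_addrs.size(); ++i) {
        arg_base[i] = reinterpret_cast<uintptr_t>(io_addrs[i]);
      }
      return SUCCESS;
    }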
@@ -225,9 +208,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex
: resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {
}
DynamicSingleOp::~DynamicSingleOp() {
}
Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &inputs,
std::vector<GeTensorDesc> &output_desc,
@@ -249,65 +229,24 @@ Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
}
if (input_desc.size() != num_inputs_) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu",
num_inputs_, input_desc.size());
GELOGE(ACL_ERROR_GE_PARAM_INVALID,
"Input number mismatches. expect %zu, but given %zu",
num_inputs_,
input_desc.size());
return ACL_ERROR_GE_PARAM_INVALID;
}
if (output_desc.size() != num_outputs_) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu",
num_outputs_, output_desc.size());
GELOGE(ACL_ERROR_GE_PARAM_INVALID,
"Output number mismatches. expect %zu, but given %zu",
num_outputs_,
output_desc.size());
return ACL_ERROR_GE_PARAM_INVALID;
}
return SUCCESS;
}
Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
std::vector<void *> &workspaces) {
static const std::string kPurpose("malloc workspace memory for dynamic op.");
if (workspace_sizes.empty()) {
GELOGD("No need to allocate workspace.");
return SUCCESS;
}
int64_t total_size = 0;
std::vector<int64_t> ws_offsets;
for (auto ws_size : workspace_sizes) {
// alignment and padding should be done in OpParaCalculate
GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
ws_offsets.emplace_back(total_size);
total_size += ws_size;
}
GELOGD("Total workspace size is %ld", total_size);
StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_);
GE_CHECK_NOTNULL(stream_resource);
auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast<size_t>(total_size));
if (ws_base == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
GELOGD("Done allocating workspace memory successfully.");
for (auto ws_offset : ws_offsets) {
workspaces.emplace_back(ws_base + ws_offset);
}
return SUCCESS;
}
Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc,
vector<void *> &outputs) {
GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));
std::vector<void *> workspace_buffers;
GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));
return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);
}
Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
const vector<DataBuffer> &input_buffers,
vector<GeTensorDesc> &output_desc,
@@ -316,32 +255,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
std::lock_guard<std::mutex> lk(*stream_mutex_);
std::vector<void *> inputs;
std::vector<void *> outputs;
for (auto &buffer : input_buffers) {
inputs.emplace_back(buffer.data);
}
for (auto &buffer : output_buffers) {
outputs.emplace_back(buffer.data);
}
if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
auto ret = ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
if (ret == SUCCESS) {
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
}
return ret;
} else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
auto aicpu_ret = op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_);
if (aicpu_ret == SUCCESS) {
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
}
return aicpu_ret;
} else {
GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID,
"Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
op_task_->GetOpTaskType());
return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;
}
GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_));
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
return SUCCESS;
}
} // namespace ge

@@ -30,9 +30,11 @@
#include "cce/aicpu_engine_struct.h"
namespace ge {
class StreamResource;
struct SingleOpModelParam;
class SingleOp {
public:
SingleOp(std::mutex *stream_mutex, rtStream_t stream);
SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream);
~SingleOp();
Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
@@ -44,6 +46,7 @@ class SingleOp {
Status GetArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
friend class SingleOpModel;
StreamResource *stream_resource_;
std::mutex *stream_mutex_;
rtStream_t stream_ = nullptr;
std::vector<void *> input_addr_list_;
@@ -54,12 +57,13 @@ class SingleOp {
std::vector<OpTask *> tasks_;
std::vector<std::vector<uintptr_t *>> arg_table_;
std::unique_ptr<SingleOpModelParam> running_param_;
};
class DynamicSingleOp {
public:
DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream);
~DynamicSingleOp();
~DynamicSingleOp() = default;
Status ExecuteAsync(const vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &inputs,
std::vector<GeTensorDesc> &output_desc,
@@ -72,14 +76,6 @@ class DynamicSingleOp {
std::vector<GeTensorDesc> &output_desc,
std::vector<DataBuffer> &outputs) const;
Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
std::vector<void *> &workspaces);
Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc,
vector<void *> &outputs);
std::unique_ptr<OpTask> op_task_;
uintptr_t resource_id_ = 0;
std::mutex *stream_mutex_;

@@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) {
if (model_params_.memory_size > model_params_.zero_copy_mem_size) {
const string purpose("malloc feature map memory on model execute.");
GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size);
model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size);
model_params_.mem_base =
res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false);
if (model_params_.mem_base == nullptr) {
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
@@ -226,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) {
return SUCCESS;
}
Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) {
auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);
single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
auto tasks = ge_model->GetModelTaskDefPtr()->task();
for (int i = 0; i < tasks.size(); ++i) {
const TaskDef &task_def = tasks[i];
@@ -247,9 +249,11 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return ret;
}
single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
ParseArgTable(tbe_task, single_op);
tbe_task->SetModelArgs(model_name_, model_id_);
if (tbe_task->tiling_buffer_ != nullptr) {
tbe_task->stream_resource_ = stream_resource;
}
single_op.tasks_.emplace_back(tbe_task);
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
GELOGD("Building AICPU_CC task");
@@ -261,6 +265,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return ret;
}
task->SetModelArgs(model_name_, model_id_);
ParseArgTable(task, single_op);
single_op.tasks_.emplace_back(task);
} else {
GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
@@ -278,6 +283,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return ret;
}
aicpu_task->SetModelArgs(model_name_, model_id_);
ParseArgTable(aicpu_task, single_op);
single_op.tasks_.emplace_back(aicpu_task);
} else {
// skip
@@ -287,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return SUCCESS;
}
void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) {
void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) {
if (task == nullptr) {
GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr");
return;
}
// args: addr1, addr2, addr3 ...
auto *args = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetArgs()));
size_t arg_size = task->GetArgSize();
for (size_t i = 0; i < arg_size / sizeof(void *); ++i) {
uintptr_t *ptr_to_addr = args + i;
uintptr_t *arg_base = nullptr;
size_t arg_num = 0;
task->GetIoAddr(arg_base, arg_num);
for (size_t i = 0; i < arg_num; ++i) {
uintptr_t *ptr_to_addr = arg_base + i;
uintptr_t addr = *ptr_to_addr;
auto iter = model_params_.addr_mapping_.find(addr);
if (iter != model_params_.addr_mapping_.end()) {
int arg_index = iter->second;
GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index);
GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index);
op.arg_table_[iter->second].emplace_back(ptr_to_addr);
}
}
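For orientation: the slots recorded into arg_table_ here are exactly what SingleOp::UpdateArgs patches on every execution (first hunk of this page). In essence, entry i collects every argument slot that must receive user buffer i's device address:

    for (size_t i = 0; i < arg_table_.size(); ++i) {
      for (uintptr_t *arg_addr : arg_table_[i]) {
        *arg_addr = args_[i];  // args_[i]: device address of user input/output i
      }
    }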
@@ -373,7 +381,7 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa
}
auto builder = AiCpuCCTaskBuilder(iter->second->GetOpDesc(), kernel_def);
auto ret = builder.BuildTask(*aicpucc_task, kernel_id);
auto ret = builder.BuildTask(*aicpucc_task, kernel_id, model_params_);
if (ret != SUCCESS) {
GELOGE(ret, "build aicpu_CC op task failed");
return ret;
@@ -386,8 +394,10 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa
Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs());
GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_));
GE_CHECK_NOTNULL(single_op.running_param_);
GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op));
return BuildTaskList(single_op);
return BuildTaskList(&resource, single_op);
}
Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {

@@ -65,7 +65,7 @@ class SingleOpModel {
Status ParseInputNode(const OpDescPtr &op_desc);
void ParseOutputNode(const OpDescPtr &op_desc);
Status BuildTaskList(SingleOp &single_op);
Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
@@ -74,7 +74,7 @@ class SingleOpModel {
Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op);
static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
void ParseArgTable(TbeOpTask *task, SingleOp &op);
void ParseArgTable(OpTask *task, SingleOp &op);
std::string model_name_;
uint32_t model_id_ = 0;

@@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
size_t size,
size_t &max_allocated,
std::vector<uint8_t *> &allocated) {
if (size == 0) {
GELOGD("Mem size == 0");
return nullptr;
}
if (size <= max_allocated && !allocated.empty()) {
GELOGD("reuse last memory");
return allocated.back();
}
if (!allocated.empty()) {
uint8_t *current_buffer = allocated.back();
allocated.pop_back();
if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) {
GELOGW("Failed to invoke rtStreamSynchronize");
}
(void) rtFree(current_buffer);
}
uint8_t *buffer = nullptr;
auto ret = rtMalloc(reinterpret_cast<void **>(&buffer), size, RT_MEMORY_HBM);
if (ret != RT_ERROR_NONE) {
@@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
return buffer;
}
uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) {
uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) {
GELOGD("To Malloc memory, size = %zu", size);
uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
return buffer;
if (holding_lock) {
return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
} else {
std::lock_guard<std::mutex> lk(stream_mu_);
return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
}
}
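The holding_lock flag exists because most callers already run under stream_mu_ (e.g. SingleOp::ExecuteAsync holds it across UpdateArgs and launch), whereas SingleOpModel::InitModelMem, as changed above, does not; only the latter asks MallocMemory to take the lock itself:

    uint8_t *a = res.MallocMemory(purpose, size);         // default: caller already holds stream_mu_
    uint8_t *b = res.MallocMemory(purpose, size, false);  // InitModelMem: lock acquired inside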
uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) {
@@ -158,7 +176,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
return ret;
}
auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(&stream_mu_, stream_));
auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(this, &stream_mu_, stream_));
if (new_op == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed");
return ACL_ERROR_GE_MEMORY_ALLOCATION;
@@ -171,4 +189,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
op_map_[model_data.model_data] = std::move(new_op);
return SUCCESS;
}
const uint8_t *StreamResource::GetMemoryBase() const {
if (memory_list_.empty()) {
return nullptr;
}
return memory_list_.back();
}
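GetMemoryBase exposes the invariant DoMallocMemory maintains: when a larger block is needed, the old one is synchronized and freed, so memory_list_.back() is always the current (largest) allocation. SingleOp::ExecuteAsync (first file of this page) uses it to detect re-allocation between launches:

    const uint8_t *current = stream_resource_->GetMemoryBase();
    if (running_param_->mem_base != current) {
      // base was re-allocated since the last launch; rebase every task's arg table
    }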
} // namespace ge

@@ -45,8 +45,9 @@ class StreamResource {
Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op);
Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op);
uint8_t *MallocMemory(const std::string &purpose, size_t size);
uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true);
uint8_t *MallocWeight(const std::string &purpose, size_t size);
const uint8_t *GetMemoryBase() const;
private:
uint8_t *DoMallocMemory(const std::string &purpose,

@@ -17,17 +17,22 @@
#include "single_op/task/aicpu_kernel_task_builder.h"
#include "framework/common/taskdown_common.h"
#include "graph/load/new_model_manager/model_manager.h"
#include "build_task_utils.h"
namespace ge {
AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def)
: op_desc_(op_desc), kernel_def_(kernel_def) {}
Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param) {
size_t aicpu_arg_size = kernel_def_.args_size();
if (aicpu_arg_size <= 0) {
if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size);
return ACL_ERROR_GE_PARAM_INVALID;
}
task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize();
GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *));
std::unique_ptr<uint8_t[]> aicpu_args;
aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]());
if (aicpu_args == nullptr) {
@@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
return ACL_ERROR_GE_INTERNAL_ERROR;
}
task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead));
task.SetIoAddr(reinterpret_cast<uintptr_t *>(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)));
task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size);
auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param);
GE_CHECK_GE(addresses.size(), task.io_addr_num_);
for (size_t i = 0; i < task.io_addr_num_; ++i) {
task.io_addr_[i] = reinterpret_cast<uintptr_t>(addresses[i]);
}
return SUCCESS;
}
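The two added checks encode the assumed args layout: an aicpu::AicpuParamHead followed by one device pointer per input/output, then any trailing data. Written out as a single bound:

    size_t min_args_size = sizeof(aicpu::AicpuParamHead) + task.io_addr_num_ * sizeof(void *);
    if (aicpu_arg_size < min_args_size) {
      GELOGE(ACL_ERROR_GE_PARAM_INVALID, "args size too small: %zu < %zu",
             aicpu_arg_size, min_args_size);
      return ACL_ERROR_GE_PARAM_INVALID;
    }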
Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
auto ret = SetKernelArgs(task);
Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param) {
auto ret = SetKernelArgs(task, param);
if (ret != SUCCESS) {
return ret;
}
@@ -86,6 +97,10 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
return ret;
}
if (task.GetUnknownType() == DEPEND_COMPUTE) {
GELOGE(FAILED, "AiCpuCCTask unknown type is depend compute, it's not supported now.");
return FAILED;
}
auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(task.args_.get());
if (task.ext_info_addr_dev_ != nullptr) {
aicpu_param_head->extInfoLength = kernel_ext_info.size();

@@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder {
explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def);
~AiCpuCCTaskBuilder() = default;
Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id);
Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param);
private:
Status SetKernelArgs(AiCpuCCTask &task);
Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param);
const OpDescPtr op_desc_;
const domi::KernelDef &kernel_def_;
};

@@ -26,26 +26,6 @@ namespace ge {
AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def)
: op_desc_(op_desc), kernel_def_(kernel_def) {}
Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses) {
size_t arg_size = kernel_def_.args_size();
auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret);
return rt_ret;
}
const void *src_addr = reinterpret_cast<const void *>(addresses.data());
uint64_t src_len = sizeof(void *) * addresses.size();
rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
(void)rtFree(*io_addr);
GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret);
return rt_ret;
}
return SUCCESS;
}
Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) {
auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
kernel_def_.args().data(), kernel_def_.args().size());
@@ -80,39 +60,27 @@ namespace ge {
return SUCCESS;
}
Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
const SingleOpModelParam &param, bool dynamic_flag) {
Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag) {
if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size());
return ACL_ERROR_GE_PARAM_INVALID;
}
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace);
if (dynamic_flag) {
GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM));
} else {
if (ws_addr_vec.empty()) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty.");
return ACL_ERROR_GE_PARAM_INVALID;
}
*kernel_workspace = ws_addr_vec[0];
}
GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(),
GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(),
kernel_def_.task_info().data(), kernel_def_.task_info_size(),
RT_MEMCPY_HOST_TO_DEVICE));
auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses));
if (ret != SUCCESS) {
return ret;
}
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *);
GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM));
return SUCCESS;
}
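The address table now stays host-side (io_addr_host_) and only the device buffer is allocated here; the host-to-device copy presumably moves to launch time (in the suppressed diff below, likely op_task.cc), along the lines of:

    GE_CHK_RT_RET(rtMemcpyAsync(io_addr_, io_addr_size_,
                                io_addr_host_.data(), io_addr_size_,
                                RT_MEMCPY_HOST_TO_DEVICE_EX, stream));

Deferring the copy lets per-execution addresses be patched on the host first, instead of being rtMemcpy'd once at build time as the deleted SetInputOutputAddr did.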
Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param,
bool dynamic_flag, uint64_t kernel_id) {
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag));
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag));
STR_FWK_OP_KERNEL fwk_op_kernel = {0};
auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel);

@@ -33,10 +33,8 @@ namespace ge {
private:
static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses);
Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
const SingleOpModelParam &param, bool dynamic_flag);
Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag);
const OpDescPtr op_desc_;
const domi::KernelExDef &kernel_def_;

@@ -32,7 +32,8 @@ const uint64_t kVarSize = 0;
}
std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc,
const SingleOpModelParam &param) {
const SingleOpModelParam &param,
bool keep_workspace) {
std::vector<std::vector<void *>> ret;
RuntimeParam runtime_para;
runtime_para.mem_size = param.memory_size;
@@ -49,7 +50,9 @@ std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &o
ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc));
ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc));
ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
if (keep_workspace) {
ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
}
return ret;
}
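GetKernelArgs (declared in the header below and used by AiCpuCCTaskBuilder above) is presumably a thin wrapper that drops the workspace section via the new flag; a sketch under that assumption:

    std::vector<void *> BuildTaskUtils::GetKernelArgs(const OpDescPtr &op_desc,
                                                      const SingleOpModelParam &param) {
      auto addresses = GetAddresses(op_desc, param, false);  // inputs + outputs, no workspace
      return JoinAddresses(addresses);
    }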

@@ -27,15 +27,17 @@
namespace ge {
class BuildTaskUtils {
public:
static constexpr int kAddressIndexOutput = 1;
static constexpr int kAddressIndexWorkspace = 2;
static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam &param);
static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc,
const SingleOpModelParam &param,
bool keep_workspace = true);
static std::vector<void *> JoinAddresses(const std::vector<std::vector<void *>> &addresses);
static std::vector<void *> GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam &param);
static std::string GetTaskInfo(const OpDescPtr &op_desc);
template<typename T>
static std::string VectorToString(const std::vector<T> &values)
{
static std::string VectorToString(const std::vector<T> &values) {
std::stringstream ss;
ss << '[';
auto size = values.size();

File diff suppressed because it is too large.

@@ -32,49 +32,27 @@
#include "init/gelib.h"
namespace ge {
enum OpTaskType {
OP_TASK_TBE = 0,
OP_TASK_AICPU,
OP_TASK_AICPUCC,
OP_TASK_INVALID,
};
class StreamResource;
struct SingleOpModelParam;
class OpTask {
public:
OpTask() = default;
virtual ~OpTask() = default;
virtual Status LaunchKernel(rtStream_t stream) = 0;
virtual Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc) {
return UNSUPPORTED;
}
virtual Status LaunchKernel(const std::vector<void *> &inputs,
const std::vector<void *> &outputs,
const std::vector<void *> &workspaces,
rtStream_t stream) {
return UNSUPPORTED;
}
virtual OpTaskType GetOpTaskType() = 0;
virtual const void *GetIOAddr() const = 0;
const vector<int64_t> &GetWorkspaceSizes() const;
void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
const vector<GeTensorDesc> &output_desc);
virtual Status UpdateArgTable(const SingleOpModelParam &param);
void SetModelArgs(std::string model_name, uint32_t model_id);
Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim);
const OpDescPtr &GetOpdesc() const {return op_desc_;}
Status OpenDump(rtStream_t stream);
void SetIoAddrsForDump(const vector<uint64_t> &io_addrs_for_dump) {
io_addrs_for_dump_ = io_addrs_for_dump;
}
virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0;
virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &input_buffers,
std::vector<GeTensorDesc> &output_desc,
std::vector<DataBuffer> &output_buffers,
rtStream_t stream) {
return UNSUPPORTED;
}
rtStream_t stream);
private:
std::vector<int64_t> workspace_sizes_;
protected:
DumpProperties dump_properties_;
DumpOp dump_op_;
@@ -82,19 +60,18 @@ class OpTask {
std::string model_name_;
uint32_t model_id_ = 0;
uint32_t block_dim_ = 1;
std::vector<uint64_t> io_addrs_for_dump_;
};
class TbeOpTask : public OpTask {
public:
~TbeOpTask() override;
Status LaunchKernel(rtStream_t stream) override;
OpTaskType GetOpTaskType() override {
return OP_TASK_TBE;
}
const void *GetIOAddr() const override {
return nullptr;
}
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &input_buffers,
std::vector<GeTensorDesc> &output_desc,
std::vector<DataBuffer> &output_buffers,
rtStream_t stream) override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
void SetSmDesc(void *sm_desc);
void SetStubFunc(const std::string &name, const void *stub_func);
void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc);
@@ -102,20 +79,17 @@ class TbeOpTask : public OpTask {
Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc) override;
Status LaunchKernel(const vector<void *> &inputs,
const vector<void *> &outputs,
const vector<void *> &workspaces,
rtStream_t stream) override;
const void *GetArgs() const;
size_t GetArgSize() const;
const std::string &GetStubName() const;
void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size);
private:
friend class SingleOpModel;
static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor);
Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc);
Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes);
const void *stub_func_ = nullptr;
std::unique_ptr<uint8_t[]> args_;
@@ -123,9 +97,11 @@ class TbeOpTask : public OpTask {
void *sm_desc_ = nullptr;
std::string stub_name_;
StreamResource *stream_resource_ = nullptr;
void *tiling_buffer_ = nullptr;
uint32_t max_tiling_size_ = 0;
std::string tiling_data_;
std::vector<void *> workspaces_;
NodePtr node_;
};
@@ -133,9 +109,10 @@ class AiCpuBaseTask : public OpTask {
public:
AiCpuBaseTask() = default;
~AiCpuBaseTask() override;
const UnknowShapeOpType GetUnknownType() const { return unknown_type_; }
UnknowShapeOpType GetUnknownType() const { return unknown_type_; }
protected:
Status UpdateIoAddr(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id);
Status UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
@@ -158,10 +135,8 @@ class AiCpuTask : public AiCpuBaseTask {
~AiCpuTask() override;
Status LaunchKernel(rtStream_t stream) override;
OpTaskType GetOpTaskType() override {
return OP_TASK_AICPU;
}
const void *GetIOAddr() const override;
Status UpdateArgTable(const SingleOpModelParam &param) override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &input_buffers,
@@ -171,8 +146,6 @@ class AiCpuTask : public AiCpuBaseTask {
Status SetMemCopyTask(const domi::KernelExDef &kernel_def);
private:
Status SetIO(const vector<void *> &inputs, vector<void *> &outputs);
// for copy task.
Status InitForSummaryAndCopy();
Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
@@ -188,27 +161,31 @@ class AiCpuTask : public AiCpuBaseTask {
friend class AiCpuTaskBuilder;
void *workspace_addr_ = nullptr;
std::string task_info_;
// device addr
// device addr
void *args_ = nullptr;
size_t arg_size_ = 0;
std::string op_type_;
// device addr
void *io_addr_ = nullptr;
size_t io_addr_size_ = 0;
// host addr
std::vector<void *> io_addr_host_;
bool dynamic_flag_ = false;
// for copy task
void *copy_task_args_buf_;
void *copy_workspace_buf_;
void *copy_task_args_buf_ = nullptr;
void *copy_workspace_buf_ = nullptr;
std::vector<void *> output_summary_;
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;
void *copy_ioaddr_dev_;
void *copy_ioaddr_dev_ = nullptr;
void *copy_input_release_flag_dev_;
void *copy_input_data_size_dev_;
void *copy_input_src_dev_;
void *copy_input_dst_dev_;
void *copy_input_release_flag_dev_ = nullptr;
void *copy_input_data_size_dev_ = nullptr;
void *copy_input_src_dev_ = nullptr;
void *copy_input_dst_dev_ = nullptr;
vector<void *> out_shape_hbm_;
uint64_t kernel_id_ = 0;
@@ -222,13 +199,12 @@ class AiCpuCCTask : public AiCpuBaseTask {
AiCpuCCTask &operator=(const AiCpuCCTask &) = delete;
Status LaunchKernel(rtStream_t stream) override;
OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; }
const void *GetIOAddr() const override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
const void *GetArgs() const;
void SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size);
void SetSoName(const std::string &so_name);
void SetkernelName(const std::string &kernel_Name);
void SetIoAddr(void *io_addr);
void SetIoAddr(uintptr_t *io_addr);
size_t GetArgSize() const;
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
@@ -244,7 +220,8 @@ private:
std::unique_ptr<uint8_t[]> args_;
size_t arg_size_ = 0;
void *sm_desc_ = nullptr;
void *io_addr_ = nullptr;
uintptr_t *io_addr_ = nullptr;
size_t io_addr_num_ = 0;
bool is_custom_ = false;
uint32_t dump_flag_ = RT_KERNEL_DEFAULT;
};
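A sketch of how AiCpuCCTask can satisfy the new pure virtual, given that io_addr_ points just past the AicpuParamHead inside args_ (see the task-builder diff above); the actual body lives in the suppressed diff above:

    void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
      arg_base = io_addr_;       // host copy of the device pointer table
      arg_count = io_addr_num_;  // inputs + outputs
    }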
