|
|
@ -30,6 +30,14 @@ using mindspore::lite::KernelRegistrar;
|
|
|
|
|
|
|
|
|
|
|
|
namespace mindspore::kernel {
|
|
|
|
namespace mindspore::kernel {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {
|
|
|
|
|
|
|
|
if (weight_ptr_ != nullptr) {
|
|
|
|
|
|
|
|
auto allocator = runtime_->GetAllocator();
|
|
|
|
|
|
|
|
allocator->Free(weight_ptr_);
|
|
|
|
|
|
|
|
weight_ptr_ = nullptr;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
|
|
|
|
std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
|
|
|
|
const size_t global_x = out_tensors_[0]->Width();
|
|
|
|
const size_t global_x = out_tensors_[0]->Width();
|
|
|
|
const size_t global_y = out_tensors_[0]->Height();
|
|
|
|
const size_t global_y = out_tensors_[0]->Height();
|
|
|
@ -39,10 +47,18 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
|
|
|
|
void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
|
|
|
|
size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
|
|
|
|
|
|
|
|
size_t W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
local_size_ = {16, 16};
|
|
|
|
local_size_ = {16, 16};
|
|
|
|
global_size_ = {W, H};
|
|
|
|
if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
|
|
|
|
|
|
|
size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
|
|
|
|
|
|
|
|
size_t W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
global_size_ = {W, H};
|
|
|
|
|
|
|
|
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
|
|
|
|
|
|
|
|
size_t H = out_tensors_[0]->Batch();
|
|
|
|
|
|
|
|
size_t W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
global_size_ = {W, H};
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
MS_LOG(ERROR) << "Unspport data format " << out_tensors_[0]->GetFormat();
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
|
|
|
|
void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
|
|
|
@ -51,16 +67,16 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
|
|
|
int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
|
|
|
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
int H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
|
|
|
|
|
|
|
|
int W = out_tensors_[0]->Width() * CO4;
|
|
|
|
|
|
|
|
size_t im_dst_x, im_dst_y;
|
|
|
|
size_t im_dst_x, im_dst_y;
|
|
|
|
if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
|
|
|
if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
|
|
|
im_dst_x = W;
|
|
|
|
im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
im_dst_y = H;
|
|
|
|
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
|
|
|
|
|
|
|
|
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
|
|
|
|
|
|
|
|
im_dst_y = out_tensors_[0]->Batch();
|
|
|
|
|
|
|
|
im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * CO4;
|
|
|
|
MS_LOG(ERROR) << "Unspport data format " << out_tensors_[0]->GetFormat();
|
|
|
|
im_dst_x = out_tensors_[0]->Width();
|
|
|
|
return RET_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#ifdef ENABLE_FP16
|
|
|
|
#ifdef ENABLE_FP16
|
|
|
|
size_t img_dtype = CL_HALF_FLOAT;
|
|
|
|
size_t img_dtype = CL_HALF_FLOAT;
|
|
|
@ -73,11 +89,26 @@ int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_si
|
|
|
|
return RET_OK;
|
|
|
|
return RET_OK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int ArithmeticOpenCLKernel::InitBuffer() {
|
|
|
|
|
|
|
|
const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
|
|
|
|
|
|
|
|
if (!arithmetic_parameter->broadcasting_) {
|
|
|
|
|
|
|
|
if (in_tensors_[1]->TensorType() == schema::NodeType_ValueNode && in_tensors_[1]->Data() != nullptr) {
|
|
|
|
|
|
|
|
auto allocatdor = runtime_->GetAllocator();
|
|
|
|
|
|
|
|
std::vector<size_t> img_size;
|
|
|
|
|
|
|
|
GetImageSize(0, &img_size);
|
|
|
|
|
|
|
|
weight_ptr_ = allocatdor->CreateImageFromHost(in_tensors_[1]->Data(), in_tensors_[1]->ElementsNum(), img_size);
|
|
|
|
|
|
|
|
return RET_OK;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return RET_OK;
|
|
|
|
|
|
|
|
}
|
|
|
|
int ArithmeticOpenCLKernel::Init() {
|
|
|
|
int ArithmeticOpenCLKernel::Init() {
|
|
|
|
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
|
|
|
runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
|
|
|
|
std::string kernel_name;
|
|
|
|
std::string kernel_name;
|
|
|
|
|
|
|
|
|
|
|
|
if (in_tensors_[1]->TensorType() == schema::NodeType_ValueNode && in_tensors_[1]->Data() != nullptr) {
|
|
|
|
const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
|
|
|
|
|
|
|
|
if (arithmetic_parameter->broadcasting_ && in_tensors_[1]->TensorType() == schema::NodeType_ValueNode &&
|
|
|
|
|
|
|
|
in_tensors_[1]->Data() != nullptr) {
|
|
|
|
element_flag_ = false;
|
|
|
|
element_flag_ = false;
|
|
|
|
kernel_name = "BoardcastArith";
|
|
|
|
kernel_name = "BoardcastArith";
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
@ -103,7 +134,7 @@ int ArithmeticOpenCLKernel::Init() {
|
|
|
|
|
|
|
|
|
|
|
|
lite::STATUS error_code = RET_OK;
|
|
|
|
lite::STATUS error_code = RET_OK;
|
|
|
|
#ifdef PROGRAM_WITH_IL
|
|
|
|
#ifdef PROGRAM_WITH_IL
|
|
|
|
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
|
|
|
|
kernel_ = runtime_->GetKernelFromBinary(kernel_name);
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
if (out_mem_type_ == OpenCLMemType::IMG) {
|
|
|
|
if (out_mem_type_ == OpenCLMemType::IMG) {
|
|
|
|
kernel_name += "_IMG";
|
|
|
|
kernel_name += "_IMG";
|
|
|
@ -119,23 +150,31 @@ int ArithmeticOpenCLKernel::Init() {
|
|
|
|
if (error_code != RET_OK) {
|
|
|
|
if (error_code != RET_OK) {
|
|
|
|
return error_code;
|
|
|
|
return error_code;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
auto format = schema::Format_NHWC4;
|
|
|
|
|
|
|
|
if (arithmetic_parameter->ndim_ == 2) {
|
|
|
|
|
|
|
|
format = schema::Format_NC4;
|
|
|
|
|
|
|
|
}
|
|
|
|
in_ori_format_ = in_tensors_[0]->GetFormat();
|
|
|
|
in_ori_format_ = in_tensors_[0]->GetFormat();
|
|
|
|
in_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
|
|
|
|
|
|
|
out_ori_format_ = out_tensors_[0]->GetFormat();
|
|
|
|
out_ori_format_ = out_tensors_[0]->GetFormat();
|
|
|
|
out_tensors_[0]->SetFormat(schema::Format_NHWC4);
|
|
|
|
in_tensors_[0]->SetFormat(format);
|
|
|
|
|
|
|
|
if (element_flag_ && in_tensors_[1]->TensorType() != schema::NodeType_ValueNode) {
|
|
|
|
|
|
|
|
in_tensors_[1]->SetFormat(format);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
out_tensors_[0]->SetFormat(format);
|
|
|
|
Image2dGetWorkGroupSize();
|
|
|
|
Image2dGetWorkGroupSize();
|
|
|
|
|
|
|
|
InitBuffer();
|
|
|
|
return RET_OK;
|
|
|
|
return RET_OK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int ArithmeticOpenCLKernel::Run() {
|
|
|
|
int ArithmeticOpenCLKernel::Run() {
|
|
|
|
MS_LOG(DEBUG) << this->name() << " Running!";
|
|
|
|
MS_LOG(DEBUG) << this->name() << " Running!";
|
|
|
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int arg_idx = 0;
|
|
|
|
int arg_idx = 0;
|
|
|
|
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
|
|
|
|
|
|
|
|
if (element_flag_) {
|
|
|
|
if (element_flag_) {
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[1]->Data());
|
|
|
|
void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->Data() : weight_ptr_;
|
|
|
|
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, weight);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
float value = static_cast<float *>(in_tensors_[1]->Data())[0];
|
|
|
|
float value = static_cast<float *>(in_tensors_[1]->Data())[0];
|
|
|
|
switch (op_parameter_->type_) {
|
|
|
|
switch (op_parameter_->type_) {
|
|
|
@ -155,23 +194,47 @@ int ArithmeticOpenCLKernel::Run() {
|
|
|
|
MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
|
|
|
|
MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
|
|
|
|
break;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, weight_);
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, bias_);
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int H = 0;
|
|
|
|
|
|
|
|
int W = 0;
|
|
|
|
|
|
|
|
if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
|
|
|
|
|
|
|
|
H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
|
|
|
|
|
|
|
|
W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
|
|
|
|
|
|
|
|
H = out_tensors_[0]->Batch();
|
|
|
|
|
|
|
|
W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat();
|
|
|
|
|
|
|
|
return RET_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
|
|
|
|
|
|
|
|
int H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
|
|
|
|
|
|
|
|
int W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
|
|
|
|
|
|
|
|
cl_int2 output_shape{W, H};
|
|
|
|
cl_int2 output_shape{W, H};
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_idx++, output_shape);
|
|
|
|
runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
|
|
|
ocl_runtime->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
|
|
|
runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
|
|
|
return RET_OK;
|
|
|
|
return RET_OK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
|
|
|
|
|
|
|
const std::vector<lite::tensor::Tensor *> &outputs,
|
|
|
|
|
|
|
|
OpParameter *opParameter, const lite::Context *ctx,
|
|
|
|
|
|
|
|
const kernel::KernelKey &desc, const lite::PrimitiveC *primitive);
|
|
|
|
|
|
|
|
|
|
|
|
kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
|
|
|
kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
|
|
|
|
const std::vector<lite::tensor::Tensor *> &outputs,
|
|
|
|
const std::vector<lite::tensor::Tensor *> &outputs,
|
|
|
|
OpParameter *opParameter, const lite::Context *ctx,
|
|
|
|
OpParameter *opParameter, const lite::Context *ctx,
|
|
|
|
const kernel::KernelKey &desc,
|
|
|
|
const kernel::KernelKey &desc,
|
|
|
|
const mindspore::lite::PrimitiveC *primitive) {
|
|
|
|
const mindspore::lite::PrimitiveC *primitive) {
|
|
|
|
|
|
|
|
const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(opParameter);
|
|
|
|
|
|
|
|
if (arithmetic_parameter->broadcasting_) {
|
|
|
|
|
|
|
|
for (size_t i = 0; i < arithmetic_parameter->ndim_; i++) {
|
|
|
|
|
|
|
|
if (arithmetic_parameter->in_shape1_[i] != 0 && arithmetic_parameter->in_shape1_[i] != 1) {
|
|
|
|
|
|
|
|
return OpenCLBiasAddKernelCreator(inputs, outputs, opParameter, ctx, desc, primitive);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
auto *kernel =
|
|
|
|
auto *kernel =
|
|
|
|
new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, ctx);
|
|
|
|
new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, ctx);
|
|
|
|
if (kernel == nullptr) {
|
|
|
|
if (kernel == nullptr) {
|
|
|
|