!8242 optimize opencl runtime

Merge pull request !8242 from wandongdong/dev
pull/8242/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 0c588571b4

@ -1,8 +1,8 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min,
__kernel void DepthwiseConv2d_IMG_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data,
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min,
float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
@ -32,8 +32,8 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa
WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res);
}
__kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__write_only image2d_t dst_data, int2 kernel_size, int2 stride,
__kernel void DepthwiseConv2d_IMG_NHWC4_b222(__write_only image2d_t dst_data, __read_only image2d_t src_data,
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
float relu_clip_min, float relu_clip_max) {
int X = get_global_id(1) * 2;
@ -126,8 +126,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __g
}
}
}
__kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__write_only image2d_t dst_data, int2 kernel_size, int2 stride,
__kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __read_only image2d_t src_data,
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
float relu_clip_min, float relu_clip_max) {
int X = get_global_id(1) * 2;
@ -182,8 +182,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __g
WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
}
}
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__write_only image2d_t dst_data, int2 kernel_size, int2 stride,
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __read_only image2d_t src_data,
__global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
@ -215,8 +215,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]);
}
}
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
__global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding,
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter,
__global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding,
int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min,
float relu_clip_max) {
int X = get_global_id(0);
@ -247,10 +247,9 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res;
}
__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
__global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding,
int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min,
float relu_clip_max) {
__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter,
__global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation,
int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@ -279,8 +278,8 @@ __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
}
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
__global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding,
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter,
__global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding,
int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min,
float relu_clip_max) {
int X = get_global_id(0);

@ -1,62 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_
#include "src/runtime/opencl/opencl_runtime.h"
namespace mindspore {
namespace kernel {
/**
* MindSpore to OpenCL channel order.
* @param num_channels
* @return opencl_channels
*/
/**
 * Maps a MindSpore channel count to the OpenCL image channel order.
 * Marked inline: this is a definition in a header (see the include guard),
 * so without inline every translation unit including it would emit a
 * duplicate symbol and break linking.
 * @param num_channels number of image channels (1..4)
 * @return matching cl_channel_order, or -1 (converted to cl_uint) when unsupported
 */
inline cl_channel_order ToChannelOrder(int num_channels) {
  switch (num_channels) {
    case 1:
      return CL_R;
    case 2:
      return CL_RG;
    case 3:
      return CL_RGB;
    case 4:
      return CL_RGBA;
    default:
      // Sentinel for unsupported channel counts; callers must check.
      return -1;
  }
}
/**
* MindSpore image channel type to OpenCL channel data type.
* @param data_type
* @return opencl_data_type
*/
/**
 * Maps a MindSpore tensor data type to the OpenCL image channel data type.
 * Marked inline: defined in a header, so inline is required to avoid
 * multiple-definition link errors across translation units.
 * @param data_type MindSpore TypeId (only fp32/fp16 are supported here)
 * @return matching cl_channel_type, or -1 (converted to cl_uint) when unsupported
 */
inline cl_channel_type ToImageChannelType(TypeId data_type) {
  switch (data_type) {
    case kNumberTypeFloat32:
      return CL_FLOAT;
    case kNumberTypeFloat16:
      return CL_HALF_FLOAT;
    default:
      // Sentinel for unsupported types; callers must check.
      return -1;
  }
}
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_

@ -30,6 +30,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;
using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
@ -72,7 +73,7 @@ void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
}
}
int ArithmeticOpenCLKernel::InitBuffer() {
int ArithmeticOpenCLKernel::InitWeights() {
auto fp16_enable = ocl_runtime_->GetFp16Enable();
auto data_size = fp16_enable ? sizeof(float16_t) : sizeof(float);
for (auto in_tensor_ : in_tensors_) {
@ -255,7 +256,7 @@ int ArithmeticOpenCLKernel::Init() {
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
if (out_mem_type_ == OpenCLMemType::IMG) {
if (out_mem_type_ == MemType::IMG) {
kernel_name += "_IMG";
} else {
kernel_name += "_BUF";
@ -271,7 +272,7 @@ int ArithmeticOpenCLKernel::Init() {
}
Image2dGetWorkGroupSize();
InitBuffer();
InitWeights();
SetArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;

@ -32,7 +32,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
int Init() override;
int Run() override;
int InitBuffer() override;
int InitWeights() override;
int SetArgs();
private:

@ -35,7 +35,7 @@ using mindspore::schema::PrimitiveType_BiasAdd;
namespace mindspore::kernel {
int BiasAddOpenCLKernel::InitBuffer() {
int BiasAddOpenCLKernel::InitWeights() {
int C = in_tensors_[1]->shape()[0];
int div_ci = UP_DIV(C, C4NUM);
auto allocator = ocl_runtime_->GetAllocator();
@ -70,7 +70,7 @@ int BiasAddOpenCLKernel::Init() {
MS_LOG(ERROR) << "BiasAdd weight channel size:" << Bias_Size << " must be equal with in_teneors channel size:" << C;
return mindspore::lite::RET_ERROR;
}
InitBuffer();
InitWeights();
std::set<std::string> build_options;
std::string source = biasadd_source;
std::string program_name = "BiasAdd";

@ -35,7 +35,7 @@ class BiasAddOpenCLKernel : public OpenCLKernel {
int Init() override;
int Run() override;
int InitBuffer() override;
int InitWeights() override;
private:
cl_int4 GetGlobalshape();

@ -101,7 +101,7 @@ int ConvolutionOpenCLKernel::Init() {
winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
}
InitBuffer();
InitWeights();
MS_LOG(DEBUG) << "Convolution Init Done!";
return RET_OK;
@ -236,7 +236,7 @@ int ConvolutionOpenCLKernel::InitBias() {
return RET_OK;
}
int ConvolutionOpenCLKernel::InitBuffer() {
int ConvolutionOpenCLKernel::InitWeights() {
InitWeight();
if (has_bias_) {
InitBias();
@ -360,10 +360,13 @@ int ConvolutionOpenCLKernel::Run() {
}
if (use_winograd_) {
ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
ocl_runtime_->RunKernel(kernel_conv_, {size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}, {8, 6, 2},
nullptr);
ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
ocl_runtime_->RunKernel(kernel_4x4to36_, std::vector<size_t>({size_t(TILES_XY_), 6, size_t(CI_SLICES_)}),
std::vector<size_t>({8, 6, 4}), nullptr);
ocl_runtime_->RunKernel(kernel_conv_,
std::vector<size_t>({size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}),
std::vector<size_t>({8, 6, 2}), nullptr);
ocl_runtime_->RunKernel(kernel_36to4x4_, std::vector<size_t>({size_t(TILES_XY_), 4, size_t(CO_SLICES_)}),
std::vector<size_t>({32, 4, 2}), nullptr);
} else {
ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr);
}

@ -36,11 +36,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int Init() override;
int Run() override;
int InitBuffer() override;
int InitWeights() override;
void SetGlobalLocal() override;
private:
void SetBlockSize();
void SetGlobalLocal();
int InitWeight();
int InitBias();
int GenerateWinogradWeight();

@ -37,13 +37,21 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
int DepthwiseConv2dOpenCLKernel::Init() {
int DepthwiseConv2dOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}
return RET_OK;
}
int DepthwiseConv2dOpenCLKernel::Prepare() {
std::string kernel_name = "DepthwiseConv2d";
if (out_mem_type_ == OpenCLMemType::BUF) {
if (out_mem_type_ == MemType::BUF) {
kernel_name += "_BUF";
} else {
kernel_name += "_IMG";
@ -66,14 +74,14 @@ int DepthwiseConv2dOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
InitBuffer();
GetGlobalSize(0, &global_size_);
GetLocalSize(0, global_size_, &local_size_);
InitWeights();
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
return mindspore::lite::RET_OK;
}
int DepthwiseConv2dOpenCLKernel::InitBuffer() {
int DepthwiseConv2dOpenCLKernel::InitWeights() {
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
auto allocator = ocl_runtime_->GetAllocator();
bool is_fp16 = ocl_runtime_->GetFp16Enable();
@ -138,28 +146,7 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
}
return mindspore::lite::RET_OK;
}
// Computes the 3D global work size {channel-slices, width-blocks, height-blocks},
// each dimension shrunk by the corresponding block_size_ factor.
// NOTE(review): this diff removes the function in favor of SetGlobalLocal() below.
int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
// One work-item covers C4NUM * block_size_[2] output channels.
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]);
std::vector<size_t> global = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]),
(size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])};
*global_size = std::move(global);
return mindspore::lite::RET_OK;
}
// Derives a local work-group size {z, x, y} that fits within the device's
// maximum work-group size, prioritizing the channel dimension (z).
// NOTE(review): this diff removes the function in favor of SetGlobalLocal() below.
int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
std::vector<size_t> *local_size) {
const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
// z takes the full channel-slice extent; y is capped both by the remaining
// budget and by a divisor of the height extent (max 8); x fills what is left.
int z = global_size[0];
int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8));
int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z)));
local_size->clear();
*local_size = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});
return mindspore::lite::RET_OK;
}
int DepthwiseConv2dOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
@ -174,11 +161,9 @@ int DepthwiseConv2dOpenCLKernel::Run() {
cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
(cl_int)out_tensors_[0]->Batch()};
int arg_cnt = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
int arg_cnt = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
@ -187,31 +172,31 @@ int DepthwiseConv2dOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
return mindspore::lite::RET_OK;
}
void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
// set global
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]);
std::vector<size_t> global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]),
(size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])};
// set local
const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
int z = global_size[0];
int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8));
int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z)));
std::vector<size_t> local_size =
std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});
OpenCLKernel::AlignGlobalLocal(global_size, local_size);
}
kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel =
new (std::nothrow) DepthwiseConv2dOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
delete kernel;
MS_LOG(ERROR) << "Init DepthwiseConv2dOpenCLKernel failed!";
return nullptr;
}
return kernel;
// Binds the per-run tensor pointers and enqueues the kernel. Arg indices 0/1
// (dst first, then src) must match the reordered .cl kernel signature; the
// constant args were bound starting at index 2 in SetConstArgs().
int DepthwiseConv2dOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c());
// global_range_/local_range_ were computed once in SetGlobalLocal().
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return mindspore::lite::RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLKernelCreator<DepthwiseConv2dOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLKernelCreator<DepthwiseConv2dOpenCLKernel>)
} // namespace mindspore::kernel

@ -31,15 +31,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
~DepthwiseConv2dOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int InitBuffer() override;
int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
int CheckSpecs() override;
int InitWeights() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
void *packed_weight_{nullptr};

@ -57,7 +57,7 @@ int GatherOpenCLKernel::Init() {
return RET_OK;
}
int GatherOpenCLKernel::InitBuffer() {
int GatherOpenCLKernel::InitWeights() {
auto indices_tensor = in_tensors_.at(1);
int indices_num = indices_tensor->ElementsNum();
bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
@ -88,7 +88,7 @@ int GatherOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
if (InitBuffer() != RET_OK) {
if (InitWeights() != RET_OK) {
return RET_ERROR;
}
auto input_shape = in_tensors_[0]->shape();

@ -33,7 +33,7 @@ class GatherOpenCLKernel : public OpenCLKernel {
int Init() override;
int Run() override;
int InitBuffer() override;
int InitWeights() override;
private:
cl::Kernel kernel_;

@ -20,7 +20,6 @@
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/image_format.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc"
@ -32,6 +31,7 @@ using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INVALID_OP_NAME;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;
using mindspore::schema::PrimitiveType_Pooling;
namespace mindspore {
@ -73,7 +73,7 @@ int PoolingOpenCLKernel::Init() {
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
kernel_name += "_NHWC4";
if (out_mem_type_ == OpenCLMemType::BUF) {
if (out_mem_type_ == MemType::BUF) {
MS_LOG(ERROR) << "buffer output not support yet.";
return mindspore::lite::RET_ERROR;
} else {

@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_PReLU;
namespace mindspore::kernel {
int PReluOpenCLKernel::InitBuffer() {
int PReluOpenCLKernel::InitWeights() {
auto allocator = ocl_runtime_->GetAllocator();
auto weight_tensor = in_tensors_[1];
if (weight_is_scalar) {
@ -110,7 +110,7 @@ int PReluOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
InitBuffer();
InitWeights();
MS_LOG(DEBUG) << program_name << " init Done!";
return mindspore::lite::RET_OK;
}

@ -34,7 +34,7 @@ class PReluOpenCLKernel : public OpenCLKernel {
int Init() override;
int Run() override;
int InitBuffer() override;
int InitWeights() override;
private:
cl::Kernel kernel_;

@ -30,6 +30,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;
using mindspore::schema::PrimitiveType_Scale;
namespace mindspore::kernel {
@ -52,7 +53,7 @@ void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
global_size_ = {image2d_info.width, image2d_info.height};
}
int ScaleOpenCLKernel::InitBuffer() {
int ScaleOpenCLKernel::InitWeights() {
if (!weight_vector_flag_) {
return RET_OK;
}
@ -178,7 +179,7 @@ int ScaleOpenCLKernel::Init() {
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
if (out_mem_type_ == OpenCLMemType::IMG) {
if (out_mem_type_ == MemType::IMG) {
kernel_name += "_IMG";
} else {
kernel_name += "_BUF";
@ -194,7 +195,7 @@ int ScaleOpenCLKernel::Init() {
}
Image2dGetWorkGroupSize();
InitBuffer();
InitWeights();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

@ -32,7 +32,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
int Init() override;
int Run() override;
int InitBuffer() override;
int InitWeights() override;
private:
void Image2dGetWorkGroupSize();

@ -27,16 +27,40 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;
using mindspore::schema::PrimitiveType_ToFormat;
namespace mindspore::kernel {
int ToFormatOpenCLKernel::Init() {
int ToFormatOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}
auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
out_mem_type_ = parameter->out_mem_type;
return RET_OK;
}
// Binds the shape-dependent kernel arguments at indices 2 and 3; the data
// pointers at indices 0/1 are rebound per invocation in Run().
void ToFormatOpenCLKernel::SetConstArgs() {
cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
// Dispatch grid (N*H, W, ceil(C/4)); trailing 1 only pads the int4.
cl_int4 gsize{(cl_int)(N_ * H_), (cl_int)W_, (cl_int)UP_DIV(C_, C4NUM), 1};
ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
ocl_runtime_->SetKernelArg(kernel_, 3, shape);
}
void ToFormatOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
std::vector<size_t> local = {8, 16, 3};
size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
if (max_work_group_size < 384) {
local[2] = 1;
}
OpenCLKernel::AlignGlobalLocal(global, local);
}
int ToFormatOpenCLKernel::Prepare() {
std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}};
std::string kernel_name;
if (parameter->out_mem_type == OpenCLMemType::IMG) {
if (out_mem_type_ == MemType::IMG) {
kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()];
} else {
kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()];
@ -54,6 +78,8 @@ int ToFormatOpenCLKernel::Init() {
#endif
InitNHWC();
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -86,43 +112,14 @@ int ToFormatOpenCLKernel::InitNHWC() {
int ToFormatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
std::vector<size_t> local = {8, 16, 3};
size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())());
if (max_work_group_size < 384) {
local[2] = 1;
}
cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
auto src_mem_type = (out_mem_type_ == MemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
auto dst_mem_type = out_mem_type_;
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type);
ocl_runtime_->SetKernelArg(kernel_, 2, gsize);
ocl_runtime_->SetKernelArg(kernel_, 3, shape);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for ToFormatOpenCLKernel: allocates the kernel, runs Init(), and
// returns nullptr on any failure.
// NOTE(review): this diff removes the function in favor of the generic
// OpenCLKernelCreator<T> template.
kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
// No kernel took ownership of the parameter, so release it here.
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
// The kernel destructor handles cleanup of anything Init() acquired.
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLKernelCreator<ToFormatOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLKernelCreator<ToFormatOpenCLKernel>)
} // namespace mindspore::kernel

@ -30,8 +30,12 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~ToFormatOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
int InitNHWC();

@ -27,13 +27,11 @@ using mindspore::lite::RET_OK;
namespace mindspore::kernel {
enum class OpenCLMemType { BUF, IMG };
struct OpenCLToFormatParameter {
OpParameter op_parameter{};
schema::Format src_format{schema::Format::Format_NHWC};
schema::Format dst_format{schema::Format::Format_NHWC4};
OpenCLMemType out_mem_type{OpenCLMemType::IMG};
lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
};
struct Image2DInfo {
@ -107,13 +105,52 @@ class OpenCLKernel : public LiteKernel {
ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
}
~OpenCLKernel() override = default;
int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
std::vector<size_t> internal_global_ws = global;
for (size_t i = 0; i < local.size(); ++i) {
internal_global_ws[i] = UP_ROUND(global[i], local[i]);
}
int Init() override { return RET_ERROR; }
MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
for (size_t i = 0; i < global.size(); i++) {
MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
}
for (size_t i = 0; i < local.size(); i++) {
MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
}
if (global.size() == 1) {
global_range_ = cl::NDRange(internal_global_ws[0]);
if (!local.empty()) {
local_range_ = cl::NDRange(local[0]);
}
} else if (global.size() == 2) {
global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1]);
if (!local.empty()) {
local_range_ = cl::NDRange(local[0], local[1]);
}
} else if (global.size() == 3) {
global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]);
if (!local.empty()) {
local_range_ = cl::NDRange(local[0], local[1], local[2]);
}
} else {
MS_LOG(ERROR) << "Not supported NDRange!";
return RET_ERROR;
}
return RET_OK;
}
int Init() override { return RET_ERROR; } // !!!To be deleted
int Prepare() override { return RET_OK; }
int PreProcess() override { return RET_ERROR; }
int ReSize() override { return RET_ERROR; }
int Run() override { return RET_ERROR; }
virtual int InitBuffer() { return RET_OK; }
virtual int CheckSpecs() { return RET_ERROR; }
virtual int InitWeights() { return RET_OK; }
virtual void SetConstArgs() {}
virtual void SetGlobalLocal() {}
virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; }
virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {
return RET_ERROR;
@ -128,18 +165,39 @@ class OpenCLKernel : public LiteKernel {
return RET_OK;
}
OpenCLMemType GetMemType() { return out_mem_type_; }
void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
lite::opencl::MemType GetMemType() { return out_mem_type_; }
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
protected:
lite::opencl::OpenCLRuntime *ocl_runtime_;
OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
std::vector<size_t> global_size_;
std::vector<size_t> local_size_;
lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
cl::NDRange global_range_{cl::NullRange};
cl::NDRange local_range_{cl::NullRange};
std::vector<size_t> global_size_; // !!!To be deleted
std::vector<size_t> local_size_; // !!!To be deleted
private:
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
};
/**
 * Generic factory for OpenCL kernels: allocates a kernel of type T and
 * validates it with CheckSpecs() before handing it to the caller.
 * @tparam T an OpenCLKernel subclass constructible from (OpParameter*, inputs, outputs)
 * @return the new kernel, or nullptr on allocation/spec failure
 */
template <class T>
kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                        const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                        const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) T(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel " << opParameter->name_ << " is nullptr.";
    // No kernel took ownership of the parameter, so release it here.
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->CheckSpecs();
  if (ret != mindspore::lite::RET_OK) {
    // Log before deleting the kernel: the kernel may own opParameter, so
    // reading opParameter->name_ after `delete kernel` risks use-after-free.
    MS_LOG(ERROR) << "CheckSpecs " << opParameter->name_ << " failed!";
    delete kernel;
    return nullptr;
  }
  return kernel;
}
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_

@ -24,20 +24,22 @@
namespace mindspore::kernel {
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;
SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }
int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
const std::vector<lite::Tensor *> &in_tensors, const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
OpenCLMemType mem_type) {
MemType mem_type) {
for (size_t i = 0; i < in_tensors.size(); ++i) {
for (auto &jv : in_kernels.at(i)) {
auto tensors = (mem_type == OpenCLMemType::IMG) ? jv->in_tensors() : jv->out_tensors();
auto tensors = (mem_type == MemType::IMG) ? jv->in_tensors() : jv->out_tensors();
auto ft = std::find_if(tensors.begin(), tensors.end(),
[&in_tensors, &i](lite::Tensor *kv) { return kv == in_tensors.at(i); });
if (ft != tensors.end()) {
*ft = nullptr;
}
auto kernels = (mem_type == OpenCLMemType::IMG) ? jv->in_kernels() : jv->out_kernels();
auto kernels = (mem_type == MemType::IMG) ? jv->in_kernels() : jv->out_kernels();
std::replace_if(
kernels.begin(), kernels.end(),
[this, &in_tensors, &i](kernel::LiteKernel *kv) {
@ -47,7 +49,7 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
this->nodes_set_.count(kv) == 0;
},
nullptr);
if (mem_type == OpenCLMemType::IMG) {
if (mem_type == MemType::IMG) {
jv->set_in_tensors(tensors);
jv->SetInKernel(kernels);
} else {
@ -61,25 +63,24 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
const std::vector<kernel::LiteKernel *> &in_kernels,
lite::Tensor *new_tensor,
kernel::LiteKernel *in_convert_op,
OpenCLMemType mem_type) {
kernel::LiteKernel *in_convert_op, MemType mem_type) {
auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op);
for (auto &iv : in_kernels) {
auto kernels = (mem_type == OpenCLMemType::IMG) ? iv->in_kernels() : iv->out_kernels();
auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels();
auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == nullptr; });
if (fk != kernels.end()) {
*fk = in_convert_op;
} else {
kernels.emplace_back(in_convert_op);
}
auto tensors = (mem_type == OpenCLMemType::IMG) ? iv->in_tensors() : iv->out_tensors();
auto tensors = (mem_type == MemType::IMG) ? iv->in_tensors() : iv->out_tensors();
auto ft = std::find_if(tensors.begin(), tensors.end(), [&](lite::Tensor *kv) { return kv == nullptr; });
if (ft != tensors.end()) {
*ft = new_tensor;
} else {
tensors.emplace_back(new_tensor);
}
if (mem_type == OpenCLMemType::IMG) {
if (mem_type == MemType::IMG) {
iv->SetInKernel(kernels);
iv->set_in_tensors(tensors);
in_opencl_op->AddOutKernel(iv);
@ -95,22 +96,22 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
std::vector<lite::Tensor *> *out_tensors,
std::vector<OpenCLToFormatParameter *> *out_parameters,
std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type) {
std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
out_tensors->clear();
out_parameters->clear();
out_convert_ops->clear();
MS_ASSERT(in_tensors.size() == to_kernels.size());
MS_ASSERT(in_tensors.size() == from_kernels.size());
std::vector<std::vector<kernel::LiteKernel *>> loop_kernels;
if (mem_type == OpenCLMemType::BUF) {
if (mem_type == MemType::BUF) {
GetKernelFromToTensor(in_tensors, nodes_, &loop_kernels, true);
}
ReplaceOutTensorAndKernelToNull(in_tensors, in_kernels, mem_type);
for (size_t i = 0; i < in_tensors.size(); ++i) {
auto dst_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC;
auto src_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4;
auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC;
auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4;
auto *new_tensor = new (std::nothrow) lite::Tensor();
MS_ASSERT(new_tensor);
if (new_tensor == nullptr) {
@ -118,7 +119,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
return RET_ERROR;
}
new_tensor->CopyTensor(*in_tensors[i]);
if (mem_type == OpenCLMemType::IMG) {
if (mem_type == MemType::IMG) {
new_tensor->SetFormat(dst_format);
in_tensors[i]->SetFormat(src_format);
} else {
@ -128,7 +129,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
out_tensors->emplace_back(new_tensor);
KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat};
if (mem_type == OpenCLMemType::IMG && ocl_runtime_->GetFp16Enable()) {
if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) {
desc.data_type = kNumberTypeFloat16;
new_tensor->set_data_type(kNumberTypeFloat16);
}
@ -146,7 +147,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
parameter->out_mem_type = mem_type;
out_parameters->emplace_back(parameter);
LiteKernel *in_convert_op = nullptr;
if (mem_type == OpenCLMemType::IMG) {
if (mem_type == MemType::IMG) {
in_convert_op = lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter),
context_, desc);
} else {
@ -166,7 +167,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
ReplaceOutTensorAndKernelToConvert(in_tensors.at(i), in_kernels.at(i), new_tensor, in_convert_op, mem_type);
// replace in_tensor of inner kernel which use out tensor
if (mem_type == OpenCLMemType::BUF) {
if (mem_type == MemType::BUF) {
for (auto &iv : loop_kernels[i]) {
auto tensors = iv->in_tensors();
auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i));
@ -196,8 +197,8 @@ int SubGraphOpenCLKernel::Init() {
std::vector<std::vector<kernel::LiteKernel *>> from_kernels_;
GetKernelFromToTensor(in_tensors_, in_nodes_, &from_kernels_, true);
int ret = GenToFormatOp(in_tensors_, from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_,
OpenCLMemType::IMG);
int ret =
GenToFormatOp(in_tensors_, from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, MemType::IMG);
if (ret != RET_OK) {
return ret;
}
@ -205,8 +206,8 @@ int SubGraphOpenCLKernel::Init() {
std::vector<std::vector<kernel::LiteKernel *>> to_kernels_;
GetKernelFromToTensor(out_tensors_, out_nodes_, &to_kernels_, false);
ret = GenToFormatOp(out_tensors_, to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_,
OpenCLMemType::BUF);
ret =
GenToFormatOp(out_tensors_, to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, MemType::BUF);
if (ret != RET_OK) {
return ret;
}
@ -216,6 +217,11 @@ int SubGraphOpenCLKernel::Init() {
MallocTensorWithReuse();
ret = SubGraphKernel::Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "OpenCL prepare fail";
return ret;
}
return RET_OK;
}
@ -246,7 +252,7 @@ int SubGraphOpenCLKernel::MallocTensorWithReuse() {
for (auto i = 0; i < outputs.size(); ++i) {
auto *output = outputs.at(i);
MS_ASSERT(nullptr != output);
if (op_kernel->GetMemType() == OpenCLMemType::IMG) {
if (op_kernel->GetMemType() == MemType::IMG) {
std::vector<size_t> img_size;
op_kernel->GetImageSize(i, &img_size);
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
@ -328,11 +334,6 @@ int SubGraphOpenCLKernel::Prepare() {
MS_LOG(ERROR) << "OpenCL subgraph init fail";
return ret;
}
ret = SubGraphKernel::Prepare();
if (ret != RET_OK) {
MS_LOG(ERROR) << "OpenCL prepare fail";
return ret;
}
return RET_OK;
}

@ -61,15 +61,15 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
int MallocTensorWithReuse();
int ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
OpenCLMemType mem_type);
lite::opencl::MemType mem_type);
int ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
const std::vector<kernel::LiteKernel *> &in_kernels, lite::Tensor *new_tensor,
kernel::LiteKernel *in_convert_op, OpenCLMemType mem_type);
kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type);
int GetInOutNodes();
int GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
std::vector<lite::Tensor *> *out_tensors, std::vector<OpenCLToFormatParameter *> *out_parameters,
std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type);
std::vector<LiteKernel *> *out_convert_ops, lite::opencl::MemType mem_type);
int GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
const std::vector<kernel::LiteKernel *> &in_kernels,
std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);

@ -24,6 +24,7 @@
#include "src/common/file_utils.h"
using mindspore::lite::KernelRegistrar;
using mindspore::lite::opencl::MemType;
namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@ -243,20 +244,20 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) {
return 0;
}
void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, const std::string &out_file) {
void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) {
if (tensor->data_c() == nullptr) {
return;
}
Image2DInfo img_info(tensor);
auto size = mem_type == OpenCLMemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
auto size = mem_type == MemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
std::vector<char> data(size);
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
auto runtime = runtime_wrapper.GetInstance();
auto allocator = runtime->GetAllocator();
runtime->SyncCommandQueue();
allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true);
if (mem_type == OpenCLMemType::BUF) {
if (mem_type == MemType::BUF) {
memcpy(data.data(), tensor->data_c(), img_info.OriginSize);
} else {
auto row_size = img_info.width * img_info.FLT4_size;
@ -277,7 +278,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons
}
printf(") ");
auto num = mem_type == OpenCLMemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
auto num = mem_type == MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
for (int i = 0; i < n && i < num; ++i) {
if (tensor->data_type() == kNumberTypeFloat16) {
printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);

@ -48,7 +48,8 @@ std::string CLErrorCode(cl_int error_code);
int WriteToBin(const std::string &file_path, void *data, size_t size);
void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n = 10, const std::string &out_file = "");
void PrintTensor(const lite::Tensor *tensor, lite::opencl::MemType mem_type, int n = 10,
const std::string &out_file = "");
void PrintKernelOutput(OpenCLKernel *kernel, int n = 10, const std::string &out_file = "");

@ -41,7 +41,7 @@ struct OpenclMemory {
};
class OpenCLRuntime;
enum class MemType : char { SVM, BUF, IMG };
enum class MemType : char { BUF, IMG };
class OpenCLAllocator : public Allocator {
public:

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save