diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl index 44f3d0a051..329f641fa7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl @@ -1,8 +1,8 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, - int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, +__kernel void DepthwiseConv2d_IMG_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data, + __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -32,8 +32,8 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res); } -__kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, +__kernel void DepthwiseConv2d_IMG_NHWC4_b222(__write_only image2d_t dst_data, __read_only image2d_t src_data, + __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(1) * 2; @@ -126,8 +126,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __g } } } -__kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, +__kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __read_only image2d_t src_data, + __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(1) * 2; @@ -182,8 +182,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __g WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]); } } -__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, +__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __read_only image2d_t src_data, + __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); @@ -215,8 +215,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]); } } -__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, - __global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding, +__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter, + __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); @@ -247,10 +247,9 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res; } -__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, - __global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding, - int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, - float relu_clip_max) { +__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter, + __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, + int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -279,8 +278,8 @@ __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 * dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; } -__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, - __global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding, +__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter, + __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { int X = get_global_id(0); diff --git a/mindspore/lite/src/runtime/kernel/opencl/image_format.h b/mindspore/lite/src/runtime/kernel/opencl/image_format.h deleted file mode 100644 index 018292ad81..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/image_format.h +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_ - -#include "src/runtime/opencl/opencl_runtime.h" - -namespace mindspore { -namespace kernel { -/** - * MindSpore to OpenCL channel order. - * @param num_channels - * @return opencl_channels - */ -cl_channel_order ToChannelOrder(int num_channels) { - switch (num_channels) { - case 1: - return CL_R; - case 2: - return CL_RG; - case 3: - return CL_RGB; - case 4: - return CL_RGBA; - default: - return -1; - } -} - -/** - * MindSpore image channel type to OpenCL channel data type. - * @param data_type - * @return opencl_data_type - */ -cl_channel_type ToImageChannelType(TypeId data_type) { - switch (data_type) { - case kNumberTypeFloat32: - return CL_FLOAT; - case kNumberTypeFloat16: - return CL_HALF_FLOAT; - default: - return -1; - } -} -} // namespace kernel -} // namespace mindspore - -#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index c57c9f2104..6f84611095 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -30,6 +30,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::lite::opencl::MemType; using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { @@ -72,7 +73,7 @@ void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { } } -int ArithmeticOpenCLKernel::InitBuffer() { +int ArithmeticOpenCLKernel::InitWeights() { auto fp16_enable = ocl_runtime_->GetFp16Enable(); auto data_size = fp16_enable ? sizeof(float16_t) : sizeof(float); for (auto in_tensor_ : in_tensors_) { @@ -255,7 +256,7 @@ int ArithmeticOpenCLKernel::Init() { #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else - if (out_mem_type_ == OpenCLMemType::IMG) { + if (out_mem_type_ == MemType::IMG) { kernel_name += "_IMG"; } else { kernel_name += "_BUF"; @@ -271,7 +272,7 @@ int ArithmeticOpenCLKernel::Init() { } Image2dGetWorkGroupSize(); - InitBuffer(); + InitWeights(); SetArgs(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index bc9f2f6ffb..6173e66e52 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -32,7 +32,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; - int InitBuffer() override; + int InitWeights() override; int SetArgs(); private: diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc index d83eb6ebc4..48ec0d9c94 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc @@ -35,7 +35,7 @@ using mindspore::schema::PrimitiveType_BiasAdd; namespace mindspore::kernel { -int BiasAddOpenCLKernel::InitBuffer() { +int BiasAddOpenCLKernel::InitWeights() { int C = in_tensors_[1]->shape()[0]; int div_ci = UP_DIV(C, C4NUM); auto allocator = ocl_runtime_->GetAllocator(); @@ -70,7 +70,7 @@ int BiasAddOpenCLKernel::Init() { MS_LOG(ERROR) << "BiasAdd weight channel size:" << Bias_Size << " must be equal with in_teneors channel size:" << C; return mindspore::lite::RET_ERROR; } - InitBuffer(); + InitWeights(); std::set build_options; std::string source = biasadd_source; std::string program_name = "BiasAdd"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h index 625e69c0c6..d6e03a12a0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h @@ -35,7 +35,7 @@ class BiasAddOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; - int InitBuffer() override; + int InitWeights() override; private: cl_int4 GetGlobalshape(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index 8ce376a204..3909b57589 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -101,7 +101,7 @@ int ConvolutionOpenCLKernel::Init() { winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); } - InitBuffer(); + InitWeights(); MS_LOG(DEBUG) << "Convolution Init Done!"; return RET_OK; @@ -236,7 +236,7 @@ int ConvolutionOpenCLKernel::InitBias() { return RET_OK; } -int ConvolutionOpenCLKernel::InitBuffer() { +int ConvolutionOpenCLKernel::InitWeights() { InitWeight(); if (has_bias_) { InitBias(); @@ -360,10 +360,13 @@ int ConvolutionOpenCLKernel::Run() { } if (use_winograd_) { - ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr); - ocl_runtime_->RunKernel(kernel_conv_, {size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}, {8, 6, 2}, - nullptr); - ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr); + ocl_runtime_->RunKernel(kernel_4x4to36_, std::vector({size_t(TILES_XY_), 6, size_t(CI_SLICES_)}), + std::vector({8, 6, 4}), nullptr); + ocl_runtime_->RunKernel(kernel_conv_, + std::vector({size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}), + std::vector({8, 6, 2}), nullptr); + ocl_runtime_->RunKernel(kernel_36to4x4_, std::vector({size_t(TILES_XY_), 4, size_t(CO_SLICES_)}), + std::vector({32, 4, 2}), nullptr); } else { ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index d273e5ec2b..a76386d05c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -36,11 +36,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; - int InitBuffer() override; + int InitWeights() override; + void SetGlobalLocal() override; private: void SetBlockSize(); - void SetGlobalLocal(); int InitWeight(); int InitBias(); int GenerateWinogradWeight(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 9cc826f945..3fabd425ab 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -37,13 +37,21 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::lite::opencl::MemType; using mindspore::schema::PrimitiveType_DepthwiseConv2D; namespace mindspore::kernel { -int DepthwiseConv2dOpenCLKernel::Init() { +int DepthwiseConv2dOpenCLKernel::CheckSpecs() { + if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type(); + return RET_ERROR; + } + return RET_OK; +} +int DepthwiseConv2dOpenCLKernel::Prepare() { std::string kernel_name = "DepthwiseConv2d"; - if (out_mem_type_ == OpenCLMemType::BUF) { + if (out_mem_type_ == MemType::BUF) { kernel_name += "_BUF"; } else { kernel_name += "_IMG"; @@ -66,14 +74,14 @@ int DepthwiseConv2dOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - InitBuffer(); - GetGlobalSize(0, &global_size_); - GetLocalSize(0, global_size_, &local_size_); + InitWeights(); + SetGlobalLocal(); + SetConstArgs(); MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast(out_mem_type_); return mindspore::lite::RET_OK; } -int DepthwiseConv2dOpenCLKernel::InitBuffer() { +int DepthwiseConv2dOpenCLKernel::InitWeights() { auto parameter = reinterpret_cast(op_parameter_); auto allocator = ocl_runtime_->GetAllocator(); bool is_fp16 = ocl_runtime_->GetFp16Enable(); @@ -138,28 +146,7 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { } return mindspore::lite::RET_OK; } - -int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { - size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); - std::vector global = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), - (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; - *global_size = std::move(global); - return mindspore::lite::RET_OK; -} - -int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector &global_size, - std::vector *local_size) { - const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); - int z = global_size[0]; - int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); - int x = std::max(1, std::min(static_cast(global_size[1]), max_group_size / (y * z))); - local_size->clear(); - *local_size = std::vector({static_cast(z), static_cast(x), static_cast(y)}); - return mindspore::lite::RET_OK; -} - -int DepthwiseConv2dOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; +void DepthwiseConv2dOpenCLKernel::SetConstArgs() { auto parameter = reinterpret_cast(op_parameter_); size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM); @@ -174,11 +161,9 @@ int DepthwiseConv2dOpenCLKernel::Run() { cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, (cl_int)out_tensors_[0]->Batch()}; - int arg_cnt = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); + int arg_cnt = 2; ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); @@ -187,31 +172,31 @@ int DepthwiseConv2dOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second); - ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); - return mindspore::lite::RET_OK; +} +void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { + // set global + size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); + std::vector global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), + (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; + // set local + const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); + int z = global_size[0]; + int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); + int x = std::max(1, std::min(static_cast(global_size[1]), max_group_size / (y * z))); + std::vector local_size = + std::vector({static_cast(z), static_cast(x), static_cast(y)}); + + OpenCLKernel::AlignGlobalLocal(global_size, local_size); } -kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector &inputs, - const std::vector &outputs, - OpParameter *opParameter, const lite::InnerContext *ctx, - const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = - new (std::nothrow) DepthwiseConv2dOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != mindspore::lite::RET_OK) { - delete kernel; - MS_LOG(ERROR) << "Init DepthwiseConv2dOpenCLKernel failed!"; - return nullptr; - } - return kernel; +int DepthwiseConv2dOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); + return mindspore::lite::RET_OK; } -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h index 58bcbb5edb..5d059b4ee0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h @@ -31,15 +31,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { ~DepthwiseConv2dOpenCLKernel() override = default; - int Init() override; - int Run() override; + int Prepare() override; - int InitBuffer() override; - - int GetGlobalSize(size_t idx, std::vector *global_size) override; - - int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) override; + int CheckSpecs() override; + int InitWeights() override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: void *packed_weight_{nullptr}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc index 42a8b56fda..24ec31f732 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc @@ -57,7 +57,7 @@ int GatherOpenCLKernel::Init() { return RET_OK; } -int GatherOpenCLKernel::InitBuffer() { +int GatherOpenCLKernel::InitWeights() { auto indices_tensor = in_tensors_.at(1); int indices_num = indices_tensor->ElementsNum(); bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32; @@ -88,7 +88,7 @@ int GatherOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running! "; auto param = reinterpret_cast(this->op_parameter_); - if (InitBuffer() != RET_OK) { + if (InitWeights() != RET_OK) { return RET_ERROR; } auto input_shape = in_tensors_[0]->shape(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h index a20870fd31..236c2b6a9c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h @@ -33,7 +33,7 @@ class GatherOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; - int InitBuffer() override; + int InitWeights() override; private: cl::Kernel kernel_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 254b4c6df7..7461c56545 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -20,7 +20,6 @@ #include "include/errorcode.h" #include "src/kernel_registry.h" #include "src/runtime/kernel/opencl/utils.h" -#include "src/runtime/kernel/opencl/image_format.h" #ifndef PROGRAM_WITH_IL #include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc" #include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc" @@ -32,6 +31,7 @@ using mindspore::lite::RET_ERROR; using mindspore::lite::RET_INVALID_OP_NAME; using mindspore::lite::RET_MEMORY_FAILED; using mindspore::lite::RET_OK; +using mindspore::lite::opencl::MemType; using mindspore::schema::PrimitiveType_Pooling; namespace mindspore { @@ -73,7 +73,7 @@ int PoolingOpenCLKernel::Init() { kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else kernel_name += "_NHWC4"; - if (out_mem_type_ == OpenCLMemType::BUF) { + if (out_mem_type_ == MemType::BUF) { MS_LOG(ERROR) << "buffer output not support yet."; return mindspore::lite::RET_ERROR; } else { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc index 6a3f22a77d..fc2cc555c9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc @@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_PReLU; namespace mindspore::kernel { -int PReluOpenCLKernel::InitBuffer() { +int PReluOpenCLKernel::InitWeights() { auto allocator = ocl_runtime_->GetAllocator(); auto weight_tensor = in_tensors_[1]; if (weight_is_scalar) { @@ -110,7 +110,7 @@ int PReluOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); - InitBuffer(); + InitWeights(); MS_LOG(DEBUG) << program_name << " init Done!"; return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h index f54b3b5f5e..78c3c6a369 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h @@ -34,7 +34,7 @@ class PReluOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; - int InitBuffer() override; + int InitWeights() override; private: cl::Kernel kernel_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc index b68441b438..5e1bd1234e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -30,6 +30,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::lite::opencl::MemType; using mindspore::schema::PrimitiveType_Scale; namespace mindspore::kernel { @@ -52,7 +53,7 @@ void ScaleOpenCLKernel::Image2dGetWorkGroupSize() { global_size_ = {image2d_info.width, image2d_info.height}; } -int ScaleOpenCLKernel::InitBuffer() { +int ScaleOpenCLKernel::InitWeights() { if (!weight_vector_flag_) { return RET_OK; } @@ -178,7 +179,7 @@ int ScaleOpenCLKernel::Init() { #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else - if (out_mem_type_ == OpenCLMemType::IMG) { + if (out_mem_type_ == MemType::IMG) { kernel_name += "_IMG"; } else { kernel_name += "_BUF"; @@ -194,7 +195,7 @@ int ScaleOpenCLKernel::Init() { } Image2dGetWorkGroupSize(); - InitBuffer(); + InitWeights(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h index 6aeb16c18c..6273067a15 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h @@ -32,7 +32,7 @@ class ScaleOpenCLKernel : public OpenCLKernel { int Init() override; int Run() override; - int InitBuffer() override; + int InitWeights() override; private: void Image2dGetWorkGroupSize(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 83dae3327a..640a147cd6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -27,16 +27,40 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::lite::opencl::MemType; using mindspore::schema::PrimitiveType_ToFormat; namespace mindspore::kernel { -int ToFormatOpenCLKernel::Init() { +int ToFormatOpenCLKernel::CheckSpecs() { + if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type(); + return RET_ERROR; + } auto parameter = reinterpret_cast(op_parameter_); out_mem_type_ = parameter->out_mem_type; + return RET_OK; +} +void ToFormatOpenCLKernel::SetConstArgs() { + cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_}; + cl_int4 gsize{(cl_int)(N_ * H_), (cl_int)W_, (cl_int)UP_DIV(C_, C4NUM), 1}; + ocl_runtime_->SetKernelArg(kernel_, 2, gsize); + ocl_runtime_->SetKernelArg(kernel_, 3, shape); +} +void ToFormatOpenCLKernel::SetGlobalLocal() { + std::vector global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; + std::vector local = {8, 16, 3}; + size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); + if (max_work_group_size < 384) { + local[2] = 1; + } + OpenCLKernel::AlignGlobalLocal(global, local); +} + +int ToFormatOpenCLKernel::Prepare() { std::map dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}}; std::string kernel_name; - if (parameter->out_mem_type == OpenCLMemType::IMG) { + if (out_mem_type_ == MemType::IMG) { kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()]; } else { kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()]; @@ -54,6 +78,8 @@ int ToFormatOpenCLKernel::Init() { #endif InitNHWC(); + SetGlobalLocal(); + SetConstArgs(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -86,43 +112,14 @@ int ToFormatOpenCLKernel::InitNHWC() { int ToFormatOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; - std::vector global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; - std::vector local = {8, 16, 3}; - size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); - if (max_work_group_size < 384) { - local[2] = 1; - } - cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_}; - cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; - - auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; - auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF; + auto src_mem_type = (out_mem_type_ == MemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; + auto dst_mem_type = out_mem_type_; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type); - ocl_runtime_->SetKernelArg(kernel_, 2, gsize); - ocl_runtime_->SetKernelArg(kernel_, 3, shape); - ocl_runtime_->RunKernel(kernel_, global, local, nullptr); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); return RET_OK; } -kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != mindspore::lite::RET_OK) { - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h index d1f39f457f..b18b3c7a80 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h @@ -30,8 +30,12 @@ class ToFormatOpenCLKernel : public OpenCLKernel { : OpenCLKernel(parameter, inputs, outputs) {} ~ToFormatOpenCLKernel() override = default; - int Init() override; int Run() override; + int Prepare() override; + + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: int InitNHWC(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index f29e2965a1..295369daf9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -27,13 +27,11 @@ using mindspore::lite::RET_OK; namespace mindspore::kernel { -enum class OpenCLMemType { BUF, IMG }; - struct OpenCLToFormatParameter { OpParameter op_parameter{}; schema::Format src_format{schema::Format::Format_NHWC}; schema::Format dst_format{schema::Format::Format_NHWC4}; - OpenCLMemType out_mem_type{OpenCLMemType::IMG}; + lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG}; }; struct Image2DInfo { @@ -107,13 +105,52 @@ class OpenCLKernel : public LiteKernel { ocl_runtime_ = ocl_runtime_wrap_.GetInstance(); } ~OpenCLKernel() override = default; + int AlignGlobalLocal(const std::vector &global, const std::vector &local) { + std::vector internal_global_ws = global; + for (size_t i = 0; i < local.size(); ++i) { + internal_global_ws[i] = UP_ROUND(global[i], local[i]); + } - int Init() override { return RET_ERROR; } + MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size(); + for (size_t i = 0; i < global.size(); i++) { + MS_LOG(DEBUG) << "global[" << i << "] = " << global[i]; + } + for (size_t i = 0; i < local.size(); i++) { + MS_LOG(DEBUG) << "local[" << i << "] = " << local[i]; + } + + if (global.size() == 1) { + global_range_ = cl::NDRange(internal_global_ws[0]); + if (!local.empty()) { + local_range_ = cl::NDRange(local[0]); + } + } else if (global.size() == 2) { + global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1]); + if (!local.empty()) { + local_range_ = cl::NDRange(local[0], local[1]); + } + } else if (global.size() == 3) { + global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]); + if (!local.empty()) { + local_range_ = cl::NDRange(local[0], local[1], local[2]); + } + } else { + MS_LOG(ERROR) << "Not supported NDRange!"; + return RET_ERROR; + } + return RET_OK; + } + + int Init() override { return RET_ERROR; } // !!!To be deleted + int Prepare() override { return RET_OK; } int PreProcess() override { return RET_ERROR; } int ReSize() override { return RET_ERROR; } int Run() override { return RET_ERROR; } - virtual int InitBuffer() { return RET_OK; } + virtual int CheckSpecs() { return RET_ERROR; } + virtual int InitWeights() { return RET_OK; } + virtual void SetConstArgs() {} + virtual void SetGlobalLocal() {} virtual int GetGlobalSize(size_t idx, std::vector *global_size) { return RET_ERROR; } virtual int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) { return RET_ERROR; @@ -128,18 +165,39 @@ class OpenCLKernel : public LiteKernel { return RET_OK; } - OpenCLMemType GetMemType() { return out_mem_type_; } - void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } + lite::opencl::MemType GetMemType() { return out_mem_type_; } + void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } protected: lite::opencl::OpenCLRuntime *ocl_runtime_; - OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; - std::vector global_size_; - std::vector local_size_; + lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; + cl::NDRange global_range_{cl::NullRange}; + cl::NDRange local_range_{cl::NullRange}; + std::vector global_size_; // !!!To be deleted + std::vector local_size_; // !!!To be deleted private: lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; }; +template +kernel::LiteKernel *OpenCLKernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::InnerContext *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = new (std::nothrow) T(reinterpret_cast(opParameter), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; + free(opParameter); + return nullptr; + } + auto ret = kernel->CheckSpecs(); + if (ret != mindspore::lite::RET_OK) { + delete kernel; + MS_LOG(ERROR) << "Init " << opParameter->name_ << " failed!"; + return nullptr; + } + return kernel; +} } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index c77acbc360..b425779894 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -24,20 +24,22 @@ namespace mindspore::kernel { using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::lite::opencl::MemType; + SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull( const std::vector &in_tensors, const std::vector> &in_kernels, - OpenCLMemType mem_type) { + MemType mem_type) { for (size_t i = 0; i < in_tensors.size(); ++i) { for (auto &jv : in_kernels.at(i)) { - auto tensors = (mem_type == OpenCLMemType::IMG) ? jv->in_tensors() : jv->out_tensors(); + auto tensors = (mem_type == MemType::IMG) ? jv->in_tensors() : jv->out_tensors(); auto ft = std::find_if(tensors.begin(), tensors.end(), [&in_tensors, &i](lite::Tensor *kv) { return kv == in_tensors.at(i); }); if (ft != tensors.end()) { *ft = nullptr; } - auto kernels = (mem_type == OpenCLMemType::IMG) ? jv->in_kernels() : jv->out_kernels(); + auto kernels = (mem_type == MemType::IMG) ? jv->in_kernels() : jv->out_kernels(); std::replace_if( kernels.begin(), kernels.end(), [this, &in_tensors, &i](kernel::LiteKernel *kv) { @@ -47,7 +49,7 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull( this->nodes_set_.count(kv) == 0; }, nullptr); - if (mem_type == OpenCLMemType::IMG) { + if (mem_type == MemType::IMG) { jv->set_in_tensors(tensors); jv->SetInKernel(kernels); } else { @@ -61,25 +63,24 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull( int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor, const std::vector &in_kernels, lite::Tensor *new_tensor, - kernel::LiteKernel *in_convert_op, - OpenCLMemType mem_type) { + kernel::LiteKernel *in_convert_op, MemType mem_type) { auto in_opencl_op = reinterpret_cast(in_convert_op); for (auto &iv : in_kernels) { - auto kernels = (mem_type == OpenCLMemType::IMG) ? iv->in_kernels() : iv->out_kernels(); + auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels(); auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == nullptr; }); if (fk != kernels.end()) { *fk = in_convert_op; } else { kernels.emplace_back(in_convert_op); } - auto tensors = (mem_type == OpenCLMemType::IMG) ? iv->in_tensors() : iv->out_tensors(); + auto tensors = (mem_type == MemType::IMG) ? iv->in_tensors() : iv->out_tensors(); auto ft = std::find_if(tensors.begin(), tensors.end(), [&](lite::Tensor *kv) { return kv == nullptr; }); if (ft != tensors.end()) { *ft = new_tensor; } else { tensors.emplace_back(new_tensor); } - if (mem_type == OpenCLMemType::IMG) { + if (mem_type == MemType::IMG) { iv->SetInKernel(kernels); iv->set_in_tensors(tensors); in_opencl_op->AddOutKernel(iv); @@ -95,22 +96,22 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te const std::vector> &in_kernels, std::vector *out_tensors, std::vector *out_parameters, - std::vector *out_convert_ops, OpenCLMemType mem_type) { + std::vector *out_convert_ops, MemType mem_type) { out_tensors->clear(); out_parameters->clear(); out_convert_ops->clear(); MS_ASSERT(in_tensors.size() == to_kernels.size()); MS_ASSERT(in_tensors.size() == from_kernels.size()); std::vector> loop_kernels; - if (mem_type == OpenCLMemType::BUF) { + if (mem_type == MemType::BUF) { GetKernelFromToTensor(in_tensors, nodes_, &loop_kernels, true); } ReplaceOutTensorAndKernelToNull(in_tensors, in_kernels, mem_type); for (size_t i = 0; i < in_tensors.size(); ++i) { - auto dst_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC; - auto src_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4; + auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC; + auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4; auto *new_tensor = new (std::nothrow) lite::Tensor(); MS_ASSERT(new_tensor); if (new_tensor == nullptr) { @@ -118,7 +119,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te return RET_ERROR; } new_tensor->CopyTensor(*in_tensors[i]); - if (mem_type == OpenCLMemType::IMG) { + if (mem_type == MemType::IMG) { new_tensor->SetFormat(dst_format); in_tensors[i]->SetFormat(src_format); } else { @@ -128,7 +129,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te out_tensors->emplace_back(new_tensor); KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; - if (mem_type == OpenCLMemType::IMG && ocl_runtime_->GetFp16Enable()) { + if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) { desc.data_type = kNumberTypeFloat16; new_tensor->set_data_type(kNumberTypeFloat16); } @@ -146,7 +147,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te parameter->out_mem_type = mem_type; out_parameters->emplace_back(parameter); LiteKernel *in_convert_op = nullptr; - if (mem_type == OpenCLMemType::IMG) { + if (mem_type == MemType::IMG) { in_convert_op = lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast(parameter), context_, desc); } else { @@ -166,7 +167,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te ReplaceOutTensorAndKernelToConvert(in_tensors.at(i), in_kernels.at(i), new_tensor, in_convert_op, mem_type); // replace in_tensor of inner kernel which use out tensor - if (mem_type == OpenCLMemType::BUF) { + if (mem_type == MemType::BUF) { for (auto &iv : loop_kernels[i]) { auto tensors = iv->in_tensors(); auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i)); @@ -196,8 +197,8 @@ int SubGraphOpenCLKernel::Init() { std::vector> from_kernels_; GetKernelFromToTensor(in_tensors_, in_nodes_, &from_kernels_, true); - int ret = GenToFormatOp(in_tensors_, from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, - OpenCLMemType::IMG); + int ret = + GenToFormatOp(in_tensors_, from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, MemType::IMG); if (ret != RET_OK) { return ret; } @@ -205,8 +206,8 @@ int SubGraphOpenCLKernel::Init() { std::vector> to_kernels_; GetKernelFromToTensor(out_tensors_, out_nodes_, &to_kernels_, false); - ret = GenToFormatOp(out_tensors_, to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, - OpenCLMemType::BUF); + ret = + GenToFormatOp(out_tensors_, to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, MemType::BUF); if (ret != RET_OK) { return ret; } @@ -216,6 +217,11 @@ int SubGraphOpenCLKernel::Init() { MallocTensorWithReuse(); + ret = SubGraphKernel::Prepare(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "OpenCL prepare fail"; + return ret; + } return RET_OK; } @@ -246,7 +252,7 @@ int SubGraphOpenCLKernel::MallocTensorWithReuse() { for (auto i = 0; i < outputs.size(); ++i) { auto *output = outputs.at(i); MS_ASSERT(nullptr != output); - if (op_kernel->GetMemType() == OpenCLMemType::IMG) { + if (op_kernel->GetMemType() == MemType::IMG) { std::vector img_size; op_kernel->GetImageSize(i, &img_size); auto data_ptr = allocator_->Malloc(output->Size(), img_size); @@ -328,11 +334,6 @@ int SubGraphOpenCLKernel::Prepare() { MS_LOG(ERROR) << "OpenCL subgraph init fail"; return ret; } - ret = SubGraphKernel::Prepare(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "OpenCL prepare fail"; - return ret; - } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h index 02b7fb2ba7..34de3605c5 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h @@ -61,15 +61,15 @@ class SubGraphOpenCLKernel : public SubGraphKernel { int MallocTensorWithReuse(); int ReplaceOutTensorAndKernelToNull(const std::vector &in_tensors, const std::vector> &in_kernels, - OpenCLMemType mem_type); + lite::opencl::MemType mem_type); int ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor, const std::vector &in_kernels, lite::Tensor *new_tensor, - kernel::LiteKernel *in_convert_op, OpenCLMemType mem_type); + kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type); int GetInOutNodes(); int GenToFormatOp(const std::vector &in_tensors, const std::vector> &in_kernels, std::vector *out_tensors, std::vector *out_parameters, - std::vector *out_convert_ops, OpenCLMemType mem_type); + std::vector *out_convert_ops, lite::opencl::MemType mem_type); int GetKernelFromToTensor(const std::vector &in_tensors, const std::vector &in_kernels, std::vector> *out_kernels, bool is_from); diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index 8c3957307b..fbde810442 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -24,6 +24,7 @@ #include "src/common/file_utils.h" using mindspore::lite::KernelRegistrar; +using mindspore::lite::opencl::MemType; namespace mindspore::lite { kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, const std::vector &out_tensors, @@ -243,20 +244,20 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) { return 0; } -void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, const std::string &out_file) { +void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) { if (tensor->data_c() == nullptr) { return; } Image2DInfo img_info(tensor); - auto size = mem_type == OpenCLMemType::BUF ? img_info.OriginSize : img_info.Image2DSize; + auto size = mem_type == MemType::BUF ? img_info.OriginSize : img_info.Image2DSize; std::vector data(size); auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); auto runtime = runtime_wrapper.GetInstance(); auto allocator = runtime->GetAllocator(); runtime->SyncCommandQueue(); allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true); - if (mem_type == OpenCLMemType::BUF) { + if (mem_type == MemType::BUF) { memcpy(data.data(), tensor->data_c(), img_info.OriginSize); } else { auto row_size = img_info.width * img_info.FLT4_size; @@ -277,7 +278,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons } printf(") "); - auto num = mem_type == OpenCLMemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; + auto num = mem_type == MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; for (int i = 0; i < n && i < num; ++i) { if (tensor->data_type() == kNumberTypeFloat16) { printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 9e5808d816..cec6316b31 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -48,7 +48,8 @@ std::string CLErrorCode(cl_int error_code); int WriteToBin(const std::string &file_path, void *data, size_t size); -void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n = 10, const std::string &out_file = ""); +void PrintTensor(const lite::Tensor *tensor, lite::opencl::MemType mem_type, int n = 10, + const std::string &out_file = ""); void PrintKernelOutput(OpenCLKernel *kernel, int n = 10, const std::string &out_file = ""); diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h index 978bbf19e7..09a1c16231 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h @@ -41,7 +41,7 @@ struct OpenclMemory { }; class OpenCLRuntime; -enum class MemType : char { SVM, BUF, IMG }; +enum class MemType : char { BUF, IMG }; class OpenCLAllocator : public Allocator { public: diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index a62e09e0cc..5110067d5f 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -42,7 +42,7 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vector &ou for (auto i = 0; i < cur_outputs.size(); ++i) { auto *output = cur_outputs.at(i); MS_ASSERT(nullptr != output); - if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) { + if (op_kernel->GetMemType() == lite::opencl::MemType::IMG) { std::vector img_size; op_kernel->GetImageSize(i, &img_size); auto data_ptr = allocator_->Malloc(output->Size(), img_size); diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 5cd4f6cf63..278e9bb40c 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -416,7 +416,39 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector #endif return RET_OK; } +// Run Kernel with 1D, 2D, 3D group size, and local size can be empty. +int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, + cl::CommandQueue *command_queue) { + if (command_queue == nullptr) { + command_queue = default_command_queue_; + } + MS_ASSERT(local.size() == 0 || local.size() == global.size()); + cl::Event event; + cl_int ret = CL_SUCCESS; + ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &event); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); + return RET_ERROR; + } + static int cnt = 0; + const int flush_period = 10; + if (cnt % flush_period == 0) { + command_queue->flush(); + } + cnt++; + MS_LOG(DEBUG) << "RunKernel success!"; +#if MS_OPENCL_PROFILE + event.wait(); + cl_ulong time_start; + cl_ulong time_end; + event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); + event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); + double nanoSeconds = time_end - time_start; + MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms"; +#endif + return RET_OK; +} // get gpu divce type GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) { GpuInfo info; diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index c9792c314f..72a622c14f 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -73,11 +73,12 @@ class OpenCLRuntime { const T value, const MemType mem_type = MemType::IMG) { switch (mem_type) { - case MemType::SVM: { - MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; - return kernel.setArg(index, value); - } case MemType::BUF: { + auto svm_capabilities = GetSVMCapabilities(); + if (svm_capabilities) { + MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; + return kernel.setArg(index, value); + } cl::Buffer *buffer = reinterpret_cast(allocator_->GetBuffer(value)); MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << buffer << ", host_ptr: " << value; return kernel.setArg(index, *buffer); @@ -113,6 +114,8 @@ class OpenCLRuntime { int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name, const std::set &build_options); int RunKernel(const cl::Kernel &kernel, const std::vector &global, const std::vector &local, + cl::CommandQueue *command_queue); // !!!To be deleted + int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, cl::CommandQueue *command_queue); bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const;