From 15546ddea83fd163f20cd35cc6d7c2f745b7d627 Mon Sep 17 00:00:00 2001
From: wandongdong
Date: Mon, 30 Nov 2020 01:14:25 -0800
Subject: [PATCH] fix opencl depthwise1x1

---
 .../kernel/opencl/cl/depthwise_conv2d.cl      | 62 +++++++-----
 .../runtime/kernel/opencl/kernel/biasadd.cc   | 99 +++++++++----------
 .../runtime/kernel/opencl/kernel/biasadd.h    | 10 +-
 .../src/runtime/kernel/opencl/kernel/power.cc |  4 +-
 .../src/runtime/kernel/opencl/kernel/prelu.cc |  3 +-
 .../src/runtime/kernel/opencl/kernel/scale.cc |  3 +-
 .../src/runtime/kernel/opencl/opencl_kernel.h |  3 +-
 .../runtime/kernel/opencl/opencl_subgraph.cc  | 52 ----------
 .../runtime/kernel/opencl/opencl_subgraph.h   |  8 +-
 .../src/runtime/opencl/opencl_executor.cc     | 20 ++--
 .../lite/src/runtime/opencl/opencl_runtime.cc | 70 -------------
 .../lite/src/runtime/opencl/opencl_runtime.h  |  2 -
 12 files changed, 113 insertions(+), 223 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
index 329f641fa7..4ef013c2d1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
@@ -182,37 +182,55 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __
     WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
   }
 }
-__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __read_only image2d_t src_data,
-                                            __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride,
-                                            int2 padding, int2 dilation, int4 src_size, int4 dst_size,
-                                            float relu_clip_min, float relu_clip_max) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2) * 2;
+__kernel void DepthwiseConv2d_IMG_NHWC4_1x1_b221(__write_only image2d_t dst_data, __read_only image2d_t src_data,
+                                                 __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size,
+                                                 int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+                                                 float relu_clip_min, float relu_clip_max) {
+  int X = get_global_id(1) * 2;
+  int Y = get_global_id(2) * 2;
+  int Z = get_global_id(0);
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  FLT4 r[2] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
+  FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
+               (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
   int x_offset = X * stride.x + padding.x;
   int y_offset = Y * stride.y + padding.y;
-  int fx_c = Z;
+  int f_len = kernel_size.x * kernel_size.y;
+  int fx_c = Z * f_len;
+  bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
+  bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
   int y_c = y_offset;
-  bool outside_y = y_c < 0 || y_c >= src_size.y;
+  int y_c_a1 = y_c + stride.y;
   int x_c = x_offset;
-  bool outside_x = x_c < 0 || x_c >= src_size.x;
-  if (!outside_x && !outside_y) {
-    FLT4 flt_p0 = filter[fx_c];
-    FLT4 flt_p1 = filter[fx_c + 1];
-    FLT4 src_p0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
-    FLT4 src_p1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c));
-    r[0] += TO_FLT4(src_p0 * flt_p0);
-    r[1] += TO_FLT4(src_p1 * flt_p1);
-  }
+  int x_c_a1 = x_c + stride.x;
+  int x_sign = x_c < 0 ? -1 : 1;
+  int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
+  FLT4 flt_p0 = filter[fx_c];
+  FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
+  r[0] += TO_FLT4(src_p00_c0 * flt_p0);
+  FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
+  r[1] += TO_FLT4(src_p01_c0 * flt_p0);
+  FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
+  r[2] += TO_FLT4(src_p10_c0 * flt_p0);
+  FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
+  r[3] += TO_FLT4(src_p11_c0 * flt_p0);
+  r[0] += bias[Z];
+  r[1] += bias[Z];
+  r[2] += bias[Z];
+  r[3] += bias[Z];
   r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
-  r[1] += bias[Z + 1];
   r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
   WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]);
-  if ((dst_size.z & 0x1) == 0) {
-    WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]);
+  if (!last_x) {
+    WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[1]);
+  }
+  if (!last_y) {
+    WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[2]);
+  }
+  if (!last_y && !last_x) {
+    WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
   }
 }
 __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
index a48949780b..681d68779d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
@@ -35,6 +35,35 @@ using mindspore::schema::PrimitiveType_BiasAdd;
 
 namespace mindspore::kernel {
 
+int BiasAddOpenCLKernel::CheckSpecs() {
+  if (in_tensors_.size() == 0) {
+    MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << in_tensors_.size();
+    return RET_ERROR;
+  }
+  if (in_tensors_[0]->shape()[0] > 1) {
+    MS_LOG(ERROR) << "Input data size unsupported multi-batch.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void BiasAddOpenCLKernel::SetConstArgs() {
+  int arg_idx = 2;
+  std::map<schema::Format, int> data_type{
+    {schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
+}
+
+void BiasAddOpenCLKernel::SetGlobalLocal() {
+  cl_int4 global_size = input_shape_;
+  global_size.s[2] = UP_DIV(global_size.s[3], C4NUM) * global_size.s[2];
+  std::vector<size_t> local = {1, 1};
+  std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
+  OpenCLKernel::AlignGlobalLocal(global, local);
+}
+
 int BiasAddOpenCLKernel::InitWeights() {
   int C = in_tensors_[1]->shape()[0];
   int div_ci = UP_DIV(C, C4NUM);
@@ -52,7 +81,7 @@ int BiasAddOpenCLKernel::InitWeights() {
   return RET_OK;
 }
 
-int BiasAddOpenCLKernel::Init() {
+int BiasAddOpenCLKernel::Prepare() {
   in_size_ = in_tensors_[0]->shape().size();
   out_size_ = out_tensors_[0]->shape().size();
   for (int i = 0; i < in_size_; ++i) {
@@ -77,67 +106,27 @@
 
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  SetGlobalLocal();
+  SetConstArgs();
   MS_LOG(DEBUG) << program_name << " Init Done!";
   return mindspore::lite::RET_OK;
 }
 
 int BiasAddOpenCLKernel::Run() {
-  cl_int4 global_size = GetGlobalshape();
   MS_LOG(DEBUG) << op_parameter_->name_ << " Running!";
-  int arg_idx = 0;
-  std::map<schema::Format, int> data_type{
-    {schema::Format::Format_NC4, 1}, {schema::Format::Format_NHWC4, 2}, {schema::Format::Format_NC4HW4, 3}};
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, BiasAdd_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, data_type[schema::Format::Format_NHWC4]);
-  std::vector<size_t> local = {1, 1};
-  std::vector<size_t> global = {static_cast<size_t>(global_size.s[1]), static_cast<size_t>(global_size.s[2])};
-  auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
-  if (ret != mindspore::lite::RET_OK) {
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
+  auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
+  if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
-    return mindspore::lite::RET_ERROR;
+    return RET_ERROR;
   }
-  return mindspore::lite::RET_OK;
-}
-
-cl_int4 BiasAddOpenCLKernel::GetGlobalshape() {
-  cl_int4 global_shape = input_shape_;
-  global_shape.s[2] = UP_DIV(global_shape.s[3], C4NUM) * global_shape.s[2];
-  return global_shape;
-}
-
-kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                               const lite::PrimitiveC *primitive) {
-  if (inputs.size() == 0) {
-    MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
-    free(opParameter);
-    return nullptr;
-  }
-  if (inputs[0]->shape()[0] > 1) {
-    MS_LOG(ERROR) << "Input data size unsupported multi-batch.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto *kernel = new (std::nothrow) BiasAddOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << "is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-
-  auto ret = kernel->Init();
-  if (ret != mindspore::lite::RET_OK) {
-    MS_LOG(ERROR) << "Init BiasAdd kernel failed!";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
+  return RET_OK;
 }
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, OpenCLBiasAddKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BiasAdd, OpenCLKernelCreator<BiasAddOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, OpenCLKernelCreator<BiasAddOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h
index 82dd1d52e4..31955dd3ac 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h
@@ -33,13 +33,15 @@ class BiasAddOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~BiasAddOpenCLKernel() override = default;
 
-  int Init() override;
-  int Run() override;
+  int Prepare() override;
+
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
   int InitWeights() override;
+  int Run() override;
 
  private:
-  cl_int4 GetGlobalshape();
-
   void *BiasAdd_{nullptr};
   int in_size_{};
   int out_size_{};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
index 80eb852b11..12ce5e3696 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc
@@ -131,8 +131,8 @@ int PowerOpenCLKernel::Run() {
     cl_float4 parameter = {power_, shift_, scale_, 1};
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, parameter);
   }
-
-  ocl_runtime_->RunKernel(kernel_, global, local);
+  AlignGlobalLocal(global, local);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
index 63efb37838..2ad74ae63c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@@ -132,7 +132,8 @@ int PReluOpenCLKernel::Run() {
   std::vector<size_t> local = {4, 4, 1};
   std::vector<size_t> global = {static_cast<size_t>(H_), static_cast<size_t>(W_), static_cast<size_t>(CO_SLICES_)};
-  auto ret = ocl_runtime_->RunKernel(kernel_, global, local);
+  AlignGlobalLocal(global, local);
+  auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "Run kernel " << op_parameter_->name_ << " error.";
     return mindspore::lite::RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
index 89d892a04d..58f608d819 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -60,6 +60,7 @@ void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
   local_size_ = {16, 16};
   auto image2d_info = GpuTensorInfo(out_tensors_[0]);
   global_size_ = {image2d_info.width, image2d_info.height};
+  OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
 }
 
 int ScaleOpenCLKernel::InitWeights() {
@@ -245,7 +246,7 @@ int ScaleOpenCLKernel::Run() {
     }
   }
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
-  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index 481968f1e3..f155942986 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -195,7 +195,6 @@ class OpenCLKernel : public LiteKernel {
     return RET_OK;
   }
 
-  int Init() override { return RET_ERROR; }  // !!!To be deleted
   int Prepare() override { return RET_OK; }
   int PreProcess() override { return RET_ERROR; }
   int ReSize() override { return RET_ERROR; }
@@ -235,7 +234,7 @@
     tuning_params.push_back(default_tuning_param);
     std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
     size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
-    size_t MIN_WORKGROUP_SIZE = 8;
+    const size_t MIN_WORKGROUP_SIZE = 8;
     std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
     std::set<size_t> candidate_y = {1};
     std::set<size_t> candidate_z = {1};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 095fc45b47..40363eba69 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -229,16 +229,11 @@ int OpenCLSubGraph::Init() {
     return ret;
   }
   auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
-  ocl_runtime_->SetProfiling(true);
   ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
     return ret;
   }
-  ocl_runtime_->SetProfiling(false);
-#ifdef Debug
-  ocl_runtime_->SetProfiling(true);
-#endif
   return RET_OK;
 }
 
@@ -262,50 +257,6 @@ void OpenCLSubGraph::UpdateTensorDataType() {
   }
 }
 
-int OpenCLSubGraph::MallocTensorWithReuse() {
-  int ret;
-  kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
-  for (auto *kernel : nodes_) {
-    MS_ASSERT(kernel);
-    auto *op_kernel = reinterpret_cast<OpenCLKernel *>(kernel);
-    auto outputs = kernel->out_tensors();
-    for (auto i = 0; i < outputs.size(); ++i) {
-      auto *output = outputs.at(i);
-      MS_ASSERT(output);
-      if (op_kernel->GetMemType() == MemType::IMG) {
-        std::vector<size_t> img_size;
-        ret = op_kernel->GetImageSize(i, &img_size);
-        if (ret != RET_OK) {
-          MS_LOG(WARNING) << "GetImageSize failed";
-        }
-        auto data_ptr = allocator_->Malloc(output->Size(), img_size);
-        output->set_data(data_ptr);
-      } else {
-        ret = output->MallocData(allocator_);
-        if (ret != RET_OK) {
-          MS_LOG(WARNING) << "MallocData failed";
-        }
-      }
-      output->set_allocator(allocator_);
-    }
-    for (auto input_kernel : kernel->in_kernels()) {
-      MS_ASSERT(input_kernel);
-      ret = input_kernel->DecOutTensorRefCount();
-      if (ret != RET_OK) {
-        MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
-      }
-    }
-  }
-  for (auto kernel : out_kernels_) {
-    MS_ASSERT(kernel);
-    ret = kernel->DecOutTensorRefCount();
-    if (ret != RET_OK) {
-      MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
-    }
-  }
-  return RET_OK;
-}
-
 void OpenCLSubGraph::GetKernelFromToTensor(const std::vector &in_tensors,
                                            const std::vector &in_kernels,
                                            std::vector> *out_kernels, bool is_from) {
@@ -379,8 +330,6 @@ void OpenCLSubGraph::UnInit() {
   delete this->executor_;
 }
 
-int OpenCLSubGraph::InferShape() { return RET_OK; }
-
 int OpenCLSubGraph::ReSize() { return RET_OK; }
 
 int OpenCLSubGraph::Run() {
@@ -395,7 +344,6 @@ int OpenCLSubGraph::Run() {
       MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
       return RET_ERROR;
     }
-    allocator_->UnmapBuffer(tensor->data_c());
     ret = allocator_->UnmapBuffer(tensor->data_c());
     if (ret != RET_OK) {
       return ret;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
index 88a7f3781b..f48760e1f1 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
@@ -43,7 +43,6 @@ class OpenCLSubGraph : public SubGraphKernel {
   int PostProcess() override { return mindspore::lite::RET_OK; }
   int Prepare() override;
   int Init() override;
-  int InferShape();
   int ReSize() override;
   int Run() override;
   int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };
@@ -51,7 +50,6 @@ class OpenCLSubGraph : public SubGraphKernel {
  private:
   void UnInit();
   void UpdateTensorDataType();
-  int MallocTensorWithReuse();
   void ReplaceOutTensorAndKernelToNull(const std::vector &in_tensors,
                                        const std::vector> &in_kernels,
                                        lite::opencl::MemType mem_type);
@@ -66,6 +64,9 @@ class OpenCLSubGraph : public SubGraphKernel {
   void GetKernelFromToTensor(const std::vector &in_tensors,
                              const std::vector &in_kernels,
                              std::vector> *out_kernels, bool is_from);
+  void Fusion();
+
+ private:
   lite::opencl::OpenCLAllocator *allocator_{nullptr};
   std::vector<lite::Tensor *> in_convert_tensors_;
   std::vector<lite::Tensor *> out_convert_tensors_;
@@ -78,9 +79,6 @@ class OpenCLSubGraph : public SubGraphKernel {
   std::set nodes_set_;
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
   lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
-
- private:
-  void Fusion();
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
index ce7118a34e..cb1c888d9c 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc
@@ -30,7 +30,9 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vector &ou
 int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
                               std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
                               const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
-  int ret;
+  int ret{RET_OK};
+  ocl_runtime.GetInstance()->SetProfiling(is_tune);
+
   kernel::LiteKernelUtil::InitTensorRefCount(kernels);
   for (auto *kernel : kernels) {
     MS_ASSERT(kernel);
@@ -77,11 +79,6 @@ int OpenCLExecutor::RunOrTune(std::vector &inputs, std::vector name();
         return ret;
       }
-#ifdef Debug
-      MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
-                   << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
-
-#endif
     }
     if (after != nullptr) {
       if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
@@ -95,7 +92,16 @@ int OpenCLExecutor::RunOrTune(std::vector &inputs, std::vectorname() << " failed";
       }
     }
+#ifdef Debug
+    MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
+                 << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
+#endif
   }
-  return RET_OK;
+#ifdef Debug
+  ocl_runtime.GetInstance()->SetProfiling(true);
+#else
+  ocl_runtime.GetInstance()->SetProfiling(false);
+#endif
+  return ret;
 }
 }  // namespace mindspore::lite::opencl
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
index 5043def50a..129d0c512c 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc
@@ -406,76 +406,6 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
   return RET_OK;
 }
 
-// Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
-int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global,
-                             const std::vector<size_t> &local, cl::CommandQueue *command_queue) {
-  if (command_queue == nullptr) {
-    command_queue = default_command_queue_;
-  }
-  MS_ASSERT(local.size() == 0 || local.size() == global.size());
-  std::vector<size_t> internal_global_ws = global;
-  for (size_t i = 0; i < local.size(); ++i) {
-    internal_global_ws[i] = ROUND_UP(global[i], local[i]);
-  }
-
-  MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
-  for (size_t i = 0; i < global.size(); i++) {
-    MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
-  }
-  for (size_t i = 0; i < local.size(); i++) {
-    MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
-  }
-
-  cl::NDRange global_range = cl::NullRange;
-  cl::NDRange local_range = cl::NullRange;
-  if (global.size() == 1) {
-    global_range = cl::NDRange(internal_global_ws[0]);
-    if (!local.empty()) {
-      local_range = cl::NDRange(local[0]);
-    }
-  } else if (global.size() == 2) {
-    global_range = cl::NDRange(internal_global_ws[0], internal_global_ws[1]);
-    if (!local.empty()) {
-      local_range = cl::NDRange(local[0], local[1]);
-    }
-  } else if (global.size() == 3) {
-    global_range = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]);
-    if (!local.empty()) {
-      local_range = cl::NDRange(local[0], local[1], local[2]);
-    }
-  } else {
-    MS_LOG(ERROR) << "Not supported NDRange!";
-    return RET_ERROR;
-  }
-
-  cl::Event event;
-  cl_int ret = CL_SUCCESS;
-  ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global_range, local_range, nullptr, &event);
-  if (ret != CL_SUCCESS) {
-    MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
-    return RET_ERROR;
-  }
-  static int cnt = 0;
-  const int flush_period = 10;
-  if (cnt % flush_period == 0) {
-    auto flush_ret = command_queue->flush();
-    if (flush_ret != CL_SUCCESS) {
-      MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
-    }
-  }
-  cnt++;
-  MS_LOG(DEBUG) << "RunKernel success!";
-#ifdef Debug
-  event.wait();
-  cl_ulong time_start;
-  cl_ulong time_end;
-  event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
-  event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
-  double nanoSeconds = time_end - time_start;
-  MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms";
-#endif
-  return RET_OK;
-}
 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
 int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
                              cl::CommandQueue *command_queue, cl::Event *event) {
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
index 47ebe126f4..f923ee7e67 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -115,8 +115,6 @@ class OpenCLRuntime {
   bool LoadSource(const std::string &program_name, const std::string &source);
   int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
                   const std::set<std::string> &build_options = {});
-  int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local,
-                cl::CommandQueue *command_queue = nullptr);  // !!!To be deleted
   int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
                 cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
   bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,