From 3a47029423dfb053b53f4bbd79d71ac7093c6ccd Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Wed, 28 Oct 2020 17:31:40 +0800 Subject: [PATCH] optimize activation --- .../runtime/kernel/opencl/cl/activation.cl | 74 ++++++++----------- .../kernel/opencl/kernel/activation.cc | 58 +++++---------- .../runtime/kernel/opencl/kernel/activation.h | 9 +-- 3 files changed, 49 insertions(+), 92 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl index d606787821..8eec5b9c39 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl @@ -1,15 +1,12 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define SLICES 4 -#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define MIN(X, Y) (X < Y ? X : Y) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 img_shape, +__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape, const float alpha) { - int Y = get_global_id(0); // H - int X = get_global_id(1); // W C4 - if (X >= img_shape.z || Y >= img_shape.y) return; + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= img_shape.x || Y >= img_shape.y) return; FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); FLT4 tmp; FLT alpha_f = TO_FLT(alpha); @@ -20,53 +17,40 @@ __kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t outp WRITE_IMAGE(output, (int2)(X, Y), tmp); } -__kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { - int Y = get_global_id(0); - int X = get_global_id(1); - if (X >= input_shape.z || Y >= input_shape.y) return; +__kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= img_shape.x || Y >= img_shape.y) return; FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); - FLT4 tmp; - tmp.x = in_c4.x > 0.0f ? in_c4.x : 0.0f; - tmp.y = in_c4.y > 0.0f ? in_c4.y : 0.0f; - tmp.z = in_c4.z > 0.0f ? in_c4.z : 0.0f; - tmp.w = in_c4.w > 0.0f ? in_c4.w : 0.0f; - WRITE_IMAGE(output, (int2)(X, Y), tmp); + in_c4 = max(in_c4, (FLT)(0.f)); + WRITE_IMAGE(output, (int2)(X, Y), in_c4); } -__kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { - int Y = get_global_id(0); - int X = get_global_id(1); - if (X >= input_shape.z || Y >= input_shape.y) return; +__kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= img_shape.x || Y >= img_shape.y) return; FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); - FLT4 tmp; - tmp.x = in_c4.x > 0.0f ? MIN(in_c4.x, 6.0f) : 0.0f; - tmp.y = in_c4.y > 0.0f ? MIN(in_c4.y, 6.0f) : 0.0f; - tmp.z = in_c4.z > 0.0f ? MIN(in_c4.z, 6.0f) : 0.0f; - tmp.w = in_c4.w > 0.0f ? MIN(in_c4.w, 6.0f) : 0.0f; - WRITE_IMAGE(output, (int2)(X, Y), tmp); + in_c4 = clamp(in_c4, (FLT)(0.f), (FLT)(6.f)); + WRITE_IMAGE(output, (int2)(X, Y), in_c4); } -__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { - int Y = get_global_id(0); - int X = get_global_id(1); - if (X >= input_shape.z || Y >= input_shape.y) return; +__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= img_shape.x || Y >= img_shape.y) return; FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); - FLT4 tmp; - tmp.x = 1.0f / (1.0f + exp(-in_c4.x)); - tmp.y = 1.0f / (1.0f + exp(-in_c4.y)); - tmp.z = 1.0f / (1.0f + exp(-in_c4.z)); - tmp.w = 1.0f / (1.0f + exp(-in_c4.w)); - WRITE_IMAGE(output, (int2)(X, Y), tmp); + in_c4 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-in_c4)); + WRITE_IMAGE(output, (int2)(X, Y), in_c4); } -__kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape) { - int Y = get_global_id(0); - int X = get_global_id(1); - if (X >= input_shape.z || Y >= input_shape.y) return; +__kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= img_shape.x || Y >= img_shape.y) return; FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); - in_c4.x = (exp(in_c4.x) - exp(-in_c4.x)) / (exp(in_c4.x) + exp(-in_c4.x)); - in_c4.y = (exp(in_c4.y) - exp(-in_c4.y)) / (exp(in_c4.y) + exp(-in_c4.y)); - in_c4.z = (exp(in_c4.z) - exp(-in_c4.z)) / (exp(in_c4.z) + exp(-in_c4.z)); - in_c4.w = (exp(in_c4.w) - exp(-in_c4.w)) / (exp(in_c4.w) + exp(-in_c4.w)); + FLT4 exp0 = exp(in_c4); + FLT4 exp1 = exp(-in_c4); + in_c4 = (exp0 - exp1) / (exp0 + exp1); WRITE_IMAGE(output, (int2)(X, Y), in_c4); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc index e01db37085..4fcf9fd8ed 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc @@ -41,26 +41,6 @@ using mindspore::schema::PrimitiveType_Activation; namespace mindspore::kernel { int ActivationOpenClKernel::Init() { - in_size_ = in_tensors_[0]->shape().size(); - out_size_ = out_tensors_[0]->shape().size(); - size_t n, h, w, c; - if (in_size_ == 2) { - n = in_tensors_[0]->shape()[0]; - c = in_tensors_[0]->shape()[1]; - h = w = 1; - } else { - n = in_tensors_[0]->shape()[0]; - h = in_tensors_[0]->shape()[1]; - w = in_tensors_[0]->shape()[2]; - c = in_tensors_[0]->shape()[3]; - } - nhwc_shape_ = {n, h, w, c}; - enable_fp16_ = ocl_runtime_->GetFp16Enable(); - fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float); - if (in_size_ != 2 && in_size_ != 4) { - MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_; - return mindspore::lite::RET_ERROR; - } std::map kernel_names{{ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"}, @@ -70,45 +50,43 @@ int ActivationOpenClKernel::Init() { MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found"; return mindspore::lite::RET_ERROR; } - + outShape = Image2DInfo(out_tensors_[0]); + local_size_ = {}; + global_size_ = {outShape.width, outShape.height}; std::string source = activation_source; std::set build_options; std::string program_name = "Activation"; ocl_runtime_->LoadSource(program_name, source); std::string kernel_name = kernel_names[type_]; ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); - MS_LOG(DEBUG) << op_parameter_->name_ << " init Done!"; + SetArgs(); + MS_LOG(DEBUG) << kernel_name << " init Done!"; return mindspore::lite::RET_OK; } +int ActivationOpenClKernel::SetArgs() { + int arg_idx = 2; + cl_int2 image_size = {static_cast(outShape.width), static_cast(outShape.height)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size); + if (type_ == ActivationType_LEAKY_RELU) { + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_); + } + return RET_OK; +} + int ActivationOpenClKernel::Run() { - MS_LOG(DEBUG) << op_parameter_->name_ << " begin running!"; - cl_int4 img2d_shape = GetImg2dShape(); + MS_LOG(DEBUG) << this->name() << " begin running!"; int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, img2d_shape); - if (type_ == ActivationType_LEAKY_RELU) { - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_); - } - std::vector local = {}; - std::vector global = {static_cast(img2d_shape.s[1]), static_cast(img2d_shape.s[2])}; - auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr); + auto ret = ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); if (ret != mindspore::lite::RET_OK) { - MS_LOG(ERROR) << "Run kernel:" << op_parameter_->name_ << " fail."; + MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; return mindspore::lite::RET_ERROR; } return mindspore::lite::RET_OK; } -cl_int4 ActivationOpenClKernel::GetImg2dShape() { - cl_int4 img2d_shape = {1, 1, 1, 1}; - img2d_shape.s[1] = nhwc_shape_[1]; - img2d_shape.s[2] = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM); - img2d_shape.s[3] = C4NUM; - return img2d_shape; -} - kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h index f2b44af5fb..d979bb538c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h @@ -37,16 +37,11 @@ class ActivationOpenClKernel : public OpenCLKernel { int Run() override; private: - cl_int4 GetImg2dShape(); - + int SetArgs(); cl::Kernel kernel_; int type_; float alpha_; - int in_size_{}; - int out_size_{}; - size_t fp_size{}; - bool enable_fp16_{false}; - std::vector nhwc_shape_; + Image2DInfo outShape = Image2DInfo(nullptr); }; } // namespace mindspore::kernel