!7913 [MS][LITE][GPU]optimize activation

Merge pull request !7913 from chenzupeng/master-lite
4 years ago · 84a315279b
parent 8f44074796 3a47029423
commit 84a315279b
3 changed files with 49 additions and 92 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
@ -1,15 +1,12 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-#define SLICES 4
-#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define MIN(X, Y) (X < Y ? X : Y)
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 img_shape,
+__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape,
                        const float alpha) {
-  int Y = get_global_id(0);  // H
-  int X = get_global_id(1);  // W C4
-  if (X >= img_shape.z || Y >= img_shape.y) return;
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= img_shape.x || Y >= img_shape.y) return;
  FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y));
  FLT4 tmp;
  FLT alpha_f = TO_FLT(alpha);
@ -20,53 +17,40 @@ __kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t outp
  WRITE_IMAGE(output, (int2)(X, Y), tmp);
 }

-__kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
-  int Y = get_global_id(0);
-  int X = get_global_id(1);
-  if (X >= input_shape.z || Y >= input_shape.y) return;
+__kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= img_shape.x || Y >= img_shape.y) return;
  FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y));
-  FLT4 tmp;
-  tmp.x = in_c4.x > 0.0f ? in_c4.x : 0.0f;
-  tmp.y = in_c4.y > 0.0f ? in_c4.y : 0.0f;
-  tmp.z = in_c4.z > 0.0f ? in_c4.z : 0.0f;
-  tmp.w = in_c4.w > 0.0f ? in_c4.w : 0.0f;
-  WRITE_IMAGE(output, (int2)(X, Y), tmp);
+  in_c4 = max(in_c4, (FLT)(0.f));
+  WRITE_IMAGE(output, (int2)(X, Y), in_c4);
 }

-__kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
-  int Y = get_global_id(0);
-  int X = get_global_id(1);
-  if (X >= input_shape.z || Y >= input_shape.y) return;
+__kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= img_shape.x || Y >= img_shape.y) return;
  FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y));
-  FLT4 tmp;
-  tmp.x = in_c4.x > 0.0f ? MIN(in_c4.x, 6.0f) : 0.0f;
-  tmp.y = in_c4.y > 0.0f ? MIN(in_c4.y, 6.0f) : 0.0f;
-  tmp.z = in_c4.z > 0.0f ? MIN(in_c4.z, 6.0f) : 0.0f;
-  tmp.w = in_c4.w > 0.0f ? MIN(in_c4.w, 6.0f) : 0.0f;
-  WRITE_IMAGE(output, (int2)(X, Y), tmp);
+  in_c4 = clamp(in_c4, (FLT)(0.f), (FLT)(6.f));
+  WRITE_IMAGE(output, (int2)(X, Y), in_c4);
 }

-__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
-  int Y = get_global_id(0);
-  int X = get_global_id(1);
-  if (X >= input_shape.z || Y >= input_shape.y) return;
+__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= img_shape.x || Y >= img_shape.y) return;
  FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y));
-  FLT4 tmp;
-  tmp.x = 1.0f / (1.0f + exp(-in_c4.x));
-  tmp.y = 1.0f / (1.0f + exp(-in_c4.y));
-  tmp.z = 1.0f / (1.0f + exp(-in_c4.z));
-  tmp.w = 1.0f / (1.0f + exp(-in_c4.w));
-  WRITE_IMAGE(output, (int2)(X, Y), tmp);
+  in_c4 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-in_c4));
+  WRITE_IMAGE(output, (int2)(X, Y), in_c4);
 }

-__kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape) {
-  int Y = get_global_id(0);
-  int X = get_global_id(1);
-  if (X >= input_shape.z || Y >= input_shape.y) return;
+__kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= img_shape.x || Y >= img_shape.y) return;
  FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y));
-  in_c4.x = (exp(in_c4.x) - exp(-in_c4.x)) / (exp(in_c4.x) + exp(-in_c4.x));
-  in_c4.y = (exp(in_c4.y) - exp(-in_c4.y)) / (exp(in_c4.y) + exp(-in_c4.y));
-  in_c4.z = (exp(in_c4.z) - exp(-in_c4.z)) / (exp(in_c4.z) + exp(-in_c4.z));
-  in_c4.w = (exp(in_c4.w) - exp(-in_c4.w)) / (exp(in_c4.w) + exp(-in_c4.w));
+  FLT4 exp0 = exp(in_c4);
+  FLT4 exp1 = exp(-in_c4);
+  in_c4 = (exp0 - exp1) / (exp0 + exp1);
  WRITE_IMAGE(output, (int2)(X, Y), in_c4);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -41,26 +41,6 @@ using mindspore::schema::PrimitiveType_Activation;
 namespace mindspore::kernel {

 int ActivationOpenClKernel::Init() {
-  in_size_ = in_tensors_[0]->shape().size();
-  out_size_ = out_tensors_[0]->shape().size();
-  size_t n, h, w, c;
-  if (in_size_ == 2) {
-    n = in_tensors_[0]->shape()[0];
-    c = in_tensors_[0]->shape()[1];
-    h = w = 1;
-  } else {
-    n = in_tensors_[0]->shape()[0];
-    h = in_tensors_[0]->shape()[1];
-    w = in_tensors_[0]->shape()[2];
-    c = in_tensors_[0]->shape()[3];
-  }
-  nhwc_shape_ = {n, h, w, c};
-  enable_fp16_ = ocl_runtime_->GetFp16Enable();
-  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
-  if (in_size_ != 2 && in_size_ != 4) {
-    MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_;
-    return mindspore::lite::RET_ERROR;
-  }
  std::map<int, std::string> kernel_names{{ActivationType_LEAKY_RELU, "LeakyRelu"},
                                          {ActivationType_RELU, "Relu"},
                                          {ActivationType_SIGMOID, "Sigmoid"},
@ -70,45 +50,43 @@ int ActivationOpenClKernel::Init() {
    MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found";
    return mindspore::lite::RET_ERROR;
  }
-
+  outShape = Image2DInfo(out_tensors_[0]);
+  local_size_ = {};
+  global_size_ = {outShape.width, outShape.height};
  std::string source = activation_source;
  std::set<std::string> build_options;
  std::string program_name = "Activation";
  ocl_runtime_->LoadSource(program_name, source);
  std::string kernel_name = kernel_names[type_];
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
-  MS_LOG(DEBUG) << op_parameter_->name_ << " init Done!";
+  SetArgs();
+  MS_LOG(DEBUG) << kernel_name << " init Done!";
  return mindspore::lite::RET_OK;
 }

+int ActivationOpenClKernel::SetArgs() {
+  int arg_idx = 2;
+  cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)};
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size);
+  if (type_ == ActivationType_LEAKY_RELU) {
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_);
+  }
+  return RET_OK;
+}
+
 int ActivationOpenClKernel::Run() {
-  MS_LOG(DEBUG) << op_parameter_->name_ << " begin running!";
-  cl_int4 img2d_shape = GetImg2dShape();
+  MS_LOG(DEBUG) << this->name() << " begin running!";
  int arg_idx = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, img2d_shape);
-  if (type_ == ActivationType_LEAKY_RELU) {
-    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_);
-  }
-  std::vector<size_t> local = {};
-  std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])};
-  auto ret = ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  auto ret = ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  if (ret != mindspore::lite::RET_OK) {
-    MS_LOG(ERROR) << "Run kernel:" << op_parameter_->name_ << " fail.";
+    MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
    return mindspore::lite::RET_ERROR;
  }
  return mindspore::lite::RET_OK;
 }

-cl_int4 ActivationOpenClKernel::GetImg2dShape() {
-  cl_int4 img2d_shape = {1, 1, 1, 1};
-  img2d_shape.s[1] = nhwc_shape_[1];
-  img2d_shape.s[2] = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM);
-  img2d_shape.s[3] = C4NUM;
-  return img2d_shape;
-}
-
 kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                  const lite::InnerContext *ctx, const kernel::KernelKey &desc,
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@ -37,16 +37,11 @@ class ActivationOpenClKernel : public OpenCLKernel {
  int Run() override;

 private:
-  cl_int4 GetImg2dShape();
-
+  int SetArgs();
  cl::Kernel kernel_;
  int type_;
  float alpha_;
-  int in_size_{};
-  int out_size_{};
-  size_t fp_size{};
-  bool enable_fp16_{false};
-  std::vector<size_t> nhwc_shape_;
+  Image2DInfo outShape = Image2DInfo(nullptr);
 };

 }  // namespace mindspore::kernel