activation support fp16 in opencl

5 years ago · c29e9596dd
parent 9d0deb6955
commit c29e9596dd
9 changed files with 169 additions and 93 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
@ -1,22 +1,22 @@
-#pragma OPENCL EXTENSION cl_arm_printf : enable
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable

 #define SLICES 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 #define MIN(X, Y) (X < Y ? X : Y)
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-__kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                         const float alpha) {
+__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
+                        __global FLT *alpha) {
  int C = input_shape.w;     // channel size
  int Y = get_global_id(0);  // height id
  int X = get_global_id(1);  // weight id
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
-    tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * alpha;
-    tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * alpha;
-    tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * alpha;
-    tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * alpha;
+    tmp.x = in_c4.x > 0.0f ? in_c4.x : in_c4.x * alpha[0];
+    tmp.y = in_c4.y > 0.0f ? in_c4.y : in_c4.y * alpha[0];
+    tmp.z = in_c4.z > 0.0f ? in_c4.z : in_c4.z * alpha[0];
+    tmp.w = in_c4.w > 0.0f ? in_c4.w : in_c4.w * alpha[0];
    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
@ -28,10 +28,10 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
-    tmp.x = in_c4.x >= 0 ? in_c4.x : 0;
-    tmp.y = in_c4.y >= 0 ? in_c4.y : 0;
-    tmp.z = in_c4.z >= 0 ? in_c4.z : 0;
-    tmp.w = in_c4.w >= 0 ? in_c4.w : 0;
+    tmp.x = in_c4.x > 0.0f ? in_c4.x : 0.0f;
+    tmp.y = in_c4.y > 0.0f ? in_c4.y : 0.0f;
+    tmp.z = in_c4.z > 0.0f ? in_c4.z : 0.0f;
+    tmp.w = in_c4.w > 0.0f ? in_c4.w : 0.0f;
    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
@ -43,10 +43,10 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
-    tmp.x = in_c4.x >= 0 ? MIN(in_c4.x, 6) : 0;
-    tmp.y = in_c4.y >= 0 ? MIN(in_c4.y, 6) : 0;
-    tmp.z = in_c4.z >= 0 ? MIN(in_c4.z, 6) : 0;
-    tmp.w = in_c4.w >= 0 ? MIN(in_c4.w, 6) : 0;
+    tmp.x = in_c4.x > 0.0f ? MIN(in_c4.x, 6.0f) : 0.0f;
+    tmp.y = in_c4.y > 0.0f ? MIN(in_c4.y, 6.0f) : 0.0f;
+    tmp.z = in_c4.z > 0.0f ? MIN(in_c4.z, 6.0f) : 0.0f;
+    tmp.w = in_c4.w > 0.0f ? MIN(in_c4.w, 6.0f) : 0.0f;
    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
@ -58,10 +58,10 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
  for (int num = 0; num < UP_DIV(C, SLICES); ++num) {
    FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X * UP_DIV(C, SLICES) + num, Y));  // NHWC4: H WC
    FLT4 tmp;
-    tmp.x = 1 / (1 + exp(-in_c4.x));
-    tmp.y = 1 / (1 + exp(-in_c4.y));
-    tmp.z = 1 / (1 + exp(-in_c4.z));
-    tmp.w = 1 / (1 + exp(-in_c4.w));
+    tmp.x = 1.0f / (1.0f + exp(-in_c4.x));
+    tmp.y = 1.0f / (1.0f + exp(-in_c4.y));
+    tmp.z = 1.0f / (1.0f + exp(-in_c4.z));
+    tmp.w = 1.0f / (1.0f + exp(-in_c4.w));
    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
@ -84,7 +84,7 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl
      bool outside_x = x_c < 0 || x_c >= src_size.x;
      if (!outside_x && !outside_y) {
        FLT4 flt_p = filter[fx_c];
-        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
+        FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
        r += TO_FLT4(src_p * flt_p);
      }
    }
@ -92,7 +92,7 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl
  FLT4 bias_p = bias[Z];
  FLT4 res = TO_FLT4(r) + bias_p;
  res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
-  WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res);
+  WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res);
 }
 __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
                                         __global FLT4 *dst_data, int2 kernel_size, int2 stride,
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/prelu.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/prelu.cl
@ -14,16 +14,16 @@ __kernel void PRelu(__read_only image2d_t input, __write_only image2d_t output,
    FLT4 tmp;
    if (dim == 1) {
      FLT4 weight = READ_IMAGE(alpha, smp_zero, (int2)(0, 0));
-      tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * weight.x;
-      tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * weight.x;
-      tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * weight.x;
-      tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * weight.x;
+      tmp.x = in_c4.x > 0.0f ? in_c4.x : in_c4.x * weight.x;
+      tmp.y = in_c4.y > 0.0f ? in_c4.y : in_c4.y * weight.x;
+      tmp.z = in_c4.z > 0.0f ? in_c4.z : in_c4.z * weight.x;
+      tmp.w = in_c4.w > 0.0f ? in_c4.w : in_c4.w * weight.x;
    } else {
      FLT4 weight = READ_IMAGE(alpha, smp_zero, (int2)(num, 0));
-      tmp.x = in_c4.x >= 0 ? in_c4.x : in_c4.x * weight.x;
-      tmp.y = in_c4.y >= 0 ? in_c4.y : in_c4.y * weight.y;
-      tmp.z = in_c4.z >= 0 ? in_c4.z : in_c4.z * weight.z;
-      tmp.w = in_c4.w >= 0 ? in_c4.w : in_c4.w * weight.w;
+      tmp.x = in_c4.x > 0.0f ? in_c4.x : in_c4.x * weight.x;
+      tmp.y = in_c4.y > 0.0f ? in_c4.y : in_c4.y * weight.y;
+      tmp.z = in_c4.z > 0.0f ? in_c4.z : in_c4.z * weight.z;
+      tmp.w = in_c4.w > 0.0f ? in_c4.w : in_c4.w * weight.w;
    }
    WRITE_IMAGE(output, (int2)(X * UP_DIV(C, SLICES) + num, Y), tmp);  // NHWC4: H WC
  }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -38,15 +38,33 @@ using mindspore::schema::PrimitiveType_Activation;

 namespace mindspore::kernel {

+void ActivationOpenClKernel::InitBuffer() {
+  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
+  alpha_buff_ = allocator->Malloc(fp_size);
+  alpha_buff_ = allocator->MapBuffer(alpha_buff_, CL_MAP_WRITE, nullptr, true);
+  memset(alpha_buff_, 0x00, fp_size);
+  if (enable_fp16_) {
+    auto fp16 = (float16_t)alpha_;
+    memcpy(alpha_buff_, &fp16, fp_size);
+  } else {
+    memcpy(alpha_buff_, &alpha_, fp_size);
+  }
+  allocator->UnmapBuffer(alpha_buff_);
+}
+
 int ActivationOpenClKernel::Init() {
  in_size_ = in_tensors_[0]->shape().size();
  out_size_ = out_tensors_[0]->shape().size();
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  enable_fp16_ = ocl_runtime->GetFp16Enable();
+  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  if (in_size_ != 2 && in_size_ != 4) {
    MS_LOG(ERROR) << "Activate fun only support dim=4 or 2, but your dim=" << in_size_;
    return RET_ERROR;
  }
+  InitBuffer();
  std::map<int, std::vector<std::string>> Program_Kernel{
-    {ActivationType_LEAKY_RELU, std::vector<std::string>{"LEAKY_RELU", "ReluScalar"}},
+    {ActivationType_LEAKY_RELU, std::vector<std::string>{"LEAKY_RELU", "LeakyRelu"}},
    {ActivationType_RELU, std::vector<std::string>{"RELU", "Relu"}},
    {ActivationType_SIGMOID, std::vector<std::string>{"SIGMOID", "Sigmoid"}},
    {ActivationType_RELU6, std::vector<std::string>{"RELU6", "Relu6"}}};
@ -57,7 +75,6 @@ int ActivationOpenClKernel::Init() {

  std::string source = activation_source;
  std::set<std::string> build_options;
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(Program_Kernel[type_][0], source);
  ocl_runtime->BuildKernel(kernel_, Program_Kernel[type_][0], Program_Kernel[type_][1], build_options);

@ -87,7 +104,7 @@ int ActivationOpenClKernel::Run() {
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
  ocl_runtime->SetKernelArg(kernel_, arg_idx++, img2d_shape);
  if (type_ == ActivationType_LEAKY_RELU) {
-    ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_);
+    ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_buff_, lite::opencl::MemType::BUF);
  }
  std::vector<size_t> local = {1, 1};
  std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])};
@ -114,12 +131,10 @@ cl_int4 ActivationOpenClKernel::GetImg2dShape() {

 int ActivationOpenClKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  cl_int4 img_shape = GetImg2dShape();
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
  size_t img_dtype = CL_FLOAT;
-#endif
-
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
  img_size->clear();
  img_size->push_back(img_shape.s[2] * UP_DIV(img_shape.s[3], C4NUM));
  img_size->push_back(img_shape.s[1]);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@ -39,13 +39,17 @@ class ActivationOpenClKernel : public OpenCLKernel {
  int Run() override;
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
  cl_int4 GetImg2dShape();
+  void InitBuffer();

 private:
  cl::Kernel kernel_;
  int type_;
  float alpha_;
+  void *alpha_buff_;
  int in_size_;
  int out_size_;
+  size_t fp_size;
+  bool enable_fp16_{false};
 };

 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.cc
@ -56,7 +56,7 @@ int BiasAddOpenCLKernel::Init() {
  out_size_ = out_tensors_[0]->shape().size();
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
-  fp_size = enable_fp16_ ? sizeof(float) / 2 : sizeof(float);
+  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  if (in_size_ != 4 && in_size_ != 2) {
    MS_LOG(ERROR) << "BiasAdd only support dim=4 or 2, but your dim=" << in_size_;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@ -44,13 +44,17 @@ int DepthwiseConv2dOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::string kernel_name = "DepthwiseConv2d";
  auto in_format = in_tensors_[0]->GetFormat();
-  in_ori_format_ = in_format;
+  in_ori_format_ = in_tensors_[0]->GetFormat();
  out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(in_format);
+  in_format = (in_format == schema::Format_NHWC)
+                ? schema::Format_NHWC4
+                : ((in_format == schema::Format_NCHW) ? schema::Format_NC4HW4 : in_format);
  if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) {
    MS_LOG(ERROR) << "input format(" << in_format << ") "
                  << "format not support!";
  }
+  in_tensors_[0]->SetFormat(in_format);
+  out_tensors_[0]->SetFormat(in_format);
  if (out_mem_type_ == OpenCLMemType::BUF) {
    kernel_name += "_BUF";
  } else {
@ -182,7 +186,7 @@ int DepthwiseConv2dOpenCLKernel::Run() {
  GetLocalSize(0, global, &local);

  std::map<ActType, std::pair<float, float>> relu_clips{
-    {ActType_No, {FLT_MIN, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}};
+    {ActType_No, {-FLT_MAX, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}};
  cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_};
  cl_int2 stride = {parameter->stride_h_, parameter->stride_w_};
  cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
@ -68,7 +68,7 @@ int PReluOpenCLKernel::Init() {
  std::string kernel_name = "PRelu";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
-  fp_size = enable_fp16_ ? sizeof(float) / 2 : sizeof(float);
+  fp_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  InitBuffer();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc