fix bug in transpose and matmul, softmax output image2d

4 years ago · 37a662001c
parent 2f14c40934
commit 37a662001c
10 changed files with 189 additions and 105 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
@ -35,6 +35,32 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp
  }
 }

+__kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
+  int X = get_global_id(0);  // H
+  int Y = get_global_id(1);  // W
+  int H = input_shape.x;
+  int W = input_shape.y;
+  int C = input_shape.z;
+  int S = input_shape.w;
+
+  if (X >= H || Y >= W) return;
+
+  FLT sum = 0.0f;
+  for (int d = 0; d < S; ++d) {
+    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
+    sum += exp(t.x);
+    if (d * 4 + 1 < C) sum += exp(t.y);
+    if (d * 4 + 2 < C) sum += exp(t.z);
+    if (d * 4 + 3 < C) sum += exp(t.w);
+  }
+
+  for (int d = 0; d < S; ++d) {
+    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X));
+    t = exp(t) / sum;
+    WRITE_IMAGE(output, (int2)(Y * S + d, X), t);
+  }
+}
+
 __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) {
  int X = get_global_id(0);  // H
  int Y = get_global_id(1);  // W
@ -66,44 +92,45 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out
  }
 }

-__kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  if (X >= input_shape.x || Y >= input_shape.y) return;
+__kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) {
+  int X = get_global_id(0);  // H
+  int Y = get_global_id(1);  // W
+  int H = input_shape.x;
+  int W = input_shape.y;
+  int C = input_shape.z;
+  int S = input_shape.w;
+
+  if (X >= H || Y >= W) return;

  FLT sum = 0.0f;
-  for (int d = 0; d < input_shape.w; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X));
+  for (int d = 0; d < S; ++d) {
+    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
    sum += exp(t.x);
-    if (d * 4 + 1 < input_shape.z) sum += exp(t.y);
-    if (d * 4 + 2 < input_shape.z) sum += exp(t.z);
-    if (d * 4 + 3 < input_shape.z) sum += exp(t.w);
+    if (d * 4 + 1 < C) sum += exp(t.y);
+    if (d * 4 + 2 < C) sum += exp(t.z);
+    if (d * 4 + 3 < C) sum += exp(t.w);
  }

-  for (int d = 0; d < input_shape.w; ++d) {
-    FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X));
-    t = divide_no_check(exp(t), sum);
-    FLT4 result = TO_FLT4(t);
-    WRITE_IMAGE(output, (int2)(Y * input_shape.w + d, X), result);
+  for (int d = 0; d < S; ++d) {
+    FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X));
+    t = exp(t) / sum;
+    WRITE_IMAGE(output, (int2)(Y, d * H + X), t);
  }
 }

-__kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const FLT4 mask,
-                             const int slices, const int slices_x32) {
+__kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask,
+                                   const int slices, const int slices_x32) {
  int tid = get_local_id(0);
-  int slices_count = 0;
-  int offset = 0;
  FLT sum = 0.0f;
-  do {
-    int z = offset + tid;
-    if (z < slices) {
-      FLT4 mask_temp = z == slices - 1 ? mask : (FLT4)(1.0f);
-      FLT4 src = READ_IMAGE(input, smp_none, (int2)(0, 0));
-      sum += dot(mask_temp, exp(src));
-      offset += 32;
-    }
-    slices_count++;
-  } while (slices_count < slices_x32);
+  for (size_t i = tid; i < slices - 1; i += 32) {
+    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0));
+    sum += dot((FLT4)(1.0f), exp(src));
+  }
+  if ((slices - 1) % 32 == tid) {
+    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
+
+    sum += dot(TO_FLT4(mask), exp(src));
+  }

  __local FLT4 tmp[8];
  __local FLT *tmpx1 = (__local FLT *)tmp;
@ -122,21 +149,31 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  sum = tmpx1[0];
-
-  offset = 0;
-  slices_count = 0;
-  do {
-    int z = offset + tid;
-    if (z < slices) {
-      FLT4 res = TO_FLT4(exp(READ_IMAGE(input, smp_none, (int2)(0, 0))) * sum);
-      WRITE_IMAGE(output, (int2)(0, 0), res);
-      offset += 32;
+  for (size_t i = tid; i < slices - 1; i += 32) {
+    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0));
+    result = exp(result) * sum;
+    output[i] = result;
+  }
+  if ((slices - 1) % 32 == tid) {
+    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
+    result = exp(result) * sum;
+    __global FLT4 *remain_ptr4 = output;
+    remain_ptr4 += slices - 1;
+    __global FLT *remain_ptr = (__global FLT *)remain_ptr4;
+    remain_ptr[0] = result.x;
+    if (mask.y > 0.f) {
+      remain_ptr[1] = result.y;
+    }
+    if (mask.z > 0.f) {
+      remain_ptr[2] = result.z;
+    }
+    if (mask.w > 0.f) {
+      remain_ptr[3] = result.w;
    }
-    slices_count++;
-  } while (slices_count < slices_x32);
+  }
 }

-__kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask,
+__kernel void SoftMax1x1_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
                                   const int slices, const int slices_x32) {
  int tid = get_local_id(0);
  FLT sum = 0.0f;
@ -167,27 +204,10 @@ __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *o
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  sum = tmpx1[0];
-  for (size_t i = tid; i < slices - 1; i += 32) {
+  for (size_t i = tid; i < slices; i += 32) {
    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0));
    result = exp(result) * sum;
-    output[i] = result;
-  }
-  if ((slices - 1) % 32 == tid) {
-    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0));
-    result = exp(result) * sum;
-    __global FLT4 *remain_ptr4 = output;
-    remain_ptr4 += slices - 1;
-    __global FLT *remain_ptr = (__global FLT *)remain_ptr4;
-    remain_ptr[0] = result.x;
-    if (mask.y > 0.f) {
-      remain_ptr[1] = result.y;
-    }
-    if (mask.z > 0.f) {
-      remain_ptr[2] = result.z;
-    }
-    if (mask.w > 0.f) {
-      remain_ptr[3] = result.w;
-    }
+    WRITE_IMAGE(output, (int2)(i, 0), result);
  }
 }

@ -245,3 +265,41 @@ __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *
    }
  }
 }
+
+__kernel void SoftMax1x1_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
+                                    const int slices, const int slices_x32) {
+  int tid = get_local_id(0);
+  FLT sum = 0.0f;
+  for (size_t i = tid; i < slices - 1; i += 32) {
+    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    sum += dot((FLT4)(1.0f), exp(src));
+  }
+  if ((slices - 1) % 32 == tid) {
+    FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1));
+
+    sum += dot(TO_FLT4(mask), exp(src));
+  }
+
+  __local FLT4 tmp[8];
+  __local FLT *tmpx1 = (__local FLT *)tmp;
+  tmpx1[tid] = sum;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (tid == 0) {
+    sum = dot((FLT4)(1.0f), tmp[0]);
+    sum += dot((FLT4)(1.0f), tmp[1]);
+    sum += dot((FLT4)(1.0f), tmp[2]);
+    sum += dot((FLT4)(1.0f), tmp[3]);
+    sum += dot((FLT4)(1.0f), tmp[4]);
+    sum += dot((FLT4)(1.0f), tmp[5]);
+    sum += dot((FLT4)(1.0f), tmp[6]);
+    sum += dot((FLT4)(1.0f), tmp[7]);
+    tmpx1[0] = divide_no_check(1.0f, sum);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  sum = tmpx1[0];
+  for (size_t i = tid; i < slices; i += 32) {
+    FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
+    result = exp(result) * sum;
+    WRITE_IMAGE(output, (int2)(0, i), result);
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
@ -43,7 +43,7 @@ __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d
  WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
 }

-__kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W,
+__kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W,
                                  int H) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
@ -89,13 +89,13 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d
  result[3].z = x2.w;
  result[3].w = x3.w;

-  if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0];
-  if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1];
-  if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2];
-  if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3];
+  if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]);
+  if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]);
+  if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]);
+  if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]);
 }

-__kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W,
+__kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W,
                                   int H) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
@ -131,8 +131,8 @@ __kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global FLT4 *
  result[3].z = x2.w;
  result[3].w = x3.w;

-  if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0];
-  if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1];
-  if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2];
-  if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3];
+  if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]);
+  if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]);
+  if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]);
+  if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@ -100,7 +100,6 @@ int ActivationOpenClKernel::Run() {
    ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_);
  }
  std::vector<size_t> local = {};
-  std::cout << img2d_shape.s[1] << "   " << img2d_shape.s[2] << std::endl;
  std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])};
  auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  if (ret != RET_OK) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@ -132,12 +132,15 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
  bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * data_size, img_size);
  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
  memset(bias_, 0x00, div_co * C4NUM * data_size);
-  auto bias_dtype = in_tensors_[2]->data_type();
  if (in_tensors_.size() >= 3) {
+    auto bias_dtype = in_tensors_[2]->data_type();
    if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
-      auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData());
      for (int i = 0; i < co; i++) {
-        reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]);
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->MutableData())[i];
+      }
+    } else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) {
+      for (int i = 0; i < co; i++) {
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->MutableData())[i];
      }
    } else {
      memcpy(bias_, in_tensors_[2]->MutableData(), co * data_size);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@ -152,14 +152,12 @@ void MatMulOpenCLKernel::PadWeight() {
  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
  if (in_tensors_.size() >= 3) {
    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
-      auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData());
      for (int i = 0; i < co; i++) {
-        reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]);
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->MutableData())[i];
      }
    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
-      auto fdata = reinterpret_cast<uint16_t *>(in_tensors_[2]->MutableData());
      for (int i = 0; i < co; i++) {
-        reinterpret_cast<float *>(bias_)[i] = ShortToFloat32(fdata[i]);
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->MutableData())[i];
      }
    } else {
      memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@ -65,13 +65,26 @@ int SoftmaxOpenCLKernel::SetWorkGroupSize1x1() {

 int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t im_dst_x, im_dst_y;
-  if (onexone_flag_) {
-    im_dst_x = UP_DIV(in_tensors_[0]->shape()[1], C4NUM);
-    im_dst_y = 1;
+  auto out_shape = out_tensors_[0]->shape();
+  int n = 1, h = 1, w = 1, c = 1;
+  if (out_shape.size() == 2) {
+    n = out_shape[0];
+    c = out_shape[1];
+  } else if (out_shape.size() == 4) {
+    n = out_shape[0];
+    h = out_shape[1];
+    w = out_shape[2];
+    c = out_shape[3];
+  }
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
  } else {
-    size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-    im_dst_x = out_tensors_[0]->Width() * CO4;
-    im_dst_y = out_tensors_[0]->Height();
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
  }
  size_t img_dtype = CL_FLOAT;
  if (enable_fp16_) {
@ -110,8 +123,7 @@ int SoftmaxOpenCLKernel::Init() {
  if (!is_image_out_) {
    out_mem_type_ = OpenCLMemType::BUF;
  } else {
-    MS_LOG(ERROR) << "image2d output not support yet.";
-    return RET_ERROR;
+    out_mem_type_ = OpenCLMemType::IMG;
  }
  if (out_mem_type_ == OpenCLMemType::BUF) {
    kernel_name += "_BUF";
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
@ -51,7 +51,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
  bool onexone_flag_{false};
  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
-  bool is_image_out_{false};
+  bool is_image_out_{true};
  bool enable_fp16_{false};
 };

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@ -36,6 +36,14 @@ int TransposeOpenCLKernel::Init() {
  std::string kernel_name = "transpose";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  enable_fp16_ = ocl_runtime->GetFp16Enable();
+  auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
+  if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
+      param->perm_[3] == 2) {
+    type = TransposeType::NHWC2NCHW;
+  } else {
+    MS_LOG(ERROR) << "unsupported transpose axes.";
+    return RET_ERROR;
+  }
  out_mem_type_ = OpenCLMemType::BUF;
  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
  if (out_mem_type_ == OpenCLMemType::BUF) {
@ -146,4 +154,5 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::Tensor
 }

 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLTransposeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLTransposeKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
@ -20,10 +20,14 @@
 #include <vector>

 #include "src/lite_kernel.h"
+#include "nnacl/transpose.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"

 namespace mindspore::kernel {
+
+enum class TransposeType { NHWC2NCHW, NCHW2NHWC };
+
 class TransposeOpenCLKernel : public OpenCLKernel {
 public:
  explicit TransposeOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
@ -39,6 +43,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {
 private:
  cl::Kernel kernel_;
  bool enable_fp16_{false};
+  TransposeType type{TransposeType::NHWC2NCHW};
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
@ -37,6 +37,17 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
    ocl_runtime->SetFp16Enable(true);
    dtype_size = sizeof(float16_t);
  }
+  auto param_ptr = std::make_unique<TransposeParameter>();
+  auto param = param_ptr.get();
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "param_ptr create error.";
+    return;
+  }
+  param->num_axes_ = 4;
+  param->perm_[0] = 0;
+  param->perm_[1] = 3;
+  param->perm_[2] = 1;
+  param->perm_[3] = 2;
  auto allocator = ocl_runtime->GetAllocator();
  int h = shape[0];
  int w = shape[1];
@ -59,7 +70,8 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
  }
  std::vector<lite::Tensor *> inputs{tensor_x};
  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr = std::make_unique<kernel::TransposeOpenCLKernel>(nullptr, inputs, outputs);
+  auto arith_kernel_ptr =
+    std::make_unique<kernel::TransposeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  auto arith_kernel = arith_kernel_ptr.get();
  if (arith_kernel == nullptr) {
    MS_LOG(ERROR) << "arith_kernel create error.";
@ -94,26 +106,14 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
 }

 TEST_F(TestTransposeOpenCL, TransposeFp32) {
-  int h = 1;
-  int w = 64;
-  int c = 7360;
+  int h = 2;
+  int w = 2;
+  int c = 3;
  std::vector<int> shape = {h, w, c};
-  size_t input_size;
-  std::string input_path = "./test_data/transpose/transpose_fp32_input.bin";
-  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
-  if (input_data == nullptr) {
-    MS_LOG(ERROR) << "input_data load error.";
-    return;
-  }
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};

-  size_t output_size;
-  std::string output_path = "./test_data/transpose/transpose_fp32_output.bin";
-  auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size));
-  if (correct_data == nullptr) {
-    MS_LOG(ERROR) << "correct_data create error.";
-    return;
-  }
-  RunTestTranspose(shape, input_data, correct_data, false);
+  RunTestTranspose(shape, input_data.data(), output_data.data(), false);
 }

 TEST_F(TestTransposeOpenCL, TransposeFp16) {