From 37c6c1c3f59dbac78d070d5ae0e5bead89007c82 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Fri, 11 Sep 2020 17:45:39 +0800 Subject: [PATCH] add batchmatmul and reduce --- .../src/runtime/kernel/opencl/cl/matmul.cl | 153 ++++++++++++---- .../src/runtime/kernel/opencl/cl/reduce.cl | 61 +++++++ .../runtime/kernel/opencl/kernel/activation.h | 1 - .../kernel/opencl/kernel/conv2d_transpose.cc | 16 +- .../runtime/kernel/opencl/kernel/matmul.cc | 159 +++++++++++------ .../src/runtime/kernel/opencl/kernel/matmul.h | 12 +- .../runtime/kernel/opencl/kernel/reduce.cc | 166 ++++++++++++++++++ .../src/runtime/kernel/opencl/kernel/reduce.h | 48 +++++ .../kernel/opencl/subgraph_opencl_kernel.cc | 12 -- mindspore/lite/test/CMakeLists.txt | 2 + .../src/runtime/kernel/opencl/matmul_tests.cc | 93 ++++++++-- .../src/runtime/kernel/opencl/reduce_tests.cc | 156 ++++++++++++++++ 12 files changed, 749 insertions(+), 130 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl index 300584c2f9..914fffe5ce 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl @@ -1,57 +1,146 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define C4NUM 4 +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, - __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { - int2 gid = (int2)(get_global_id(0), get_global_id(1)); - int2 lid = (int2)(get_local_id(0), get_local_id(1)); +__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { + int gidx = get_global_id(0); // CO4 + int gidz = get_global_id(2); // N + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.z; + bool inside = gidx < co4 && gidz < n; FLT4 result = (FLT4)(0.0f); - bool inside = gid.x < offset_co.y; - for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { - FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0)); - FLT16 w = weight[gid.x + i * offset_co.y]; + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, gidz)); + FLT16 w = weight[i * co4 + gidx]; result.x += dot(v, w.s0123); result.y += dot(v, w.s4567); result.z += dot(v, w.s89ab); result.w += dot(v, w.scdef); } - __local FLT4 temp[64][4]; - temp[lid.x][lid.y] = result; + WRITE_IMAGE(output, (int2)(gidx, gidz), result); + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; barrier(CLK_LOCAL_MEM_FENCE); - if (lid.y == 0 && inside) { - result += temp[lid.x][1]; - result += temp[lid.x][2]; - result += temp[lid.x][3]; + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; if (has_bias != 0) { - result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 
0)); } - WRITE_IMAGE(output, (int2)(gid.x, 0), result); + WRITE_IMAGE(output, (int2)(gidx, gidz), result); } } -__kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, - __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { - int2 gid = (int2)(get_global_id(0), get_global_id(1)); - int2 lid = (int2)(get_local_id(0), get_local_id(1)); +__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { + int gidx = get_global_id(0); // CO4 + int gidz = get_global_id(2); // N + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.z; + bool inside = gidx < co4 && gidz < n; FLT4 result = (FLT4)(0.0f); - bool inside = gid.x < offset_co.y; - for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { - FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i)); - FLT16 w = weight[gid.x + i * offset_co.y]; + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0)); + FLT16 w = weight[i * co4 + gidx]; result.x += dot(v, w.s0123); result.y += dot(v, w.s4567); result.z += dot(v, w.s89ab); result.w += dot(v, w.scdef); } - __local FLT4 temp[64][4]; - temp[lid.x][lid.y] = result; + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; barrier(CLK_LOCAL_MEM_FENCE); - if (lid.y == 0 && inside) { - result += temp[lid.x][1]; - result += temp[lid.x][2]; - result += temp[lid.x][3]; + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; if (has_bias != 0) { - result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); } - WRITE_IMAGE(output, (int2)(0, gid.x), result); + WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result); + } +} + +__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { + int gidx = get_global_id(0); // CO4 + int gidy = get_global_id(1); // N * H * 4 + int gidz = get_global_id(2); // W + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.x; + int h = out_shape.y; + int w = out_shape.z; + int nh_index = gidy / 4; + bool inside = gidx < co4 && gidz < w && nh_index < n * h; + FLT4 result = (FLT4)(0.0f); + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, nh_index)); + FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; + result.x += dot(v, weight_value.s0123); + result.y += dot(v, weight_value.s4567); + result.z += dot(v, weight_value.s89ab); + result.w += dot(v, weight_value.scdef); + } + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; + barrier(CLK_LOCAL_MEM_FENCE); + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; + if (has_bias != 0) { + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); + } + WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); + } +} + +__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int 
has_bias) { + int gidx = get_global_id(0); // CO4 + int gidy = get_global_id(1); // N * H * 4 + int gidz = get_global_id(2); // W + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.x; + int h = out_shape.y; + int w = out_shape.z; + int nh_index = gidy / 4; + bool inside = gidx < co4 && gidz < w && nh_index < n * h; + int n_index = nh_index / h; + int h_index = nh_index % h; + FLT4 result = (FLT4)(0.0f); + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index)); + FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; + result.x += dot(v, weight_value.s0123); + result.y += dot(v, weight_value.s4567); + result.z += dot(v, weight_value.s89ab); + result.w += dot(v, weight_value.scdef); + } + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; + barrier(CLK_LOCAL_MEM_FENCE); + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; + if (has_bias != 0) { + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); + } + WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl new file mode 100644 index 0000000000..8f2269301f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl @@ -0,0 +1,61 @@ +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +__kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); + } + } + result /= size.x * size.y; + WRITE_IMAGE(dst_data, (int2)(X, 0), result); +} + +__kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); + } + } + result /= size.x * size.y; + WRITE_IMAGE(dst_data, (int2)(0, X), result); +} + +__kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); + } + } + WRITE_IMAGE(dst_data, (int2)(X, 0), result); +} + +__kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); + } + } + WRITE_IMAGE(dst_data, (int2)(0, X), result); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h index 3c89a08b36..cd22208468 
100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h @@ -45,7 +45,6 @@ class ActivationOpenClKernel : public OpenCLKernel { cl::Kernel kernel_; int type_; float alpha_; - void *alpha_buff_; int in_size_; int out_size_; size_t fp_size; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index 26b02d7e24..860ab489a9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -94,14 +94,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * co + co_offset; if (enable_fp16_) { if (weight_dtype == kNumberTypeFloat32) { - reinterpret_cast(padWeight_)[index++] = - Float32ToShort(reinterpret_cast(origin_weight)[ori_index]); + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; } else { - reinterpret_cast(padWeight_)[index++] = - reinterpret_cast(origin_weight)[ori_index]; + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; } } else { - reinterpret_cast(padWeight_)[index++] = reinterpret_cast(origin_weight)[ori_index]; + if (weight_dtype == kNumberTypeFloat32) { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; + } else { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; + } } } else { index++; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index ed51c416bc..a79d3c71b7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019 Huawei Technologies n., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
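Editor's note: the MatMul_*_2d and MatMul_*_4d kernels added above all evaluate the same batched product out[a][b][m][co] = sum_ci in[a][b][m][ci] * w[a][b][co][ci], with the weight stored transposed (matching b_transpose_ = true in the host code below). A minimal host-side reference, written here only for illustration (BatchMatMulRef is an editor-chosen name, not code from this patch), reproduces the MatMulFp32_4D test vectors that appear later in the patch:

// Illustrative sketch, not part of the patch: plain C++ reference for the
// batched matmul computed by the new OpenCL kernels.
// Shapes: input {A,B,M,CI}, weight {A,B,CO,CI} (b_transpose=true), output {A,B,M,CO}.
#include <cassert>
#include <vector>

std::vector<float> BatchMatMulRef(const std::vector<float> &in, const std::vector<float> &w,
                                  int A, int B, int M, int CI, int CO) {
  std::vector<float> out(A * B * M * CO, 0.0f);
  for (int a = 0; a < A; ++a) {
    for (int b = 0; b < B; ++b) {
      const float *in_ab = in.data() + (a * B + b) * M * CI;
      const float *w_ab = w.data() + (a * B + b) * CO * CI;
      float *out_ab = out.data() + (a * B + b) * M * CO;
      for (int m = 0; m < M; ++m) {
        for (int co = 0; co < CO; ++co) {
          float acc = 0.0f;
          for (int ci = 0; ci < CI; ++ci) {
            acc += in_ab[m * CI + ci] * w_ab[co * CI + ci];  // weight row = output channel
          }
          out_ab[m * CO + co] = acc;
        }
      }
    }
  }
  return out;
}

int main() {
  // Reproduces the MatMulFp32_4D expectation: all-ones input, weight values 1..30.
  std::vector<float> in(1 * 2 * 2 * 5, 1.0f);
  std::vector<float> w(30);
  for (int i = 0; i < 30; ++i) w[i] = i + 1.0f;
  std::vector<float> expect = {15, 40, 65, 15, 40, 65, 90, 115, 140, 90, 115, 140};
  assert(BatchMatMulRef(in, w, 1, 2, 2, 5, 3) == expect);
  return 0;
}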
@@ -16,10 +16,10 @@ #include #include +#include #include "nnacl/fp32/common_func.h" #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "nnacl/fp32/matmul.h" #include "src/runtime/kernel/opencl/kernel/matmul.h" #ifndef PROGRAM_WITH_IL #include "src/runtime/kernel/opencl/cl/matmul.cl.inc" @@ -36,7 +36,26 @@ int MatMulOpenCLKernel::Init() { std::string kernel_name = "MatMul"; kernel_name += "_" + std::string(EnumNameFormat(op_format_)); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto param = reinterpret_cast(op_parameter_); + transposeA = param->a_transpose_; + if (transposeA) { + MS_LOG(ERROR) << "matmul only support a_transpose_=false yet."; + return RET_ERROR; + } + transposeB = param->b_transpose_; enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (in_tensors_[0]->shape().size() != out_tensors_[0]->shape().size() || + (in_tensors_[0]->shape().size() != 2 && in_tensors_[0]->shape().size() != 4)) { + MS_LOG(ERROR) << "matmul only support input shape size=2 or 4."; + return RET_ERROR; + } + dims = in_tensors_[0]->shape().size(); + for (int i = 0; i < dims; i++) { + inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i]; + outShape[MAX_DIMS - dims + i] = out_tensors_[0]->shape()[i]; + } + std::map dims2str = {{2, "_2d"}, {4, "_4d"}}; + kernel_name += dims2str[dims]; #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else @@ -46,21 +65,7 @@ int MatMulOpenCLKernel::Init() { ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - int ci, co; - if (in_tensors_[1]->shape().size() != 2) { - MS_LOG(ERROR) << "matmul do not support input shape size=" << in_tensors_[1]->shape().size(); - return RET_ERROR; - } - if (in_tensors_[1]->shape().size() == 2) { - ci = in_tensors_[1]->shape()[1]; - co = in_tensors_[1]->shape()[0]; - } else { - ci = in_tensors_[1]->shape()[3]; - co = in_tensors_[1]->shape()[0]; - } - sizeCI = {ci, UP_DIV(ci, C4NUM)}; - sizeCO = {co, UP_DIV(co, C4NUM)}; PadWeight(); in_ori_format_ = in_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat(); @@ -73,51 +78,69 @@ int MatMulOpenCLKernel::Init() { int MatMulOpenCLKernel::ReSize() { return RET_OK; } void MatMulOpenCLKernel::PadWeight() { + // ABMCI @ ABCICO = ABMCO auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); + int ci = inShape[3]; + int ci4 = UP_DIV(ci, C4NUM); + int co = outShape[3]; + int co4 = UP_DIV(co, C4NUM); + int a = inShape[0]; + int b = inShape[1]; - size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float); - padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); + size_t dtype_size = enable_fp16_ ? 
sizeof(uint16_t) : sizeof(float); + padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); - memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); - auto origin_weight = in_tensors_.at(kWeightIndex)->MutableData(); - int divCI = sizeCI.s[1]; - int divCO = sizeCO.s[1]; - int co = sizeCO.s[0]; + auto padWeightFp32 = reinterpret_cast(padWeight_); + auto padWeightFp16 = reinterpret_cast(padWeight_); + memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); + auto originWeightFp32 = reinterpret_cast(in_tensors_.at(kWeightIndex)->MutableData()); + auto originWeightFp16 = reinterpret_cast(in_tensors_.at(kWeightIndex)->MutableData()); + bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; + + // pad weight + // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) + // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) int index = 0; - for (int i = 0; i < divCI; ++i) { - for (int j = 0; j < divCO; ++j) { - for (int k = 0; k < C4NUM; ++k) { - for (int l = 0; l < C4NUM; ++l) { - int src_x = i * C4NUM + l; - int src_y = j * C4NUM + k; - if (src_x < sizeCI.s[0] && src_y < sizeCO.s[0]) { - if (enable_fp16_) { - if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { - reinterpret_cast(padWeight_)[index++] = - Float32ToShort(reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]); - } else { - reinterpret_cast(padWeight_)[index++] = - reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]; - } - } else { - if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { - reinterpret_cast(padWeight_)[index++] = - ShortToFloat32(reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]); + for (int aa = 0; aa < a; aa++) { + for (int bb = 0; bb < b; bb++) { + int baseAB = (aa * b + bb) * ci * co; + for (int i = 0; i < ci4; ++i) { + for (int j = 0; j < co4; ++j) { + for (int k = 0; k < C4NUM; ++k) { + for (int l = 0; l < C4NUM; ++l) { + int src_ci = i * C4NUM + l; + int src_co = j * C4NUM + k; + if (src_ci < ci && src_co < co) { + int originId = baseAB + src_ci * co + src_co; + if (transposeB) { + originId = baseAB + src_co * ci + src_ci; + } + if (enable_fp16_) { + if (!isModelFp16) { + padWeightFp16[index++] = originWeightFp32[originId]; + } else { + padWeightFp16[index++] = originWeightFp16[originId]; + } + } else { + if (!isModelFp16) { + padWeightFp32[index++] = originWeightFp32[originId]; + } else { + padWeightFp32[index++] = originWeightFp16[originId]; + } + } } else { - reinterpret_cast(padWeight_)[index++] = - reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]; + index++; } } - } else { - index++; } } } } } + // pad FC Bias size_t im_dst_x, im_dst_y; - im_dst_x = divCO; + im_dst_x = co4; im_dst_y = 1; size_t img_dtype = CL_FLOAT; if (enable_fp16_) { @@ -126,13 +149,18 @@ void MatMulOpenCLKernel::PadWeight() { std::vector img_size{im_dst_x, im_dst_y, img_dtype}; bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); - memset(bias_, 0x00, divCO * C4NUM * dtype_size); + memset(bias_, 0x00, co4 * C4NUM * dtype_size); if (in_tensors_.size() >= 3) { if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { auto fdata = reinterpret_cast(in_tensors_[2]->MutableData()); for (int i = 0; i < co; i++) { reinterpret_cast(bias_)[i] = Float32ToShort(fdata[i]); } + } else if 
(in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { + auto fdata = reinterpret_cast(in_tensors_[2]->MutableData()); + for (int i = 0; i < co; i++) { + reinterpret_cast(bias_)[i] = ShortToFloat32(fdata[i]); + } } else { memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size); } @@ -142,12 +170,23 @@ void MatMulOpenCLKernel::PadWeight() { int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - if (op_format_ == schema::Format::Format_NHWC4) { - im_dst_x = sizeCO.s[1]; - im_dst_y = 1; - } else if (op_format_ == schema::Format::Format_NC4HW4) { - im_dst_x = 1; - im_dst_y = sizeCO.s[1]; + auto out_shape = out_tensors_[0]->shape(); + int n = 1, h = 1, w = 1, c = 1; + if (dims == 2) { + n = out_shape[0]; + c = out_shape[1]; + } else if (dims == 4) { + n = out_shape[0]; + h = out_shape[1]; + w = out_shape[2]; + c = out_shape[3]; + } + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = n * h; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = w; + im_dst_y = n * UP_DIV(c, C4NUM) * h; } else { MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); return RET_ERROR; @@ -166,15 +205,19 @@ int MatMulOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); // local size should less than MAX_GROUP_SIZE - std::vector local = {64, 4}; - std::vector global = {UP_ROUND(sizeCO.s[1], local[0]), 4}; + std::vector local = {32, 4, 1}; + std::vector global = {UP_DIV(static_cast(outShape[3]), C4NUM), + 4 * static_cast(outShape[0]) * static_cast(outShape[1]), + static_cast(outShape[2])}; int arg_count = 0; + cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; + cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->MutableData()); ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_); ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI); - ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO); + ocl_runtime->SetKernelArg(kernel_, arg_count++, in_shape); + ocl_runtime->SetKernelArg(kernel_, arg_count++, out_shape); ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 
1 : 0); ocl_runtime->RunKernel(kernel_, global, local, nullptr); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h index dfd102107e..0a540f005f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h @@ -20,7 +20,7 @@ #include #include "src/runtime/kernel/opencl/opencl_kernel.h" -#include "nnacl/conv_parameter.h" +#include "nnacl/matmul_parameter.h" #include "src/runtime/opencl/opencl_runtime.h" namespace mindspore::kernel { @@ -29,7 +29,7 @@ class MatMulOpenCLKernel : public OpenCLKernel { public: explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, bool hasBias) - : OpenCLKernel(parameter, inputs, outputs) { + : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) { hasBias_ = hasBias; } ~MatMulOpenCLKernel() override{}; @@ -46,8 +46,12 @@ class MatMulOpenCLKernel : public OpenCLKernel { void *bias_; bool hasBias_{false}; bool enable_fp16_{false}; - cl_int2 sizeCI; - cl_int2 sizeCO; + bool transposeA{false}; + bool transposeB{true}; + int dims; + static constexpr int MAX_DIMS = 4; // max supported matmul dims + std::vector inShape; + std::vector outShape; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc new file mode 100644 index 0000000000..7a7b64c88b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -0,0 +1,166 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
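Editor's note: the repacking that MatMulOpenCLKernel::PadWeight performs above (source layout A x B x CO x CI when b_transpose_ is set, destination A*B blocks of CI4 x CO4 tiles of 4x4 values, i.e. one FLT16 per tile) can be sketched in plain C++ as follows. PackWeightRef is an illustrative name, only the fp32 path is shown, and the zero padding mirrors the index++ skip in the patch:

// Illustrative sketch, not part of the patch: fp32-only version of the PadWeight repack.
// dst[(((ab * ci4 + i) * co4 + j) * 4 + k) * 4 + l] = src[ab][co = j*4+k][ci = i*4+l],
// zero-filled where the source index runs past CI or CO.
#include <cassert>
#include <vector>

constexpr int C4NUM = 4;
inline int UpDiv(int x, int y) { return (x + y - 1) / y; }

std::vector<float> PackWeightRef(const std::vector<float> &src, int AB, int CO, int CI) {
  int ci4 = UpDiv(CI, C4NUM), co4 = UpDiv(CO, C4NUM);
  std::vector<float> dst(AB * ci4 * co4 * C4NUM * C4NUM, 0.0f);
  int index = 0;
  for (int ab = 0; ab < AB; ++ab) {
    for (int i = 0; i < ci4; ++i) {                       // CI tile
      for (int j = 0; j < co4; ++j) {                     // CO tile
        for (int k = 0; k < C4NUM; ++k) {                 // 4 from CO
          for (int l = 0; l < C4NUM; ++l, ++index) {      // 4 from CI
            int src_ci = i * C4NUM + l;
            int src_co = j * C4NUM + k;
            if (src_ci < CI && src_co < CO) {
              dst[index] = src[(ab * CO + src_co) * CI + src_ci];
            }
          }
        }
      }
    }
  }
  return dst;
}

int main() {
  // One batch, CO=3, CI=5, weight values 1..15 as in the 2-D matmul tests.
  std::vector<float> w(15);
  for (int i = 0; i < 15; ++i) w[i] = i + 1.0f;
  std::vector<float> packed = PackWeightRef(w, 1, 3, 5);
  // First FLT16 tile: ci 0..3 of output channels 0..2; channel 3 is zero padding,
  // so dot(v, w.s0123) in the kernel accumulates output channel 0.
  assert(packed[0] == 1.0f && packed[4] == 6.0f && packed[8] == 11.0f && packed[12] == 0.0f);
  return 0;
}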
+ */ + +#include +#include +#include +#include "include/errorcode.h" +#include "src/kernel_registry.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/kernel/reduce.h" +#include "src/runtime/kernel/opencl/cl/reduce.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_NULL_PTR; +using mindspore::lite::RET_OK; +using mindspore::lite::RET_PARAM_INVALID; +using mindspore::schema::PrimitiveType_Mean; +using mindspore::schema::PrimitiveType_Reduce; +using mindspore::schema::ReduceMode; +using mindspore::schema::ReduceMode_ReduceMax; +using mindspore::schema::ReduceMode_ReduceMean; +using mindspore::schema::ReduceMode_ReduceMin; +using mindspore::schema::ReduceMode_ReduceProd; +using mindspore::schema::ReduceMode_ReduceSum; +using mindspore::schema::ReduceMode_ReduceSumSquare; + +namespace mindspore::kernel { + +int ReduceOpenCLKernel::Init() { + InitNHWCShape(); + auto reduce_param = reinterpret_cast(op_parameter_); + if (reduce_param == nullptr) { + return RET_NULL_PTR; + } + std::map reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}}; + if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) { + MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; + return RET_PARAM_INVALID; + } + if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) && + (reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) { + MS_LOG(ERROR) << "reduce op only support axes HW"; + return RET_PARAM_INVALID; + } + std::string kernel_name = reduce_type2str.at(reduce_param->mode_); + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + enable_fp16_ = ocl_runtime->GetFp16Enable(); + + if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { + MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel" + << out_tensors_[0]->shape().back(); + return RET_ERROR; + } +#ifdef PROGRAM_WITH_IL + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); +#else + std::set build_options; + std::string source = reduce_source; + ocl_runtime->LoadSource(kernel_name, source); + ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options); +#endif + in_ori_format_ = in_tensors_[0]->GetFormat(); + out_ori_format_ = out_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return RET_OK; +} + +void ReduceOpenCLKernel::InitNHWCShape() { + std::vector shapex = out_tensors_[0]->shape(); + size_t n = 1, h = 1, w = 1, c = 1; + if (shapex.size() == 2) { + n = shapex[0]; + c = shapex[1]; + } else if (shapex.size() == 4) { + n = shapex[0]; + h = shapex[1]; + w = shapex[2]; + c = shapex[3]; + } + nhwc_shape_ = {n, h, w, c}; +} + +int ReduceOpenCLKernel::ReSize() { return RET_OK; } + +int ReduceOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t im_dst_x, im_dst_y; + + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM); + im_dst_y = nhwc_shape_[0] * nhwc_shape_[1]; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = nhwc_shape_[2]; + im_dst_y = nhwc_shape_[0] * UP_DIV(nhwc_shape_[3], C4NUM) * nhwc_shape_[1]; + } else { + MS_LOG(ERROR) << "not support op format:" << 
EnumNameFormat(op_format_); + return RET_ERROR; + } + size_t img_dtype = CL_FLOAT; + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +int ReduceOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + std::vector shapex = in_tensors_[0]->shape(); + int h = shapex[1]; + int w = shapex[2]; + int c = shapex[3]; + int c4 = UP_DIV(c, C4NUM); + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + std::vector local = {}; + std::vector global = {static_cast(c4)}; + cl_int4 size = {h, w, c4, 1}; + int arg_idx = 0; + ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_idx++, size); + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + return RET_OK; +} + +kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::Context *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h new file mode 100644 index 0000000000..a3897dd18f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
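Editor's note: the GetImageSize overrides in this patch (matmul and reduce) map an N x H x W x C tensor onto a 2-D image with the same two formulas, depending on op_format_. A small sketch for reference; ImageExtents is an editor-chosen name, not code from the patch:

// Illustrative sketch, not part of the patch: image extents for the two layouts.
#include <cstddef>
#include <utility>

constexpr size_t C4NUM = 4;
inline size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }

// Returns {width, height} of the backing image2d_t for an N x H x W x C tensor.
std::pair<size_t, size_t> ImageExtents(size_t n, size_t h, size_t w, size_t c, bool nhwc4) {
  if (nhwc4) {
    // Format_NHWC4: channel slices packed along the image width.
    return {w * UpDiv(c, C4NUM), n * h};
  }
  // Format_NC4HW4: channel slices packed along the image height.
  return {w, n * UpDiv(c, C4NUM) * h};
}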
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ + +#include + +#include "src/lite_kernel.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "nnacl/reduce_parameter.h" + +namespace mindspore::kernel { +class ReduceOpenCLKernel : public OpenCLKernel { + public: + explicit ReduceOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + ~ReduceOpenCLKernel() override{}; + + int Init() override; + int ReSize() override; + int Run() override; + int GetImageSize(size_t idx, std::vector *img_size) override; + void InitNHWCShape(); + + private: + cl::Kernel kernel_; + bool enable_fp16_{false}; + std::vector nhwc_shape_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index ec208024a3..f4e56b600b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -73,18 +73,6 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te return RET_ERROR; } new_tensor->CopyTensor(*in_tensors[i]); - if ((dst_format == schema::Format::Format_NCHW || dst_format == schema::Format::Format_NC4HW4) && - (src_format == schema::Format::Format_NHWC || src_format == schema::Format::Format_NHWC4)) { - auto shape = new_tensor->shape(); - std::vector dst_shape{shape[0], shape[3], shape[1], shape[2]}; - new_tensor->set_shape(shape); - } - if ((dst_format == schema::Format::Format_NHWC || dst_format == schema::Format::Format_NHWC4) && - (src_format == schema::Format::Format_NCHW || src_format == schema::Format::Format_NC4HW4)) { - auto shape = new_tensor->shape(); - std::vector dst_shape{shape[0], shape[2], shape[3], shape[1]}; - new_tensor->set_shape(shape); - } if (mem_type == OpenCLMemType::IMG) { new_tensor->SetFormat(dst_format); in_tensors[i]->SetFormat(src_format); diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index a6222c0615..c86374c506 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -127,6 +127,7 @@ if (SUPPORT_GPU) ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc + ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reduce.cc ) endif() ### minddata lite @@ -315,6 +316,7 @@ if (SUPPORT_GPU) ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc + ${TEST_DIR}/ut/src/runtime/kernel/opencl/reduce_tests.cc ) endif() diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc index 8a09409592..dd46e107d4 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc @@ -30,7 +30,7 @@ class TestMatMulOpenCL : public mindspore::CommonTest { }; void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *weight_data, void *output_data, - bool enable_fp16) { + bool enable_fp16, int dims) { 
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); size_t dtype_size = sizeof(float); @@ -39,20 +39,41 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we dtype_size = sizeof(int16_t); } auto allocator = ocl_runtime->GetAllocator(); - int ci = shape[0]; - int co = shape[1]; - std::vector input_shape = {1, ci}; + std::vector input_shape, output_shape, weight_shape; + if (dims == 2) { + int ci = shape[0]; + int co = shape[1]; + input_shape = {1, ci}; + output_shape = {1, co}; + weight_shape = {co, ci}; + } else if (dims == 4) { + int a = shape[0]; + int b = shape[1]; + int m = shape[2]; + int ci = shape[3]; + int co = shape[4]; + input_shape = {a, b, m, ci}; + output_shape = {a, b, m, co}; + weight_shape = {a, b, co, ci}; + } + auto param_ptr = std::make_unique(); + auto param = param_ptr.get(); + if (param == nullptr) { + MS_LOG(ERROR) << "param_ptr create error."; + return; + } + param->a_transpose_ = false; + param->b_transpose_ = true; auto tensor_x_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), - input_shape, schema::Format_NC); + input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; return; } - std::vector w_shape = {co, ci}; - auto tensor_w_ptr = - std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), w_shape); + auto tensor_w_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), + weight_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); auto tensor_w = tensor_w_ptr.get(); if (tensor_w == nullptr) { MS_LOG(ERROR) << "tensor_w create error."; @@ -60,9 +81,9 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we } tensor_w->SetData(weight_data); - std::vector out_shape = {1, co}; - auto tensor_out_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), - out_shape, schema::Format_NC); + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), output_shape, + dims == 2 ? 
schema::Format_NC : schema::Format_NHWC); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -70,7 +91,8 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we } std::vector inputs{tensor_x, tensor_w}; std::vector outputs{tensor_out}; - auto op_kernel_ptr = std::make_unique(nullptr, inputs, outputs, false); + auto op_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs, false); auto op_kernel = op_kernel_ptr.get(); if (op_kernel == nullptr) { MS_LOG(ERROR) << "op_kernel create error."; @@ -89,12 +111,13 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we return; } pGraph->Init(); - memcpy(inputs[0]->MutableData(), input_data, ci * dtype_size); + memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size); pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast(1e-3), 2e-2); + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), + 2e-2); } else { - CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast(1e-5)); + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-5)); } tensor_x->SetData(nullptr); @@ -125,7 +148,7 @@ void RunTestCaseMatMul(const std::vector shape, const std::vector weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; std::vector output_data = {10.f, 10.f, 10.f}; - RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false); + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2); } TEST_F(TestMatMulOpenCL, MatMulFp16_2) { @@ -167,6 +190,40 @@ TEST_F(TestMatMulOpenCL, MatMulFp16_2) { std::vector weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; std::vector output_data = {10.f, 10.f, 10.f}; - RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true); + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2); +} + +TEST_F(TestMatMulOpenCL, MatMulFp32_4D) { + int a = 1; + int b = 2; + int c = 2; + int ci = 5; + int co = 3; + std::vector shape = {a, b, c, ci, co}; + std::vector input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, + 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f}; + std::vector output_data = {15.0f, 40.0f, 65.0f, 15.0f, 40.0f, 65.0f, + 90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f}; + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4); +} + +TEST_F(TestMatMulOpenCL, MatMulFp16_4D) { + int a = 1; + int b = 2; + int c = 2; + int ci = 5; + int co = 3; + std::vector shape = {a, b, c, ci, co}; + std::vector input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, + 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f}; + std::vector output_data = {15.0f, 
40.0f, 65.0f, 15.0f, 40.0f, 65.0f, + 90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f}; + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 4); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc new file mode 100644 index 0000000000..54959f14b7 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc @@ -0,0 +1,156 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "mindspore/core/utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" + +namespace mindspore { +class TestReduceOpenCL : public mindspore::CommonTest { + public: + TestReduceOpenCL() {} +}; + +void RunTestCaseReduce(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16, + int reduce_mode) { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); + } + auto allocator = ocl_runtime->GetAllocator(); + auto param_ptr = std::make_unique(); + auto param = param_ptr.get(); + if (param == nullptr) { + MS_LOG(ERROR) << "param_ptr create error."; + return; + } + param->axes_[0] = 1; + param->axes_[1] = 2; + param->num_axes_ = 2; + param->mode_ = reduce_mode; + int n = shape[0]; + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + std::vector input_shape = {n, h, w, c}; + auto tensor_x_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), + input_shape, schema::Format_NHWC); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; + return; + } + std::vector out_shape = {n, c}; + auto tensor_out_ptr = std::make_unique(TypeId(enable_fp16 ? 
kNumberTypeFloat16 : kNumberTypeFloat32), + out_shape, schema::Format_NC); + auto tensor_out = tensor_out_ptr.get(); + if (tensor_out == nullptr) { + MS_LOG(ERROR) << "tensor_out create error."; + return; + } + std::vector inputs{tensor_x}; + std::vector outputs{tensor_out}; + auto arith_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs); + auto arith_kernel = arith_kernel_ptr.get(); + if (arith_kernel == nullptr) { + MS_LOG(ERROR) << "arith_kernel create error."; + return; + } + arith_kernel->Init(); + + inputs[0]->MallocData(allocator); + + std::vector kernels{arith_kernel}; + auto pGraph_ptr = std::make_unique(inputs, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); + if (pGraph == nullptr) { + MS_LOG(ERROR) << "pGraph create error."; + return; + } + pGraph->Init(); + memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size); + pGraph->Run(); + + if (enable_fp16) { + CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), + 2e-2); + } else { + CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); + } + inputs[0]->SetData(nullptr); + outputs[0]->SetData(nullptr); + + MS_LOG(INFO) << "Test Reduce passed"; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +TEST_F(TestReduceOpenCL, ReduceMeanFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {4.5f, 5.5f, 6.5f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean); +} + +TEST_F(TestReduceOpenCL, ReduceMeanFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {4.5f, 5.5f, 6.5f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean); +} + +TEST_F(TestReduceOpenCL, ReduceSumFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {18.0f, 22.0f, 26.0f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum); +} + +TEST_F(TestReduceOpenCL, ReduceSumFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {18.0f, 22.0f, 26.0f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum); +} +} // namespace mindspore
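Editor's note: as a cross-check of the expected values used in the reduce tests above, a plain C++ reference of the H/W reduction implemented by reduce.cl (ReduceHWRef is an illustrative name; the OpenCL kernels additionally operate on C4-packed slices):

// Illustrative sketch, not part of the patch: mean/sum over the H and W axes of an
// NHWC tensor, producing an NC result, matching the mean_* and sum_* kernels.
#include <cassert>
#include <vector>

std::vector<float> ReduceHWRef(const std::vector<float> &in, int N, int H, int W, int C, bool mean) {
  std::vector<float> out(N * C, 0.0f);
  for (int n = 0; n < N; ++n) {
    for (int h = 0; h < H; ++h) {
      for (int w = 0; w < W; ++w) {
        for (int c = 0; c < C; ++c) {
          out[n * C + c] += in[((n * H + h) * W + w) * C + c];
        }
      }
    }
  }
  if (mean) {
    for (float &v : out) v /= static_cast<float>(H * W);
  }
  return out;
}

int main() {
  // Reproduces the ReduceMeanFp32 / ReduceSumFp32 expectations for a 1x2x2x3 input 0..11.
  std::vector<float> in(12);
  for (int i = 0; i < 12; ++i) in[i] = static_cast<float>(i);
  assert(ReduceHWRef(in, 1, 2, 2, 3, true) == std::vector<float>({4.5f, 5.5f, 6.5f}));
  assert(ReduceHWRef(in, 1, 2, 2, 3, false) == std::vector<float>({18.0f, 22.0f, 26.0f}));
  return 0;
}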