From 37c6c1c3f59dbac78d070d5ae0e5bead89007c82 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Fri, 11 Sep 2020 17:45:39 +0800 Subject: [PATCH] add batchmatmul and reduce --- .../src/runtime/kernel/opencl/cl/matmul.cl | 153 ++++++++++++---- .../src/runtime/kernel/opencl/cl/reduce.cl | 61 +++++++ .../runtime/kernel/opencl/kernel/activation.h | 1 - .../kernel/opencl/kernel/conv2d_transpose.cc | 16 +- .../runtime/kernel/opencl/kernel/matmul.cc | 159 +++++++++++------ .../src/runtime/kernel/opencl/kernel/matmul.h | 12 +- .../runtime/kernel/opencl/kernel/reduce.cc | 166 ++++++++++++++++++ .../src/runtime/kernel/opencl/kernel/reduce.h | 48 +++++ .../kernel/opencl/subgraph_opencl_kernel.cc | 12 -- mindspore/lite/test/CMakeLists.txt | 2 + .../src/runtime/kernel/opencl/matmul_tests.cc | 93 ++++++++-- .../src/runtime/kernel/opencl/reduce_tests.cc | 156 ++++++++++++++++ 12 files changed, 749 insertions(+), 130 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl index 300584c2f9..914fffe5ce 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl @@ -1,57 +1,146 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define C4NUM 4 +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, - __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { - int2 gid = (int2)(get_global_id(0), get_global_id(1)); - int2 lid = (int2)(get_local_id(0), get_local_id(1)); +__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { + int gidx = get_global_id(0); // CO4 + int gidz = get_global_id(2); // N + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.z; + bool inside = gidx < co4 && gidz < n; FLT4 result = (FLT4)(0.0f); - bool inside = gid.x < offset_co.y; - for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { - FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0)); - FLT16 w = weight[gid.x + i * offset_co.y]; + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, gidz)); + FLT16 w = weight[i * co4 + gidx]; result.x += dot(v, w.s0123); result.y += dot(v, w.s4567); result.z += dot(v, w.s89ab); result.w += dot(v, w.scdef); } - __local FLT4 temp[64][4]; - temp[lid.x][lid.y] = result; + WRITE_IMAGE(output, (int2)(gidx, gidz), result); + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; barrier(CLK_LOCAL_MEM_FENCE); - if (lid.y == 0 && inside) { - result += temp[lid.x][1]; - result += temp[lid.x][2]; - result += temp[lid.x][3]; + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; if (has_bias != 0) { - result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 
0)); } - WRITE_IMAGE(output, (int2)(gid.x, 0), result); + WRITE_IMAGE(output, (int2)(gidx, gidz), result); } } -__kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, - __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { - int2 gid = (int2)(get_global_id(0), get_global_id(1)); - int2 lid = (int2)(get_local_id(0), get_local_id(1)); +__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { + int gidx = get_global_id(0); // CO4 + int gidz = get_global_id(2); // N + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.z; + bool inside = gidx < co4 && gidz < n; FLT4 result = (FLT4)(0.0f); - bool inside = gid.x < offset_co.y; - for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { - FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i)); - FLT16 w = weight[gid.x + i * offset_co.y]; + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0)); + FLT16 w = weight[i * co4 + gidx]; result.x += dot(v, w.s0123); result.y += dot(v, w.s4567); result.z += dot(v, w.s89ab); result.w += dot(v, w.scdef); } - __local FLT4 temp[64][4]; - temp[lid.x][lid.y] = result; + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; barrier(CLK_LOCAL_MEM_FENCE); - if (lid.y == 0 && inside) { - result += temp[lid.x][1]; - result += temp[lid.x][2]; - result += temp[lid.x][3]; + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; if (has_bias != 0) { - result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); } - WRITE_IMAGE(output, (int2)(0, gid.x), result); + WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result); + } +} + +__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { + int gidx = get_global_id(0); // CO4 + int gidy = get_global_id(1); // N * H * 4 + int gidz = get_global_id(2); // W + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.x; + int h = out_shape.y; + int w = out_shape.z; + int nh_index = gidy / 4; + bool inside = gidx < co4 && gidz < w && nh_index < n * h; + FLT4 result = (FLT4)(0.0f); + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, nh_index)); + FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; + result.x += dot(v, weight_value.s0123); + result.y += dot(v, weight_value.s4567); + result.z += dot(v, weight_value.s89ab); + result.w += dot(v, weight_value.scdef); + } + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; + barrier(CLK_LOCAL_MEM_FENCE); + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; + if (has_bias != 0) { + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); + } + WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); + } +} + +__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int4 in_shape, int4 out_shape, int 
has_bias) { + int gidx = get_global_id(0); // CO4 + int gidy = get_global_id(1); // N * H * 4 + int gidz = get_global_id(2); // W + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int ci4 = UP_DIV(in_shape.w, C4NUM); + int co4 = UP_DIV(out_shape.w, C4NUM); + int n = out_shape.x; + int h = out_shape.y; + int w = out_shape.z; + int nh_index = gidy / 4; + bool inside = gidx < co4 && gidz < w && nh_index < n * h; + int n_index = nh_index / h; + int h_index = nh_index % h; + FLT4 result = (FLT4)(0.0f); + for (uint i = lidy; i < ci4 && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index)); + FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; + result.x += dot(v, weight_value.s0123); + result.y += dot(v, weight_value.s4567); + result.z += dot(v, weight_value.s89ab); + result.w += dot(v, weight_value.scdef); + } + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; + barrier(CLK_LOCAL_MEM_FENCE); + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; + if (has_bias != 0) { + result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); + } + WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl new file mode 100644 index 0000000000..8f2269301f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl @@ -0,0 +1,61 @@ +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +__kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); + } + } + result /= size.x * size.y; + WRITE_IMAGE(dst_data, (int2)(X, 0), result); +} + +__kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); + } + } + result /= size.x * size.y; + WRITE_IMAGE(dst_data, (int2)(0, X), result); +} + +__kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); + } + } + WRITE_IMAGE(dst_data, (int2)(X, 0), result); +} + +__kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { + int X = get_global_id(0); // C4 + if (X >= size.z) { + return; + } + FLT4 result = (FLT4)0.f; + for (int h = 0; h < size.x; h++) { + for (int w = 0; w < size.y; w++) { + result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); + } + } + WRITE_IMAGE(dst_data, (int2)(0, X), result); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h index 3c89a08b36..cd22208468 
100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h @@ -45,7 +45,6 @@ class ActivationOpenClKernel : public OpenCLKernel { cl::Kernel kernel_; int type_; float alpha_; - void *alpha_buff_; int in_size_; int out_size_; size_t fp_size; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index 26b02d7e24..860ab489a9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -94,14 +94,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * co + co_offset; if (enable_fp16_) { if (weight_dtype == kNumberTypeFloat32) { - reinterpret_cast(padWeight_)[index++] = - Float32ToShort(reinterpret_cast(origin_weight)[ori_index]); + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; } else { - reinterpret_cast(padWeight_)[index++] = - reinterpret_cast(origin_weight)[ori_index]; + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; } } else { - reinterpret_cast(padWeight_)[index++] = reinterpret_cast(origin_weight)[ori_index]; + if (weight_dtype == kNumberTypeFloat32) { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; + } else { + reinterpret_cast(padWeight_)[index++] = + reinterpret_cast(origin_weight)[ori_index]; + } } } else { index++; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index ed51c416bc..a79d3c71b7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2019 Huawei Technologies n., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
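Editor's note: the MatMul_*_2d and MatMul_*_4d kernels added above all evaluate the same batched product out[a][b][m][co] = sum_ci in[a][b][m][ci] * w[a][b][co][ci], with the weight stored transposed (matching b_transpose_ = true in the host code below). A minimal host-side reference, written here only for illustration (BatchMatMulRef is an editor-chosen name, not code from this patch), reproduces the MatMulFp32_4D test vectors that appear later in the patch:

// Illustrative sketch, not part of the patch: plain C++ reference for the
// batched matmul computed by the new OpenCL kernels.
// Shapes: input {A,B,M,CI}, weight {A,B,CO,CI} (b_transpose=true), output {A,B,M,CO}.
#include <cassert>
#include <vector>

std::vector<float> BatchMatMulRef(const std::vector<float> &in, const std::vector<float> &w,
                                  int A, int B, int M, int CI, int CO) {
  std::vector<float> out(A * B * M * CO, 0.0f);
  for (int a = 0; a < A; ++a) {
    for (int b = 0; b < B; ++b) {
      const float *in_ab = in.data() + (a * B + b) * M * CI;
      const float *w_ab = w.data() + (a * B + b) * CO * CI;
      float *out_ab = out.data() + (a * B + b) * M * CO;
      for (int m = 0; m < M; ++m) {
        for (int co = 0; co < CO; ++co) {
          float acc = 0.0f;
          for (int ci = 0; ci < CI; ++ci) {
            acc += in_ab[m * CI + ci] * w_ab[co * CI + ci];  // weight row = output channel
          }
          out_ab[m * CO + co] = acc;
        }
      }
    }
  }
  return out;
}

int main() {
  // Reproduces the MatMulFp32_4D expectation: all-ones input, weight values 1..30.
  std::vector<float> in(1 * 2 * 2 * 5, 1.0f);
  std::vector<float> w(30);
  for (int i = 0; i < 30; ++i) w[i] = i + 1.0f;
  std::vector<float> expect = {15, 40, 65, 15, 40, 65, 90, 115, 140, 90, 115, 140};
  assert(BatchMatMulRef(in, w, 1, 2, 2, 5, 3) == expect);
  return 0;
}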
@@ -16,10 +16,10 @@ #include #include +#include #include "nnacl/fp32/common_func.h" #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "nnacl/fp32/matmul.h" #include "src/runtime/kernel/opencl/kernel/matmul.h" #ifndef PROGRAM_WITH_IL #include "src/runtime/kernel/opencl/cl/matmul.cl.inc" @@ -36,7 +36,26 @@ int MatMulOpenCLKernel::Init() { std::string kernel_name = "MatMul"; kernel_name += "_" + std::string(EnumNameFormat(op_format_)); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto param = reinterpret_cast(op_parameter_); + transposeA = param->a_transpose_; + if (transposeA) { + MS_LOG(ERROR) << "matmul only support a_transpose_=false yet."; + return RET_ERROR; + } + transposeB = param->b_transpose_; enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (in_tensors_[0]->shape().size() != out_tensors_[0]->shape().size() || + (in_tensors_[0]->shape().size() != 2 && in_tensors_[0]->shape().size() != 4)) { + MS_LOG(ERROR) << "matmul only support input shape size=2 or 4."; + return RET_ERROR; + } + dims = in_tensors_[0]->shape().size(); + for (int i = 0; i < dims; i++) { + inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i]; + outShape[MAX_DIMS - dims + i] = out_tensors_[0]->shape()[i]; + } + std::map dims2str = {{2, "_2d"}, {4, "_4d"}}; + kernel_name += dims2str[dims]; #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else @@ -46,21 +65,7 @@ int MatMulOpenCLKernel::Init() { ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - int ci, co; - if (in_tensors_[1]->shape().size() != 2) { - MS_LOG(ERROR) << "matmul do not support input shape size=" << in_tensors_[1]->shape().size(); - return RET_ERROR; - } - if (in_tensors_[1]->shape().size() == 2) { - ci = in_tensors_[1]->shape()[1]; - co = in_tensors_[1]->shape()[0]; - } else { - ci = in_tensors_[1]->shape()[3]; - co = in_tensors_[1]->shape()[0]; - } - sizeCI = {ci, UP_DIV(ci, C4NUM)}; - sizeCO = {co, UP_DIV(co, C4NUM)}; PadWeight(); in_ori_format_ = in_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat(); @@ -73,51 +78,69 @@ int MatMulOpenCLKernel::Init() { int MatMulOpenCLKernel::ReSize() { return RET_OK; } void MatMulOpenCLKernel::PadWeight() { + // ABMCI @ ABCICO = ABMCO auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); + int ci = inShape[3]; + int ci4 = UP_DIV(ci, C4NUM); + int co = outShape[3]; + int co4 = UP_DIV(co, C4NUM); + int a = inShape[0]; + int b = inShape[1]; - size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float); - padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); + size_t dtype_size = enable_fp16_ ? 
sizeof(uint16_t) : sizeof(float); + padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); - memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); - auto origin_weight = in_tensors_.at(kWeightIndex)->MutableData(); - int divCI = sizeCI.s[1]; - int divCO = sizeCO.s[1]; - int co = sizeCO.s[0]; + auto padWeightFp32 = reinterpret_cast(padWeight_); + auto padWeightFp16 = reinterpret_cast(padWeight_); + memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); + auto originWeightFp32 = reinterpret_cast(in_tensors_.at(kWeightIndex)->MutableData()); + auto originWeightFp16 = reinterpret_cast(in_tensors_.at(kWeightIndex)->MutableData()); + bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; + + // pad weight + // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) + // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) int index = 0; - for (int i = 0; i < divCI; ++i) { - for (int j = 0; j < divCO; ++j) { - for (int k = 0; k < C4NUM; ++k) { - for (int l = 0; l < C4NUM; ++l) { - int src_x = i * C4NUM + l; - int src_y = j * C4NUM + k; - if (src_x < sizeCI.s[0] && src_y < sizeCO.s[0]) { - if (enable_fp16_) { - if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { - reinterpret_cast(padWeight_)[index++] = - Float32ToShort(reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]); - } else { - reinterpret_cast(padWeight_)[index++] = - reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]; - } - } else { - if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { - reinterpret_cast(padWeight_)[index++] = - ShortToFloat32(reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]); + for (int aa = 0; aa < a; aa++) { + for (int bb = 0; bb < b; bb++) { + int baseAB = (aa * b + bb) * ci * co; + for (int i = 0; i < ci4; ++i) { + for (int j = 0; j < co4; ++j) { + for (int k = 0; k < C4NUM; ++k) { + for (int l = 0; l < C4NUM; ++l) { + int src_ci = i * C4NUM + l; + int src_co = j * C4NUM + k; + if (src_ci < ci && src_co < co) { + int originId = baseAB + src_ci * co + src_co; + if (transposeB) { + originId = baseAB + src_co * ci + src_ci; + } + if (enable_fp16_) { + if (!isModelFp16) { + padWeightFp16[index++] = originWeightFp32[originId]; + } else { + padWeightFp16[index++] = originWeightFp16[originId]; + } + } else { + if (!isModelFp16) { + padWeightFp32[index++] = originWeightFp32[originId]; + } else { + padWeightFp32[index++] = originWeightFp16[originId]; + } + } } else { - reinterpret_cast(padWeight_)[index++] = - reinterpret_cast(origin_weight)[src_y * sizeCI.s[0] + src_x]; + index++; } } - } else { - index++; } } } } } + // pad FC Bias size_t im_dst_x, im_dst_y; - im_dst_x = divCO; + im_dst_x = co4; im_dst_y = 1; size_t img_dtype = CL_FLOAT; if (enable_fp16_) { @@ -126,13 +149,18 @@ void MatMulOpenCLKernel::PadWeight() { std::vector img_size{im_dst_x, im_dst_y, img_dtype}; bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); - memset(bias_, 0x00, divCO * C4NUM * dtype_size); + memset(bias_, 0x00, co4 * C4NUM * dtype_size); if (in_tensors_.size() >= 3) { if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { auto fdata = reinterpret_cast(in_tensors_[2]->MutableData()); for (int i = 0; i < co; i++) { reinterpret_cast(bias_)[i] = Float32ToShort(fdata[i]); } + } else if 
(in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { + auto fdata = reinterpret_cast(in_tensors_[2]->MutableData()); + for (int i = 0; i < co; i++) { + reinterpret_cast(bias_)[i] = ShortToFloat32(fdata[i]); + } } else { memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size); } @@ -142,12 +170,23 @@ void MatMulOpenCLKernel::PadWeight() { int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - if (op_format_ == schema::Format::Format_NHWC4) { - im_dst_x = sizeCO.s[1]; - im_dst_y = 1; - } else if (op_format_ == schema::Format::Format_NC4HW4) { - im_dst_x = 1; - im_dst_y = sizeCO.s[1]; + auto out_shape = out_tensors_[0]->shape(); + int n = 1, h = 1, w = 1, c = 1; + if (dims == 2) { + n = out_shape[0]; + c = out_shape[1]; + } else if (dims == 4) { + n = out_shape[0]; + h = out_shape[1]; + w = out_shape[2]; + c = out_shape[3]; + } + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = n * h; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = w; + im_dst_y = n * UP_DIV(c, C4NUM) * h; } else { MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); return RET_ERROR; @@ -166,15 +205,19 @@ int MatMulOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); // local size should less than MAX_GROUP_SIZE - std::vector local = {64, 4}; - std::vector global = {UP_ROUND(sizeCO.s[1], local[0]), 4}; + std::vector local = {32, 4, 1}; + std::vector global = {UP_DIV(static_cast(outShape[3]), C4NUM), + 4 * static_cast(outShape[0]) * static_cast(outShape[1]), + static_cast(outShape[2])}; int arg_count = 0; + cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; + cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->MutableData()); ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_); ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI); - ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO); + ocl_runtime->SetKernelArg(kernel_, arg_count++, in_shape); + ocl_runtime->SetKernelArg(kernel_, arg_count++, out_shape); ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 
1 : 0); ocl_runtime->RunKernel(kernel_, global, local, nullptr); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h index dfd102107e..0a540f005f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h @@ -20,7 +20,7 @@ #include #include "src/runtime/kernel/opencl/opencl_kernel.h" -#include "nnacl/conv_parameter.h" +#include "nnacl/matmul_parameter.h" #include "src/runtime/opencl/opencl_runtime.h" namespace mindspore::kernel { @@ -29,7 +29,7 @@ class MatMulOpenCLKernel : public OpenCLKernel { public: explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, bool hasBias) - : OpenCLKernel(parameter, inputs, outputs) { + : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) { hasBias_ = hasBias; } ~MatMulOpenCLKernel() override{}; @@ -46,8 +46,12 @@ class MatMulOpenCLKernel : public OpenCLKernel { void *bias_; bool hasBias_{false}; bool enable_fp16_{false}; - cl_int2 sizeCI; - cl_int2 sizeCO; + bool transposeA{false}; + bool transposeB{true}; + int dims; + static constexpr int MAX_DIMS = 4; // max supported matmul dims + std::vector inShape; + std::vector outShape; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc new file mode 100644 index 0000000000..7a7b64c88b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -0,0 +1,166 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
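Editor's note: the repacking that MatMulOpenCLKernel::PadWeight performs above (source layout A x B x CO x CI when b_transpose_ is set, destination A*B blocks of CI4 x CO4 tiles of 4x4 values, i.e. one FLT16 per tile) can be sketched in plain C++ as follows. PackWeightRef is an illustrative name, only the fp32 path is shown, and the zero padding mirrors the index++ skip in the patch:

// Illustrative sketch, not part of the patch: fp32-only version of the PadWeight repack.
// dst[(((ab * ci4 + i) * co4 + j) * 4 + k) * 4 + l] = src[ab][co = j*4+k][ci = i*4+l],
// zero-filled where the source index runs past CI or CO.
#include <cassert>
#include <vector>

constexpr int C4NUM = 4;
inline int UpDiv(int x, int y) { return (x + y - 1) / y; }

std::vector<float> PackWeightRef(const std::vector<float> &src, int AB, int CO, int CI) {
  int ci4 = UpDiv(CI, C4NUM), co4 = UpDiv(CO, C4NUM);
  std::vector<float> dst(AB * ci4 * co4 * C4NUM * C4NUM, 0.0f);
  int index = 0;
  for (int ab = 0; ab < AB; ++ab) {
    for (int i = 0; i < ci4; ++i) {                       // CI tile
      for (int j = 0; j < co4; ++j) {                     // CO tile
        for (int k = 0; k < C4NUM; ++k) {                 // 4 from CO
          for (int l = 0; l < C4NUM; ++l, ++index) {      // 4 from CI
            int src_ci = i * C4NUM + l;
            int src_co = j * C4NUM + k;
            if (src_ci < CI && src_co < CO) {
              dst[index] = src[(ab * CO + src_co) * CI + src_ci];
            }
          }
        }
      }
    }
  }
  return dst;
}

int main() {
  // One batch, CO=3, CI=5, weight values 1..15 as in the 2-D matmul tests.
  std::vector<float> w(15);
  for (int i = 0; i < 15; ++i) w[i] = i + 1.0f;
  std::vector<float> packed = PackWeightRef(w, 1, 3, 5);
  // First FLT16 tile: ci 0..3 of output channels 0..2; channel 3 is zero padding,
  // so dot(v, w.s0123) in the kernel accumulates output channel 0.
  assert(packed[0] == 1.0f && packed[4] == 6.0f && packed[8] == 11.0f && packed[12] == 0.0f);
  return 0;
}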
+ */ + +#include +#include +#include +#include "include/errorcode.h" +#include "src/kernel_registry.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/kernel/reduce.h" +#include "src/runtime/kernel/opencl/cl/reduce.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_NULL_PTR; +using mindspore::lite::RET_OK; +using mindspore::lite::RET_PARAM_INVALID; +using mindspore::schema::PrimitiveType_Mean; +using mindspore::schema::PrimitiveType_Reduce; +using mindspore::schema::ReduceMode; +using mindspore::schema::ReduceMode_ReduceMax; +using mindspore::schema::ReduceMode_ReduceMean; +using mindspore::schema::ReduceMode_ReduceMin; +using mindspore::schema::ReduceMode_ReduceProd; +using mindspore::schema::ReduceMode_ReduceSum; +using mindspore::schema::ReduceMode_ReduceSumSquare; + +namespace mindspore::kernel { + +int ReduceOpenCLKernel::Init() { + InitNHWCShape(); + auto reduce_param = reinterpret_cast(op_parameter_); + if (reduce_param == nullptr) { + return RET_NULL_PTR; + } + std::map reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}}; + if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) { + MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; + return RET_PARAM_INVALID; + } + if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) && + (reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) { + MS_LOG(ERROR) << "reduce op only support axes HW"; + return RET_PARAM_INVALID; + } + std::string kernel_name = reduce_type2str.at(reduce_param->mode_); + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + enable_fp16_ = ocl_runtime->GetFp16Enable(); + + if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { + MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel" + << out_tensors_[0]->shape().back(); + return RET_ERROR; + } +#ifdef PROGRAM_WITH_IL + kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); +#else + std::set build_options; + std::string source = reduce_source; + ocl_runtime->LoadSource(kernel_name, source); + ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options); +#endif + in_ori_format_ = in_tensors_[0]->GetFormat(); + out_ori_format_ = out_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return RET_OK; +} + +void ReduceOpenCLKernel::InitNHWCShape() { + std::vector shapex = out_tensors_[0]->shape(); + size_t n = 1, h = 1, w = 1, c = 1; + if (shapex.size() == 2) { + n = shapex[0]; + c = shapex[1]; + } else if (shapex.size() == 4) { + n = shapex[0]; + h = shapex[1]; + w = shapex[2]; + c = shapex[3]; + } + nhwc_shape_ = {n, h, w, c}; +} + +int ReduceOpenCLKernel::ReSize() { return RET_OK; } + +int ReduceOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t im_dst_x, im_dst_y; + + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM); + im_dst_y = nhwc_shape_[0] * nhwc_shape_[1]; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = nhwc_shape_[2]; + im_dst_y = nhwc_shape_[0] * UP_DIV(nhwc_shape_[3], C4NUM) * nhwc_shape_[1]; + } else { + MS_LOG(ERROR) << "not support op format:" << 
EnumNameFormat(op_format_); + return RET_ERROR; + } + size_t img_dtype = CL_FLOAT; + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +int ReduceOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + std::vector shapex = in_tensors_[0]->shape(); + int h = shapex[1]; + int w = shapex[2]; + int c = shapex[3]; + int c4 = UP_DIV(c, C4NUM); + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + std::vector local = {}; + std::vector global = {static_cast(c4)}; + cl_int4 size = {h, w, c4, 1}; + int arg_idx = 0; + ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_idx++, size); + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + return RET_OK; +} + +kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::Context *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h new file mode 100644 index 0000000000..a3897dd18f --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h @@ -0,0 +1,48 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
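Editor's note: the GetImageSize overrides in this patch (matmul and reduce) map an N x H x W x C tensor onto a 2-D image with the same two formulas, depending on op_format_. A small sketch for reference; ImageExtents is an editor-chosen name, not code from the patch:

// Illustrative sketch, not part of the patch: image extents for the two layouts.
#include <cstddef>
#include <utility>

constexpr size_t C4NUM = 4;
inline size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }

// Returns {width, height} of the backing image2d_t for an N x H x W x C tensor.
std::pair<size_t, size_t> ImageExtents(size_t n, size_t h, size_t w, size_t c, bool nhwc4) {
  if (nhwc4) {
    // Format_NHWC4: channel slices packed along the image width.
    return {w * UpDiv(c, C4NUM), n * h};
  }
  // Format_NC4HW4: channel slices packed along the image height.
  return {w, n * UpDiv(c, C4NUM) * h};
}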
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ + +#include + +#include "src/lite_kernel.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "nnacl/reduce_parameter.h" + +namespace mindspore::kernel { +class ReduceOpenCLKernel : public OpenCLKernel { + public: + explicit ReduceOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + ~ReduceOpenCLKernel() override{}; + + int Init() override; + int ReSize() override; + int Run() override; + int GetImageSize(size_t idx, std::vector *img_size) override; + void InitNHWCShape(); + + private: + cl::Kernel kernel_; + bool enable_fp16_{false}; + std::vector nhwc_shape_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index ec208024a3..f4e56b600b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -73,18 +73,6 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te return RET_ERROR; } new_tensor->CopyTensor(*in_tensors[i]); - if ((dst_format == schema::Format::Format_NCHW || dst_format == schema::Format::Format_NC4HW4) && - (src_format == schema::Format::Format_NHWC || src_format == schema::Format::Format_NHWC4)) { - auto shape = new_tensor->shape(); - std::vector dst_shape{shape[0], shape[3], shape[1], shape[2]}; - new_tensor->set_shape(shape); - } - if ((dst_format == schema::Format::Format_NHWC || dst_format == schema::Format::Format_NHWC4) && - (src_format == schema::Format::Format_NCHW || src_format == schema::Format::Format_NC4HW4)) { - auto shape = new_tensor->shape(); - std::vector dst_shape{shape[0], shape[2], shape[3], shape[1]}; - new_tensor->set_shape(shape); - } if (mem_type == OpenCLMemType::IMG) { new_tensor->SetFormat(dst_format); in_tensors[i]->SetFormat(src_format); diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index a6222c0615..c86374c506 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -127,6 +127,7 @@ if (SUPPORT_GPU) ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc + ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reduce.cc ) endif() ### minddata lite @@ -315,6 +316,7 @@ if (SUPPORT_GPU) ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc + ${TEST_DIR}/ut/src/runtime/kernel/opencl/reduce_tests.cc ) endif() diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc index 8a09409592..dd46e107d4 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc @@ -30,7 +30,7 @@ class TestMatMulOpenCL : public mindspore::CommonTest { }; void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *weight_data, void *output_data, - bool enable_fp16) { + bool enable_fp16, int dims) { 
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); size_t dtype_size = sizeof(float); @@ -39,20 +39,41 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we dtype_size = sizeof(int16_t); } auto allocator = ocl_runtime->GetAllocator(); - int ci = shape[0]; - int co = shape[1]; - std::vector input_shape = {1, ci}; + std::vector input_shape, output_shape, weight_shape; + if (dims == 2) { + int ci = shape[0]; + int co = shape[1]; + input_shape = {1, ci}; + output_shape = {1, co}; + weight_shape = {co, ci}; + } else if (dims == 4) { + int a = shape[0]; + int b = shape[1]; + int m = shape[2]; + int ci = shape[3]; + int co = shape[4]; + input_shape = {a, b, m, ci}; + output_shape = {a, b, m, co}; + weight_shape = {a, b, co, ci}; + } + auto param_ptr = std::make_unique(); + auto param = param_ptr.get(); + if (param == nullptr) { + MS_LOG(ERROR) << "param_ptr create error."; + return; + } + param->a_transpose_ = false; + param->b_transpose_ = true; auto tensor_x_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), - input_shape, schema::Format_NC); + input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); auto tensor_x = tensor_x_ptr.get(); if (tensor_x == nullptr) { MS_LOG(ERROR) << "tensor_x create error."; return; } - std::vector w_shape = {co, ci}; - auto tensor_w_ptr = - std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), w_shape); + auto tensor_w_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), + weight_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); auto tensor_w = tensor_w_ptr.get(); if (tensor_w == nullptr) { MS_LOG(ERROR) << "tensor_w create error."; @@ -60,9 +81,9 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we } tensor_w->SetData(weight_data); - std::vector out_shape = {1, co}; - auto tensor_out_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), - out_shape, schema::Format_NC); + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), output_shape, + dims == 2 ? 
schema::Format_NC : schema::Format_NHWC); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -70,7 +91,8 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we } std::vector inputs{tensor_x, tensor_w}; std::vector outputs{tensor_out}; - auto op_kernel_ptr = std::make_unique(nullptr, inputs, outputs, false); + auto op_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs, false); auto op_kernel = op_kernel_ptr.get(); if (op_kernel == nullptr) { MS_LOG(ERROR) << "op_kernel create error."; @@ -89,12 +111,13 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we return; } pGraph->Init(); - memcpy(inputs[0]->MutableData(), input_data, ci * dtype_size); + memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size); pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast(1e-3), 2e-2); + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), + 2e-2); } else { - CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast(1e-5)); + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-5)); } tensor_x->SetData(nullptr); @@ -125,7 +148,7 @@ void RunTestCaseMatMul(const std::vector shape, const std::vector weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; std::vector output_data = {10.f, 10.f, 10.f}; - RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false); + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2); } TEST_F(TestMatMulOpenCL, MatMulFp16_2) { @@ -167,6 +190,40 @@ TEST_F(TestMatMulOpenCL, MatMulFp16_2) { std::vector weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; std::vector output_data = {10.f, 10.f, 10.f}; - RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true); + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2); +} + +TEST_F(TestMatMulOpenCL, MatMulFp32_4D) { + int a = 1; + int b = 2; + int c = 2; + int ci = 5; + int co = 3; + std::vector shape = {a, b, c, ci, co}; + std::vector input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, + 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f}; + std::vector output_data = {15.0f, 40.0f, 65.0f, 15.0f, 40.0f, 65.0f, + 90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f}; + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4); +} + +TEST_F(TestMatMulOpenCL, MatMulFp16_4D) { + int a = 1; + int b = 2; + int c = 2; + int ci = 5; + int co = 3; + std::vector shape = {a, b, c, ci, co}; + std::vector input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, + 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f}; + std::vector output_data = {15.0f, 
40.0f, 65.0f, 15.0f, 40.0f, 65.0f, + 90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f}; + RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 4); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc new file mode 100644 index 0000000000..54959f14b7 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc @@ -0,0 +1,156 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "mindspore/core/utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" + +namespace mindspore { +class TestReduceOpenCL : public mindspore::CommonTest { + public: + TestReduceOpenCL() {} +}; + +void RunTestCaseReduce(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16, + int reduce_mode) { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); + } + auto allocator = ocl_runtime->GetAllocator(); + auto param_ptr = std::make_unique(); + auto param = param_ptr.get(); + if (param == nullptr) { + MS_LOG(ERROR) << "param_ptr create error."; + return; + } + param->axes_[0] = 1; + param->axes_[1] = 2; + param->num_axes_ = 2; + param->mode_ = reduce_mode; + int n = shape[0]; + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + std::vector input_shape = {n, h, w, c}; + auto tensor_x_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), + input_shape, schema::Format_NHWC); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; + return; + } + std::vector out_shape = {n, c}; + auto tensor_out_ptr = std::make_unique(TypeId(enable_fp16 ? 
kNumberTypeFloat16 : kNumberTypeFloat32), + out_shape, schema::Format_NC); + auto tensor_out = tensor_out_ptr.get(); + if (tensor_out == nullptr) { + MS_LOG(ERROR) << "tensor_out create error."; + return; + } + std::vector inputs{tensor_x}; + std::vector outputs{tensor_out}; + auto arith_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs); + auto arith_kernel = arith_kernel_ptr.get(); + if (arith_kernel == nullptr) { + MS_LOG(ERROR) << "arith_kernel create error."; + return; + } + arith_kernel->Init(); + + inputs[0]->MallocData(allocator); + + std::vector kernels{arith_kernel}; + auto pGraph_ptr = std::make_unique(inputs, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); + if (pGraph == nullptr) { + MS_LOG(ERROR) << "pGraph create error."; + return; + } + pGraph->Init(); + memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size); + pGraph->Run(); + + if (enable_fp16) { + CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), + 2e-2); + } else { + CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); + } + inputs[0]->SetData(nullptr); + outputs[0]->SetData(nullptr); + + MS_LOG(INFO) << "Test Reduce passed"; + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +TEST_F(TestReduceOpenCL, ReduceMeanFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {4.5f, 5.5f, 6.5f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean); +} + +TEST_F(TestReduceOpenCL, ReduceMeanFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {4.5f, 5.5f, 6.5f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean); +} + +TEST_F(TestReduceOpenCL, ReduceSumFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {18.0f, 22.0f, 26.0f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum); +} + +TEST_F(TestReduceOpenCL, ReduceSumFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector shape = {n, h, w, c}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {18.0f, 22.0f, 26.0f}; + + RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum); +} +} // namespace mindspore
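Editor's note: as a cross-check of the expected values used in the reduce tests above, a plain C++ reference of the H/W reduction implemented by reduce.cl (ReduceHWRef is an illustrative name; the OpenCL kernels additionally operate on C4-packed slices):

// Illustrative sketch, not part of the patch: mean/sum over the H and W axes of an
// NHWC tensor, producing an NC result, matching the mean_* and sum_* kernels.
#include <cassert>
#include <vector>

std::vector<float> ReduceHWRef(const std::vector<float> &in, int N, int H, int W, int C, bool mean) {
  std::vector<float> out(N * C, 0.0f);
  for (int n = 0; n < N; ++n) {
    for (int h = 0; h < H; ++h) {
      for (int w = 0; w < W; ++w) {
        for (int c = 0; c < C; ++c) {
          out[n * C + c] += in[((n * H + h) * W + w) * C + c];
        }
      }
    }
  }
  if (mean) {
    for (float &v : out) v /= static_cast<float>(H * W);
  }
  return out;
}

int main() {
  // Reproduces the ReduceMeanFp32 / ReduceSumFp32 expectations for a 1x2x2x3 input 0..11.
  std::vector<float> in(12);
  for (int i = 0; i < 12; ++i) in[i] = static_cast<float>(i);
  assert(ReduceHWRef(in, 1, 2, 2, 3, true) == std::vector<float>({4.5f, 5.5f, 6.5f}));
  assert(ReduceHWRef(in, 1, 2, 2, 3, false) == std::vector<float>({18.0f, 22.0f, 26.0f}));
  return 0;
}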