!5898 [MS][LITE][GPU]add reduce op and batchmatmul op

Merge pull request !5898 from chenzupeng/master-lite
pull/5898/MERGE
mindspore-ci-bot 5 years ago committed by Gitee
commit ad37b6845f

@ -1,57 +1,146 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
int2 gid = (int2)(get_global_id(0), get_global_id(1)); int gidx = get_global_id(0); // CO4
int2 lid = (int2)(get_local_id(0), get_local_id(1)); int gidz = get_global_id(2); // N
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int ci4 = UP_DIV(in_shape.w, C4NUM);
int co4 = UP_DIV(out_shape.w, C4NUM);
int n = out_shape.z;
bool inside = gidx < co4 && gidz < n;
FLT4 result = (FLT4)(0.0f); FLT4 result = (FLT4)(0.0f);
bool inside = gid.x < offset_co.y; for (uint i = lidy; i < ci4 && inside; i += 4) {
for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, gidz));
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0)); FLT16 w = weight[i * co4 + gidx];
FLT16 w = weight[gid.x + i * offset_co.y];
result.x += dot(v, w.s0123); result.x += dot(v, w.s0123);
result.y += dot(v, w.s4567); result.y += dot(v, w.s4567);
result.z += dot(v, w.s89ab); result.z += dot(v, w.s89ab);
result.w += dot(v, w.scdef); result.w += dot(v, w.scdef);
} }
__local FLT4 temp[64][4]; WRITE_IMAGE(output, (int2)(gidx, gidz), result);
temp[lid.x][lid.y] = result; __local FLT4 temp[32][4];
temp[lidx][lidy] = result;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (lid.y == 0 && inside) { if (lidy == 0 && inside) {
result += temp[lid.x][1]; result += temp[lidx][1];
result += temp[lid.x][2]; result += temp[lidx][2];
result += temp[lid.x][3]; result += temp[lidx][3];
if (has_bias != 0) { if (has_bias != 0) {
result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
} }
WRITE_IMAGE(output, (int2)(gid.x, 0), result); WRITE_IMAGE(output, (int2)(gidx, gidz), result);
} }
} }
__kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
int2 gid = (int2)(get_global_id(0), get_global_id(1)); int gidx = get_global_id(0); // CO4
int2 lid = (int2)(get_local_id(0), get_local_id(1)); int gidz = get_global_id(2); // N
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int ci4 = UP_DIV(in_shape.w, C4NUM);
int co4 = UP_DIV(out_shape.w, C4NUM);
int n = out_shape.z;
bool inside = gidx < co4 && gidz < n;
FLT4 result = (FLT4)(0.0f); FLT4 result = (FLT4)(0.0f);
bool inside = gid.x < offset_co.y; for (uint i = lidy; i < ci4 && inside; i += 4) {
for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0));
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i)); FLT16 w = weight[i * co4 + gidx];
FLT16 w = weight[gid.x + i * offset_co.y];
result.x += dot(v, w.s0123); result.x += dot(v, w.s0123);
result.y += dot(v, w.s4567); result.y += dot(v, w.s4567);
result.z += dot(v, w.s89ab); result.z += dot(v, w.s89ab);
result.w += dot(v, w.scdef); result.w += dot(v, w.scdef);
} }
__local FLT4 temp[64][4]; __local FLT4 temp[32][4];
temp[lid.x][lid.y] = result; temp[lidx][lidy] = result;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (lid.y == 0 && inside) { if (lidy == 0 && inside) {
result += temp[lid.x][1]; result += temp[lidx][1];
result += temp[lid.x][2]; result += temp[lidx][2];
result += temp[lid.x][3]; result += temp[lidx][3];
if (has_bias != 0) { if (has_bias != 0) {
result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
} }
WRITE_IMAGE(output, (int2)(0, gid.x), result); WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result);
}
}
__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
int gidz = get_global_id(2); // W
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int ci4 = UP_DIV(in_shape.w, C4NUM);
int co4 = UP_DIV(out_shape.w, C4NUM);
int n = out_shape.x;
int h = out_shape.y;
int w = out_shape.z;
int nh_index = gidy / 4;
bool inside = gidx < co4 && gidz < w && nh_index < n * h;
FLT4 result = (FLT4)(0.0f);
for (uint i = lidy; i < ci4 && inside; i += 4) {
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, nh_index));
FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx];
result.x += dot(v, weight_value.s0123);
result.y += dot(v, weight_value.s4567);
result.z += dot(v, weight_value.s89ab);
result.w += dot(v, weight_value.scdef);
}
__local FLT4 temp[32][4];
temp[lidx][lidy] = result;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidy == 0 && inside) {
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
if (has_bias != 0) {
result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
}
WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
}
}
__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
__write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
int gidz = get_global_id(2); // W
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int ci4 = UP_DIV(in_shape.w, C4NUM);
int co4 = UP_DIV(out_shape.w, C4NUM);
int n = out_shape.x;
int h = out_shape.y;
int w = out_shape.z;
int nh_index = gidy / 4;
bool inside = gidx < co4 && gidz < w && nh_index < n * h;
int n_index = nh_index / h;
int h_index = nh_index % h;
FLT4 result = (FLT4)(0.0f);
for (uint i = lidy; i < ci4 && inside; i += 4) {
FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index));
FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx];
result.x += dot(v, weight_value.s0123);
result.y += dot(v, weight_value.s4567);
result.z += dot(v, weight_value.s89ab);
result.w += dot(v, weight_value.scdef);
}
__local FLT4 temp[32][4];
temp[lidx][lidy] = result;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidy == 0 && inside) {
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
if (has_bias != 0) {
result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
}
WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result);
} }
} }

@ -0,0 +1,61 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
int X = get_global_id(0); // C4
if (X >= size.z) {
return;
}
FLT4 result = (FLT4)0.f;
for (int h = 0; h < size.x; h++) {
for (int w = 0; w < size.y; w++) {
result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
}
}
result /= size.x * size.y;
WRITE_IMAGE(dst_data, (int2)(X, 0), result);
}
__kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
int X = get_global_id(0); // C4
if (X >= size.z) {
return;
}
FLT4 result = (FLT4)0.f;
for (int h = 0; h < size.x; h++) {
for (int w = 0; w < size.y; w++) {
result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h));
}
}
result /= size.x * size.y;
WRITE_IMAGE(dst_data, (int2)(0, X), result);
}
__kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
int X = get_global_id(0); // C4
if (X >= size.z) {
return;
}
FLT4 result = (FLT4)0.f;
for (int h = 0; h < size.x; h++) {
for (int w = 0; w < size.y; w++) {
result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
}
}
WRITE_IMAGE(dst_data, (int2)(X, 0), result);
}
__kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
int X = get_global_id(0); // C4
if (X >= size.z) {
return;
}
FLT4 result = (FLT4)0.f;
for (int h = 0; h < size.x; h++) {
for (int w = 0; w < size.y; w++) {
result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h));
}
}
WRITE_IMAGE(dst_data, (int2)(0, X), result);
}

@ -45,7 +45,6 @@ class ActivationOpenClKernel : public OpenCLKernel {
cl::Kernel kernel_; cl::Kernel kernel_;
int type_; int type_;
float alpha_; float alpha_;
void *alpha_buff_;
int in_size_; int in_size_;
int out_size_; int out_size_;
size_t fp_size; size_t fp_size;

@ -94,14 +94,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * co + co_offset; int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * co + co_offset;
if (enable_fp16_) { if (enable_fp16_) {
if (weight_dtype == kNumberTypeFloat32) { if (weight_dtype == kNumberTypeFloat32) {
reinterpret_cast<uint16_t *>(padWeight_)[index++] = reinterpret_cast<float16_t *>(padWeight_)[index++] =
Float32ToShort(reinterpret_cast<float *>(origin_weight)[ori_index]); reinterpret_cast<float *>(origin_weight)[ori_index];
} else { } else {
reinterpret_cast<uint16_t *>(padWeight_)[index++] = reinterpret_cast<float16_t *>(padWeight_)[index++] =
reinterpret_cast<uint16_t *>(origin_weight)[ori_index]; reinterpret_cast<float16_t *>(origin_weight)[ori_index];
} }
} else { } else {
reinterpret_cast<float *>(padWeight_)[index++] = reinterpret_cast<float *>(origin_weight)[ori_index]; if (weight_dtype == kNumberTypeFloat32) {
reinterpret_cast<float *>(padWeight_)[index++] =
reinterpret_cast<float *>(origin_weight)[ori_index];
} else {
reinterpret_cast<float *>(padWeight_)[index++] =
reinterpret_cast<float16_t *>(origin_weight)[ori_index];
}
} }
} else { } else {
index++; index++;

@ -1,5 +1,5 @@
/** /**
* Copyright 2019 Huawei Technologies Co., Ltd * Copyright 2019 Huawei Technologies n., Ltd
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -16,10 +16,10 @@
#include <set> #include <set>
#include <string> #include <string>
#include <map>
#include "nnacl/fp32/common_func.h" #include "nnacl/fp32/common_func.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/opencl/opencl_runtime.h"
#include "nnacl/fp32/matmul.h"
#include "src/runtime/kernel/opencl/kernel/matmul.h" #include "src/runtime/kernel/opencl/kernel/matmul.h"
#ifndef PROGRAM_WITH_IL #ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/matmul.cl.inc" #include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
@ -36,7 +36,26 @@ int MatMulOpenCLKernel::Init() {
std::string kernel_name = "MatMul"; std::string kernel_name = "MatMul";
kernel_name += "_" + std::string(EnumNameFormat(op_format_)); kernel_name += "_" + std::string(EnumNameFormat(op_format_));
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
transposeA = param->a_transpose_;
if (transposeA) {
MS_LOG(ERROR) << "matmul only support a_transpose_=false yet.";
return RET_ERROR;
}
transposeB = param->b_transpose_;
enable_fp16_ = ocl_runtime->GetFp16Enable(); enable_fp16_ = ocl_runtime->GetFp16Enable();
if (in_tensors_[0]->shape().size() != out_tensors_[0]->shape().size() ||
(in_tensors_[0]->shape().size() != 2 && in_tensors_[0]->shape().size() != 4)) {
MS_LOG(ERROR) << "matmul only support input shape size=2 or 4.";
return RET_ERROR;
}
dims = in_tensors_[0]->shape().size();
for (int i = 0; i < dims; i++) {
inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i];
outShape[MAX_DIMS - dims + i] = out_tensors_[0]->shape()[i];
}
std::map<int, std::string> dims2str = {{2, "_2d"}, {4, "_4d"}};
kernel_name += dims2str[dims];
#ifdef PROGRAM_WITH_IL #ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
#else #else
@ -46,21 +65,7 @@ int MatMulOpenCLKernel::Init() {
ocl_runtime->LoadSource(program_name, source); ocl_runtime->LoadSource(program_name, source);
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif #endif
int ci, co;
if (in_tensors_[1]->shape().size() != 2) {
MS_LOG(ERROR) << "matmul do not support input shape size=" << in_tensors_[1]->shape().size();
return RET_ERROR;
}
if (in_tensors_[1]->shape().size() == 2) {
ci = in_tensors_[1]->shape()[1];
co = in_tensors_[1]->shape()[0];
} else {
ci = in_tensors_[1]->shape()[3];
co = in_tensors_[1]->shape()[0];
}
sizeCI = {ci, UP_DIV(ci, C4NUM)};
sizeCO = {co, UP_DIV(co, C4NUM)};
PadWeight(); PadWeight();
in_ori_format_ = in_tensors_[0]->GetFormat(); in_ori_format_ = in_tensors_[0]->GetFormat();
out_ori_format_ = out_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat();
@ -73,51 +78,69 @@ int MatMulOpenCLKernel::Init() {
int MatMulOpenCLKernel::ReSize() { return RET_OK; } int MatMulOpenCLKernel::ReSize() { return RET_OK; }
void MatMulOpenCLKernel::PadWeight() { void MatMulOpenCLKernel::PadWeight() {
// ABMCI @ ABCICO = ABMCO
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
int ci = inShape[3];
int ci4 = UP_DIV(ci, C4NUM);
int co = outShape[3];
int co4 = UP_DIV(co, C4NUM);
int a = inShape[0];
int b = inShape[1];
size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float); size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
auto origin_weight = in_tensors_.at(kWeightIndex)->MutableData(); auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
int divCI = sizeCI.s[1]; memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
int divCO = sizeCO.s[1]; auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->MutableData());
int co = sizeCO.s[0]; auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->MutableData());
bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
// pad weight
// ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
// if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI)
int index = 0; int index = 0;
for (int i = 0; i < divCI; ++i) { for (int aa = 0; aa < a; aa++) {
for (int j = 0; j < divCO; ++j) { for (int bb = 0; bb < b; bb++) {
for (int k = 0; k < C4NUM; ++k) { int baseAB = (aa * b + bb) * ci * co;
for (int l = 0; l < C4NUM; ++l) { for (int i = 0; i < ci4; ++i) {
int src_x = i * C4NUM + l; for (int j = 0; j < co4; ++j) {
int src_y = j * C4NUM + k; for (int k = 0; k < C4NUM; ++k) {
if (src_x < sizeCI.s[0] && src_y < sizeCO.s[0]) { for (int l = 0; l < C4NUM; ++l) {
if (enable_fp16_) { int src_ci = i * C4NUM + l;
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { int src_co = j * C4NUM + k;
reinterpret_cast<uint16_t *>(padWeight_)[index++] = if (src_ci < ci && src_co < co) {
Float32ToShort(reinterpret_cast<float *>(origin_weight)[src_y * sizeCI.s[0] + src_x]); int originId = baseAB + src_ci * co + src_co;
} else { if (transposeB) {
reinterpret_cast<uint16_t *>(padWeight_)[index++] = originId = baseAB + src_co * ci + src_ci;
reinterpret_cast<uint16_t *>(origin_weight)[src_y * sizeCI.s[0] + src_x]; }
} if (enable_fp16_) {
} else { if (!isModelFp16) {
if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { padWeightFp16[index++] = originWeightFp32[originId];
reinterpret_cast<float *>(padWeight_)[index++] = } else {
ShortToFloat32(reinterpret_cast<uint16_t *>(origin_weight)[src_y * sizeCI.s[0] + src_x]); padWeightFp16[index++] = originWeightFp16[originId];
}
} else {
if (!isModelFp16) {
padWeightFp32[index++] = originWeightFp32[originId];
} else {
padWeightFp32[index++] = originWeightFp16[originId];
}
}
} else { } else {
reinterpret_cast<float *>(padWeight_)[index++] = index++;
reinterpret_cast<float *>(origin_weight)[src_y * sizeCI.s[0] + src_x];
} }
} }
} else {
index++;
} }
} }
} }
} }
} }
// pad FC Bias
size_t im_dst_x, im_dst_y; size_t im_dst_x, im_dst_y;
im_dst_x = divCO; im_dst_x = co4;
im_dst_y = 1; im_dst_y = 1;
size_t img_dtype = CL_FLOAT; size_t img_dtype = CL_FLOAT;
if (enable_fp16_) { if (enable_fp16_) {
@ -126,13 +149,18 @@ void MatMulOpenCLKernel::PadWeight() {
std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype}; std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype};
bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
memset(bias_, 0x00, divCO * C4NUM * dtype_size); memset(bias_, 0x00, co4 * C4NUM * dtype_size);
if (in_tensors_.size() >= 3) { if (in_tensors_.size() >= 3) {
if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData()); auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData());
for (int i = 0; i < co; i++) { for (int i = 0; i < co; i++) {
reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]); reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]);
} }
} else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
auto fdata = reinterpret_cast<uint16_t *>(in_tensors_[2]->MutableData());
for (int i = 0; i < co; i++) {
reinterpret_cast<float *>(bias_)[i] = ShortToFloat32(fdata[i]);
}
} else { } else {
memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size); memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size);
} }
@ -142,12 +170,23 @@ void MatMulOpenCLKernel::PadWeight() {
int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y; size_t im_dst_x, im_dst_y;
if (op_format_ == schema::Format::Format_NHWC4) { auto out_shape = out_tensors_[0]->shape();
im_dst_x = sizeCO.s[1]; int n = 1, h = 1, w = 1, c = 1;
im_dst_y = 1; if (dims == 2) {
} else if (op_format_ == schema::Format::Format_NC4HW4) { n = out_shape[0];
im_dst_x = 1; c = out_shape[1];
im_dst_y = sizeCO.s[1]; } else if (dims == 4) {
n = out_shape[0];
h = out_shape[1];
w = out_shape[2];
c = out_shape[3];
}
if (op_format_ == schema::Format_NHWC4) {
im_dst_x = w * UP_DIV(c, C4NUM);
im_dst_y = n * h;
} else if (op_format_ == schema::Format_NC4HW4) {
im_dst_x = w;
im_dst_y = n * UP_DIV(c, C4NUM) * h;
} else { } else {
MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
return RET_ERROR; return RET_ERROR;
@ -166,15 +205,19 @@ int MatMulOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
// local size should less than MAX_GROUP_SIZE // local size should less than MAX_GROUP_SIZE
std::vector<size_t> local = {64, 4}; std::vector<size_t> local = {32, 4, 1};
std::vector<size_t> global = {UP_ROUND(sizeCO.s[1], local[0]), 4}; std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]),
static_cast<size_t>(outShape[2])};
int arg_count = 0; int arg_count = 0;
cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->MutableData()); ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->MutableData());
ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_); ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_);
ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->MutableData()); ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->MutableData());
ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI); ocl_runtime->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO); ocl_runtime->SetKernelArg(kernel_, arg_count++, out_shape);
ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0); ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
ocl_runtime->RunKernel(kernel_, global, local, nullptr); ocl_runtime->RunKernel(kernel_, global, local, nullptr);
return RET_OK; return RET_OK;

@ -20,7 +20,7 @@
#include <vector> #include <vector>
#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "nnacl/conv_parameter.h" #include "nnacl/matmul_parameter.h"
#include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/opencl/opencl_runtime.h"
namespace mindspore::kernel { namespace mindspore::kernel {
@ -29,7 +29,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
public: public:
explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, bool hasBias) const std::vector<lite::Tensor *> &outputs, bool hasBias)
: OpenCLKernel(parameter, inputs, outputs) { : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) {
hasBias_ = hasBias; hasBias_ = hasBias;
} }
~MatMulOpenCLKernel() override{}; ~MatMulOpenCLKernel() override{};
@ -46,8 +46,12 @@ class MatMulOpenCLKernel : public OpenCLKernel {
void *bias_; void *bias_;
bool hasBias_{false}; bool hasBias_{false};
bool enable_fp16_{false}; bool enable_fp16_{false};
cl_int2 sizeCI; bool transposeA{false};
cl_int2 sizeCO; bool transposeB{true};
int dims;
static constexpr int MAX_DIMS = 4; // max supported matmul dims
std::vector<int> inShape;
std::vector<int> outShape;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel

@ -0,0 +1,166 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <set>
#include <string>
#include <map>
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/reduce.h"
#include "src/runtime/kernel/opencl/cl/reduce.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::lite::RET_PARAM_INVALID;
using mindspore::schema::PrimitiveType_Mean;
using mindspore::schema::PrimitiveType_Reduce;
using mindspore::schema::ReduceMode;
using mindspore::schema::ReduceMode_ReduceMax;
using mindspore::schema::ReduceMode_ReduceMean;
using mindspore::schema::ReduceMode_ReduceMin;
using mindspore::schema::ReduceMode_ReduceProd;
using mindspore::schema::ReduceMode_ReduceSum;
using mindspore::schema::ReduceMode_ReduceSumSquare;
namespace mindspore::kernel {
int ReduceOpenCLKernel::Init() {
InitNHWCShape();
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (reduce_param == nullptr) {
return RET_NULL_PTR;
}
std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}};
if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) {
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
return RET_PARAM_INVALID;
}
if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) &&
(reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) {
MS_LOG(ERROR) << "reduce op only support axes HW";
return RET_PARAM_INVALID;
}
std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
enable_fp16_ = ocl_runtime->GetFp16Enable();
if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
<< out_tensors_[0]->shape().back();
return RET_ERROR;
}
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
#else
std::set<std::string> build_options;
std::string source = reduce_source;
ocl_runtime->LoadSource(kernel_name, source);
ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options);
#endif
in_ori_format_ = in_tensors_[0]->GetFormat();
out_ori_format_ = out_tensors_[0]->GetFormat();
in_tensors_[0]->SetFormat(op_format_);
out_tensors_[0]->SetFormat(op_format_);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void ReduceOpenCLKernel::InitNHWCShape() {
std::vector<int> shapex = out_tensors_[0]->shape();
size_t n = 1, h = 1, w = 1, c = 1;
if (shapex.size() == 2) {
n = shapex[0];
c = shapex[1];
} else if (shapex.size() == 4) {
n = shapex[0];
h = shapex[1];
w = shapex[2];
c = shapex[3];
}
nhwc_shape_ = {n, h, w, c};
}
int ReduceOpenCLKernel::ReSize() { return RET_OK; }
int ReduceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y;
if (op_format_ == schema::Format_NHWC4) {
im_dst_x = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM);
im_dst_y = nhwc_shape_[0] * nhwc_shape_[1];
} else if (op_format_ == schema::Format_NC4HW4) {
im_dst_x = nhwc_shape_[2];
im_dst_y = nhwc_shape_[0] * UP_DIV(nhwc_shape_[3], C4NUM) * nhwc_shape_[1];
} else {
MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
return RET_ERROR;
}
size_t img_dtype = CL_FLOAT;
if (enable_fp16_) {
img_dtype = CL_HALF_FLOAT;
}
img_size->clear();
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
*img_size = vec;
return RET_OK;
}
int ReduceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
int c4 = UP_DIV(c, C4NUM);
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
std::vector<size_t> local = {};
std::vector<size_t> global = {static_cast<size_t>(c4)};
cl_int4 size = {h, w, c4, 1};
int arg_idx = 0;
ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->MutableData());
ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData());
ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::Context *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator)
} // namespace mindspore::kernel

@ -0,0 +1,48 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "nnacl/reduce_parameter.h"
namespace mindspore::kernel {
class ReduceOpenCLKernel : public OpenCLKernel {
public:
explicit ReduceOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs)
: OpenCLKernel(parameter, inputs, outputs) {}
~ReduceOpenCLKernel() override{};
int Init() override;
int ReSize() override;
int Run() override;
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
void InitNHWCShape();
private:
cl::Kernel kernel_;
bool enable_fp16_{false};
std::vector<size_t> nhwc_shape_;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_

@ -73,18 +73,6 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
return RET_ERROR; return RET_ERROR;
} }
new_tensor->CopyTensor(*in_tensors[i]); new_tensor->CopyTensor(*in_tensors[i]);
if ((dst_format == schema::Format::Format_NCHW || dst_format == schema::Format::Format_NC4HW4) &&
(src_format == schema::Format::Format_NHWC || src_format == schema::Format::Format_NHWC4)) {
auto shape = new_tensor->shape();
std::vector<int> dst_shape{shape[0], shape[3], shape[1], shape[2]};
new_tensor->set_shape(shape);
}
if ((dst_format == schema::Format::Format_NHWC || dst_format == schema::Format::Format_NHWC4) &&
(src_format == schema::Format::Format_NCHW || src_format == schema::Format::Format_NC4HW4)) {
auto shape = new_tensor->shape();
std::vector<int> dst_shape{shape[0], shape[2], shape[3], shape[1]};
new_tensor->set_shape(shape);
}
if (mem_type == OpenCLMemType::IMG) { if (mem_type == OpenCLMemType::IMG) {
new_tensor->SetFormat(dst_format); new_tensor->SetFormat(dst_format);
in_tensors[i]->SetFormat(src_format); in_tensors[i]->SetFormat(src_format);

@ -127,6 +127,7 @@ if (SUPPORT_GPU)
${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/reduce.cc
) )
endif() endif()
### minddata lite ### minddata lite
@ -315,6 +316,7 @@ if (SUPPORT_GPU)
${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/reduce_tests.cc
) )
endif() endif()

@ -30,7 +30,7 @@ class TestMatMulOpenCL : public mindspore::CommonTest {
}; };
void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *weight_data, void *output_data, void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *weight_data, void *output_data,
bool enable_fp16) { bool enable_fp16, int dims) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init(); ocl_runtime->Init();
size_t dtype_size = sizeof(float); size_t dtype_size = sizeof(float);
@ -39,20 +39,41 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
dtype_size = sizeof(int16_t); dtype_size = sizeof(int16_t);
} }
auto allocator = ocl_runtime->GetAllocator(); auto allocator = ocl_runtime->GetAllocator();
int ci = shape[0]; std::vector<int> input_shape, output_shape, weight_shape;
int co = shape[1]; if (dims == 2) {
std::vector<int> input_shape = {1, ci}; int ci = shape[0];
int co = shape[1];
input_shape = {1, ci};
output_shape = {1, co};
weight_shape = {co, ci};
} else if (dims == 4) {
int a = shape[0];
int b = shape[1];
int m = shape[2];
int ci = shape[3];
int co = shape[4];
input_shape = {a, b, m, ci};
output_shape = {a, b, m, co};
weight_shape = {a, b, co, ci};
}
auto param_ptr = std::make_unique<MatMulParameter>();
auto param = param_ptr.get();
if (param == nullptr) {
MS_LOG(ERROR) << "param_ptr create error.";
return;
}
param->a_transpose_ = false;
param->b_transpose_ = true;
auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
input_shape, schema::Format_NC); input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
auto tensor_x = tensor_x_ptr.get(); auto tensor_x = tensor_x_ptr.get();
if (tensor_x == nullptr) { if (tensor_x == nullptr) {
MS_LOG(ERROR) << "tensor_x create error."; MS_LOG(ERROR) << "tensor_x create error.";
return; return;
} }
std::vector<int> w_shape = {co, ci}; auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
auto tensor_w_ptr = weight_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), w_shape);
auto tensor_w = tensor_w_ptr.get(); auto tensor_w = tensor_w_ptr.get();
if (tensor_w == nullptr) { if (tensor_w == nullptr) {
MS_LOG(ERROR) << "tensor_w create error."; MS_LOG(ERROR) << "tensor_w create error.";
@ -60,9 +81,9 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
} }
tensor_w->SetData(weight_data); tensor_w->SetData(weight_data);
std::vector<int> out_shape = {1, co}; auto tensor_out_ptr =
auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), output_shape,
out_shape, schema::Format_NC); dims == 2 ? schema::Format_NC : schema::Format_NHWC);
auto tensor_out = tensor_out_ptr.get(); auto tensor_out = tensor_out_ptr.get();
if (tensor_out == nullptr) { if (tensor_out == nullptr) {
MS_LOG(ERROR) << "tensor_out create error."; MS_LOG(ERROR) << "tensor_out create error.";
@ -70,7 +91,8 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
} }
std::vector<lite::Tensor *> inputs{tensor_x, tensor_w}; std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
std::vector<lite::Tensor *> outputs{tensor_out}; std::vector<lite::Tensor *> outputs{tensor_out};
auto op_kernel_ptr = std::make_unique<kernel::MatMulOpenCLKernel>(nullptr, inputs, outputs, false); auto op_kernel_ptr =
std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs, false);
auto op_kernel = op_kernel_ptr.get(); auto op_kernel = op_kernel_ptr.get();
if (op_kernel == nullptr) { if (op_kernel == nullptr) {
MS_LOG(ERROR) << "op_kernel create error."; MS_LOG(ERROR) << "op_kernel create error.";
@ -89,12 +111,13 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
return; return;
} }
pGraph->Init(); pGraph->Init();
memcpy(inputs[0]->MutableData(), input_data, ci * dtype_size); memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size);
pGraph->Run(); pGraph->Run();
if (enable_fp16) { if (enable_fp16) {
CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast<float16_t>(1e-3), 2e-2); CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
} else { } else {
CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast<float>(1e-5)); CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
} }
tensor_x->SetData(nullptr); tensor_x->SetData(nullptr);
@ -125,7 +148,7 @@ void RunTestCaseMatMul(const std::vector<int> shape, const std::vector<std::stri
MS_LOG(ERROR) << "output_data load error."; MS_LOG(ERROR) << "output_data load error.";
return; return;
} }
RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16); RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16, 2);
} }
TEST_F(TestMatMulOpenCL, MatMulFp32) { TEST_F(TestMatMulOpenCL, MatMulFp32) {
@ -156,7 +179,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp32_2) {
std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
std::vector<float> output_data = {10.f, 10.f, 10.f}; std::vector<float> output_data = {10.f, 10.f, 10.f};
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false); RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2);
} }
TEST_F(TestMatMulOpenCL, MatMulFp16_2) { TEST_F(TestMatMulOpenCL, MatMulFp16_2) {
@ -167,6 +190,40 @@ TEST_F(TestMatMulOpenCL, MatMulFp16_2) {
std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
std::vector<float16_t> output_data = {10.f, 10.f, 10.f}; std::vector<float16_t> output_data = {10.f, 10.f, 10.f};
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true); RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2);
}
TEST_F(TestMatMulOpenCL, MatMulFp32_4D) {
int a = 1;
int b = 2;
int c = 2;
int ci = 5;
int co = 3;
std::vector<int> shape = {a, b, c, ci, co};
std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
std::vector<float> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f};
std::vector<float> output_data = {15.0f, 40.0f, 65.0f, 15.0f, 40.0f, 65.0f,
90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f};
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4);
}
TEST_F(TestMatMulOpenCL, MatMulFp16_4D) {
int a = 1;
int b = 2;
int c = 2;
int ci = 5;
int co = 3;
std::vector<int> shape = {a, b, c, ci, co};
std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
std::vector<float16_t> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f};
std::vector<float16_t> output_data = {15.0f, 40.0f, 65.0f, 15.0f, 40.0f, 65.0f,
90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f};
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 4);
} }
} // namespace mindspore } // namespace mindspore

@ -0,0 +1,156 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <memory>
#include "mindspore/core/utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h"
#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
namespace mindspore {
class TestReduceOpenCL : public mindspore::CommonTest {
public:
TestReduceOpenCL() {}
};
void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
int reduce_mode) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
auto allocator = ocl_runtime->GetAllocator();
auto param_ptr = std::make_unique<ReduceParameter>();
auto param = param_ptr.get();
if (param == nullptr) {
MS_LOG(ERROR) << "param_ptr create error.";
return;
}
param->axes_[0] = 1;
param->axes_[1] = 2;
param->num_axes_ = 2;
param->mode_ = reduce_mode;
int n = shape[0];
int h = shape[1];
int w = shape[2];
int c = shape[3];
std::vector<int> input_shape = {n, h, w, c};
auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
input_shape, schema::Format_NHWC);
auto tensor_x = tensor_x_ptr.get();
if (tensor_x == nullptr) {
MS_LOG(ERROR) << "tensor_x create error.";
return;
}
std::vector<int> out_shape = {n, c};
auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
out_shape, schema::Format_NC);
auto tensor_out = tensor_out_ptr.get();
if (tensor_out == nullptr) {
MS_LOG(ERROR) << "tensor_out create error.";
return;
}
std::vector<lite::Tensor *> inputs{tensor_x};
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr =
std::make_unique<kernel::ReduceOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
}
arith_kernel->Init();
inputs[0]->MallocData(allocator);
std::vector<kernel::LiteKernel *> kernels{arith_kernel};
auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
auto pGraph = pGraph_ptr.get();
if (pGraph == nullptr) {
MS_LOG(ERROR) << "pGraph create error.";
return;
}
pGraph->Init();
memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
pGraph->Run();
if (enable_fp16) {
CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
} else {
CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
}
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
MS_LOG(INFO) << "Test Reduce passed";
lite::opencl::OpenCLRuntime::DeleteInstance();
}
TEST_F(TestReduceOpenCL, ReduceMeanFp32) {
int n = 1;
int h = 2;
int w = 2;
int c = 3;
std::vector<int> shape = {n, h, w, c};
std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
std::vector<float> output_data = {4.5f, 5.5f, 6.5f};
RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean);
}
TEST_F(TestReduceOpenCL, ReduceMeanFp16) {
int n = 1;
int h = 2;
int w = 2;
int c = 3;
std::vector<int> shape = {n, h, w, c};
std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
std::vector<float16_t> output_data = {4.5f, 5.5f, 6.5f};
RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean);
}
TEST_F(TestReduceOpenCL, ReduceSumFp32) {
int n = 1;
int h = 2;
int w = 2;
int c = 3;
std::vector<int> shape = {n, h, w, c};
std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
std::vector<float> output_data = {18.0f, 22.0f, 26.0f};
RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum);
}
TEST_F(TestReduceOpenCL, ReduceSumFp16) {
int n = 1;
int h = 2;
int w = 2;
int c = 3;
std::vector<int> shape = {n, h, w, c};
std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
std::vector<float16_t> output_data = {18.0f, 22.0f, 26.0f};
RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
}
} // namespace mindspore
Loading…
Cancel
Save