!5898 [MS][LITE][GPU]add reduce op and batchmatmul op
Merge pull request !5898 from chenzupeng/master-lite
commit ad37b6845f
@@ -1,57 +1,146 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define C4NUM 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                           __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
-  int2 gid = (int2)(get_global_id(0), get_global_id(1));
-  int2 lid = (int2)(get_local_id(0), get_local_id(1));
+__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                              __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int co4 = UP_DIV(out_shape.w, C4NUM);
+  int n = out_shape.z;
+  bool inside = gidx < co4 && gidz < n;
   FLT4 result = (FLT4)(0.0f);
-  bool inside = gid.x < offset_co.y;
-  for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0));
-    FLT16 w = weight[gid.x + i * offset_co.y];
+  for (uint i = lidy; i < ci4 && inside; i += 4) {
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, gidz));
+    FLT16 w = weight[i * co4 + gidx];
     result.x += dot(v, w.s0123);
     result.y += dot(v, w.s4567);
     result.z += dot(v, w.s89ab);
     result.w += dot(v, w.scdef);
   }
-  __local FLT4 temp[64][4];
-  temp[lid.x][lid.y] = result;
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
   barrier(CLK_LOCAL_MEM_FENCE);
-  if (lid.y == 0 && inside) {
-    result += temp[lid.x][1];
-    result += temp[lid.x][2];
-    result += temp[lid.x][3];
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
     if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
+      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
     }
-    WRITE_IMAGE(output, (int2)(gid.x, 0), result);
+    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
   }
 }
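For reference, a hedged scalar sketch (plain C++, illustrative names, not part of the PR) of the value one MatMul_NHWC4_2d output quad accumulates. The kernel splits the i-loop across four lidy lanes and merges the partial sums through __local memory after the barrier; lane 0 then adds bias and writes the result. The FLT16 tile packing (s0123 feeding output channel 0 of the quad, and so on) is an assumption read off the dot() usage above.

// Hedged scalar sketch of one MatMul_NHWC4_2d output quad.
#include <array>
#include <vector>

using Flt4 = std::array<float, 4>;
using Flt16 = std::array<float, 16>;  // one 4(ci) x 4(co) weight tile; first 4 floats ~ w.s0123

Flt4 MatMulQuad(const std::vector<Flt4> &input_quads,  // the CI4 input quads of one row
                const std::vector<Flt16> &weight,      // CI4 * CO4 tiles, indexed i * co4 + co_quad
                int co_quad, int co4) {
  Flt4 acc{0.f, 0.f, 0.f, 0.f};
  const int ci4 = static_cast<int>(input_quads.size());
  for (int i = 0; i < ci4; ++i) {
    const Flt4 &v = input_quads[i];
    const Flt16 &w = weight[i * co4 + co_quad];
    for (int o = 0; o < 4; ++o) {    // result.x / .y / .z / .w
      for (int k = 0; k < 4; ++k) {  // dot(v, w.sXXXX)
        acc[o] += v[k] * w[o * 4 + k];
      }
    }
  }
  return acc;  // the kernel computes this split over 4 lanes, merged via __local FLT4 temp[32][4]
}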
 
-__kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                            __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) {
-  int2 gid = (int2)(get_global_id(0), get_global_id(1));
-  int2 lid = (int2)(get_local_id(0), get_local_id(1));
+__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                               __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int co4 = UP_DIV(out_shape.w, C4NUM);
+  int n = out_shape.z;
+  bool inside = gidx < co4 && gidz < n;
   FLT4 result = (FLT4)(0.0f);
-  bool inside = gid.x < offset_co.y;
-  for (uint i = lid.y; i < offset_ci.y && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i));
-    FLT16 w = weight[gid.x + i * offset_co.y];
+  for (uint i = lidy; i < ci4 && inside; i += 4) {
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0));
+    FLT16 w = weight[i * co4 + gidx];
     result.x += dot(v, w.s0123);
     result.y += dot(v, w.s4567);
     result.z += dot(v, w.s89ab);
     result.w += dot(v, w.scdef);
   }
-  __local FLT4 temp[64][4];
-  temp[lid.x][lid.y] = result;
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
   barrier(CLK_LOCAL_MEM_FENCE);
-  if (lid.y == 0 && inside) {
-    result += temp[lid.x][1];
-    result += temp[lid.x][2];
-    result += temp[lid.x][3];
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0));
+      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
    }
-    WRITE_IMAGE(output, (int2)(0, gid.x), result);
+    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result);
  }
 }
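The NC4HW4 variant performs the same accumulation; only the image addressing changes. A minimal sketch of the two coordinate conventions for a rank-2 (N x C) tensor, read directly off the READ_IMAGE/WRITE_IMAGE calls above (names are illustrative):

// NHWC4 keeps channel quads along x and the batch row along y;
// NC4HW4 linearizes batch and channel quad into x with y fixed at 0.
struct Int2 {
  int x, y;
};

Int2 Nhwc4Coord(int batch, int ci_quad) { return {ci_quad, batch}; }                      // (i, gidz)
Int2 Nc4hw4Coord(int batch, int ci_quad, int ci4) { return {batch * ci4 + ci_quad, 0}; }  // (gidz * ci4 + i, 0)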
 
+__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                              __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+  int gidx = get_global_id(0);  // CO4
+  int gidy = get_global_id(1);  // N * H * 4
+  int gidz = get_global_id(2);  // W
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int co4 = UP_DIV(out_shape.w, C4NUM);
+  int n = out_shape.x;
+  int h = out_shape.y;
+  int w = out_shape.z;
+  int nh_index = gidy / 4;
+  bool inside = gidx < co4 && gidz < w && nh_index < n * h;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < ci4 && inside; i += 4) {
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, nh_index));
+    FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx];
+    result.x += dot(v, weight_value.s0123);
+    result.y += dot(v, weight_value.s4567);
+    result.z += dot(v, weight_value.s89ab);
+    result.w += dot(v, weight_value.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    if (has_bias != 0) {
+      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    }
+    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
+  }
+}
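The _4d kernels are the batched matmul path this PR adds: each (n, h) pair is an independent matrix product over the W rows, with its own ci4 * co4 block of weight tiles (note the nh_index * ci4 * co4 term in the weight index). A hedged plain-float reference of that semantics, with illustrative names:

// Hedged reference for the batched matmul semantics of the _4d kernels.
#include <vector>

void BatchMatMulRef(const std::vector<float> &in,      // [N*H][W][CI]
                    const std::vector<float> &weight,  // [N*H][CI][CO], one matrix per (n, h) batch
                    std::vector<float> *out,           // [N*H][W][CO]
                    int nh, int w, int ci, int co) {
  for (int b = 0; b < nh; ++b)
    for (int x = 0; x < w; ++x)
      for (int oc = 0; oc < co; ++oc) {
        float acc = 0.f;
        for (int ic = 0; ic < ci; ++ic)
          acc += in[(b * w + x) * ci + ic] * weight[(b * ci + ic) * co + oc];
        (*out)[(b * w + x) * co + oc] = acc;
      }
}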
 
+__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                               __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+  int gidx = get_global_id(0);  // CO4
+  int gidy = get_global_id(1);  // N * H * 4
+  int gidz = get_global_id(2);  // W
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int co4 = UP_DIV(out_shape.w, C4NUM);
+  int n = out_shape.x;
+  int h = out_shape.y;
+  int w = out_shape.z;
+  int nh_index = gidy / 4;
+  bool inside = gidx < co4 && gidz < w && nh_index < n * h;
+  int n_index = nh_index / h;
+  int h_index = nh_index % h;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < ci4 && inside; i += 4) {
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index));
+    FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx];
+    result.x += dot(v, weight_value.s0123);
+    result.y += dot(v, weight_value.s4567);
+    result.z += dot(v, weight_value.s89ab);
+    result.w += dot(v, weight_value.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    if (has_bias != 0) {
+      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    }
+    WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result);
+  }
+}
@@ -0,0 +1,61 @@
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+__kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // C4
+  if (X >= size.z) {
+    return;
+  }
+  FLT4 result = (FLT4)0.f;
+  for (int h = 0; h < size.x; h++) {
+    for (int w = 0; w < size.y; w++) {
+      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
+    }
+  }
+  result /= size.x * size.y;
+  WRITE_IMAGE(dst_data, (int2)(X, 0), result);
+}
+
+__kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // C4
+  if (X >= size.z) {
+    return;
+  }
+  FLT4 result = (FLT4)0.f;
+  for (int h = 0; h < size.x; h++) {
+    for (int w = 0; w < size.y; w++) {
+      result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h));
+    }
+  }
+  result /= size.x * size.y;
+  WRITE_IMAGE(dst_data, (int2)(0, X), result);
+}
+
+__kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // C4
+  if (X >= size.z) {
+    return;
+  }
+  FLT4 result = (FLT4)0.f;
+  for (int h = 0; h < size.x; h++) {
+    for (int w = 0; w < size.y; w++) {
+      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
+    }
+  }
+  WRITE_IMAGE(dst_data, (int2)(X, 0), result);
+}
+
+__kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
+  int X = get_global_id(0);  // C4
+  if (X >= size.z) {
+    return;
+  }
+  FLT4 result = (FLT4)0.f;
+  for (int h = 0; h < size.x; h++) {
+    for (int w = 0; w < size.y; w++) {
+      result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h));
+    }
+  }
+  WRITE_IMAGE(dst_data, (int2)(0, X), result);
+}
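The reduce kernels assign one work-item per channel quad X and loop over every H x W position; the host passes size = {H, W, C4, 1} (see Run() in reduce.cc below). A hedged scalar reference of mean/sum over the spatial axes, written against a plain NHWC buffer with illustrative names:

// Hedged scalar reference for mean_*/sum_*: per-channel reduction over H and W.
#include <vector>

void ReduceHWRef(const std::vector<float> &src,  // NHWC layout, N == 1
                 std::vector<float> *dst,        // one value per channel
                 int h, int w, int c, bool mean) {
  for (int ch = 0; ch < c; ++ch) {
    float acc = 0.f;
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
        acc += src[(y * w + x) * c + ch];
    (*dst)[ch] = mean ? acc / (h * w) : acc;  // mean_* divides by H * W, sum_* does not
  }
}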
@@ -0,0 +1,166 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+#include <string>
+#include <map>
+#include "include/errorcode.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/opencl/opencl_runtime.h"
+#include "src/runtime/kernel/opencl/kernel/reduce.h"
+#include "src/runtime/kernel/opencl/cl/reduce.cl.inc"
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
+using mindspore::lite::RET_OK;
+using mindspore::lite::RET_PARAM_INVALID;
+using mindspore::schema::PrimitiveType_Mean;
+using mindspore::schema::PrimitiveType_Reduce;
+using mindspore::schema::ReduceMode;
+using mindspore::schema::ReduceMode_ReduceMax;
+using mindspore::schema::ReduceMode_ReduceMean;
+using mindspore::schema::ReduceMode_ReduceMin;
+using mindspore::schema::ReduceMode_ReduceProd;
+using mindspore::schema::ReduceMode_ReduceSum;
+using mindspore::schema::ReduceMode_ReduceSumSquare;
+
+namespace mindspore::kernel {
+
+int ReduceOpenCLKernel::Init() {
+  InitNHWCShape();
+  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
+  if (reduce_param == nullptr) {
+    return RET_NULL_PTR;
+  }
+  std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}};
+  if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) {
+    MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
+    return RET_PARAM_INVALID;
+  }
+  if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) &&
+                                       (reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) {
+    MS_LOG(ERROR) << "reduce op only support axes HW";
+    return RET_PARAM_INVALID;
+  }
+  std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  enable_fp16_ = ocl_runtime->GetFp16Enable();
+
+  if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
+    MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
+                  << out_tensors_[0]->shape().back();
+    return RET_ERROR;
+  }
+#ifdef PROGRAM_WITH_IL
+  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
+#else
+  std::set<std::string> build_options;
+  std::string source = reduce_source;
+  ocl_runtime->LoadSource(kernel_name, source);
+  ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options);
+#endif
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
+  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+  return RET_OK;
+}
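Note that Init() composes the OpenCL kernel name from the reduce mode and the operator format: ReduceMode_ReduceMean under Format_NHWC4 selects mean_NHWC4, and ReduceMode_ReduceSum under Format_NC4HW4 selects sum_NC4HW4, matching the kernel names defined in reduce.cl above.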
+
+void ReduceOpenCLKernel::InitNHWCShape() {
+  std::vector<int> shapex = out_tensors_[0]->shape();
+  size_t n = 1, h = 1, w = 1, c = 1;
+  if (shapex.size() == 2) {
+    n = shapex[0];
+    c = shapex[1];
+  } else if (shapex.size() == 4) {
+    n = shapex[0];
+    h = shapex[1];
+    w = shapex[2];
+    c = shapex[3];
+  }
+  nhwc_shape_ = {n, h, w, c};
+}
+
+int ReduceOpenCLKernel::ReSize() { return RET_OK; }
+
+int ReduceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+  size_t im_dst_x, im_dst_y;
+
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM);
+    im_dst_y = nhwc_shape_[0] * nhwc_shape_[1];
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = nhwc_shape_[2];
+    im_dst_y = nhwc_shape_[0] * UP_DIV(nhwc_shape_[3], C4NUM) * nhwc_shape_[1];
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
+  size_t img_dtype = CL_FLOAT;
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
+  img_size->clear();
+  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
+  *img_size = vec;
+  return RET_OK;
+}
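As a worked check of GetImageSize against the test shape used below: for {N, H, W, C} = {1, 2, 2, 3}, UP_DIV(3, 4) is 1, so both formats produce a 2 x 2 image. A self-contained sketch, assuming UP_DIV's definition from matmul.cl:

#include <cassert>
#include <cstddef>

constexpr size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }

int main() {
  const size_t n = 1, h = 2, w = 2, c = 3, c4 = UpDiv(c, 4);  // c4 == 1
  assert(w * c4 == 2 && n * h == 2);  // NHWC4:  x = W * C4, y = N * H
  assert(w == 2 && n * c4 * h == 2);  // NC4HW4: x = W,      y = N * C4 * H
  return 0;
}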
+
+int ReduceOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  std::vector<int> shapex = in_tensors_[0]->shape();
+  int h = shapex[1];
+  int w = shapex[2];
+  int c = shapex[3];
+  int c4 = UP_DIV(c, C4NUM);
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  std::vector<size_t> local = {};
+  std::vector<size_t> global = {static_cast<size_t>(c4)};
+  cl_int4 size = {h, w, c4, 1};
+  int arg_idx = 0;
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->MutableData());
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData());
+  ocl_runtime->SetKernelArg(kernel_, arg_idx++, size);
+  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
+  return RET_OK;
+}
+
+kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
+                                              const lite::Context *ctx, const kernel::KernelKey &desc,
+                                              const mindspore::lite::PrimitiveC *primitive) {
+  auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator)
+}  // namespace mindspore::kernel
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_
+
+#include <vector>
+
+#include "src/lite_kernel.h"
+#include "src/runtime/opencl/opencl_runtime.h"
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+#include "nnacl/reduce_parameter.h"
+
+namespace mindspore::kernel {
+class ReduceOpenCLKernel : public OpenCLKernel {
+ public:
+  explicit ReduceOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                              const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs) {}
+  ~ReduceOpenCLKernel() override{};
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
+  void InitNHWCShape();
+
+ private:
+  cl::Kernel kernel_;
+  bool enable_fp16_{false};
+  std::vector<size_t> nhwc_shape_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_
@@ -0,0 +1,156 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <memory>
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/common/file_utils.h"
+#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h"
+#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+
+namespace mindspore {
+class TestReduceOpenCL : public mindspore::CommonTest {
+ public:
+  TestReduceOpenCL() {}
+};
+
+void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
+                       int reduce_mode) {
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  ocl_runtime->Init();
+  size_t dtype_size = sizeof(float);
+  if (enable_fp16) {
+    ocl_runtime->SetFp16Enable(true);
+    dtype_size = sizeof(float16_t);
+  }
+  auto allocator = ocl_runtime->GetAllocator();
+  auto param_ptr = std::make_unique<ReduceParameter>();
+  auto param = param_ptr.get();
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "param_ptr create error.";
+    return;
+  }
+  param->axes_[0] = 1;
+  param->axes_[1] = 2;
+  param->num_axes_ = 2;
+  param->mode_ = reduce_mode;
+  int n = shape[0];
+  int h = shape[1];
+  int w = shape[2];
+  int c = shape[3];
+  std::vector<int> input_shape = {n, h, w, c};
+  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                     input_shape, schema::Format_NHWC);
+  auto tensor_x = tensor_x_ptr.get();
+  if (tensor_x == nullptr) {
+    MS_LOG(ERROR) << "tensor_x create error.";
+    return;
+  }
+  std::vector<int> out_shape = {n, c};
+  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                       out_shape, schema::Format_NC);
+  auto tensor_out = tensor_out_ptr.get();
+  if (tensor_out == nullptr) {
+    MS_LOG(ERROR) << "tensor_out create error.";
+    return;
+  }
+  std::vector<lite::Tensor *> inputs{tensor_x};
+  std::vector<lite::Tensor *> outputs{tensor_out};
+  auto arith_kernel_ptr =
+    std::make_unique<kernel::ReduceOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  auto arith_kernel = arith_kernel_ptr.get();
+  if (arith_kernel == nullptr) {
+    MS_LOG(ERROR) << "arith_kernel create error.";
+    return;
+  }
+  arith_kernel->Init();
+
+  inputs[0]->MallocData(allocator);
+
+  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
+  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
+  auto pGraph = pGraph_ptr.get();
+  if (pGraph == nullptr) {
+    MS_LOG(ERROR) << "pGraph create error.";
+    return;
+  }
+  pGraph->Init();
+  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
+  pGraph->Run();
+
+  if (enable_fp16) {
+    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
+                  2e-2);
+  } else {
+    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
+  }
+  inputs[0]->SetData(nullptr);
+  outputs[0]->SetData(nullptr);
+
+  MS_LOG(INFO) << "Test Reduce passed";
+  lite::opencl::OpenCLRuntime::DeleteInstance();
+}
+
+TEST_F(TestReduceOpenCL, ReduceMeanFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  std::vector<int> shape = {n, h, w, c};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {4.5f, 5.5f, 6.5f};
+
+  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean);
+}
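A quick sanity check of the expected values: with NHWC input 0..11 of shape 1 x 2 x 2 x 3, channel 0 sees {0, 3, 6, 9} (mean 4.5, sum 18), channel 1 sees {1, 4, 7, 10} (mean 5.5, sum 22), and channel 2 sees {2, 5, 8, 11} (mean 6.5, sum 26), matching the output_data of the mean test above and of the sum tests below.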
+
+TEST_F(TestReduceOpenCL, ReduceMeanFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  std::vector<int> shape = {n, h, w, c};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float16_t> output_data = {4.5f, 5.5f, 6.5f};
+
+  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean);
+}
+
+TEST_F(TestReduceOpenCL, ReduceSumFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  std::vector<int> shape = {n, h, w, c};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {18.0f, 22.0f, 26.0f};
+
+  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum);
+}
+
+TEST_F(TestReduceOpenCL, ReduceSumFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  std::vector<int> shape = {n, h, w, c};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float16_t> output_data = {18.0f, 22.0f, 26.0f};
+
+  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
+}
+}  // namespace mindspore