opencl_strided_slice

pull/8383/head
wangdongxu 4 years ago
parent c324fec6d4
commit c94563c06b

@@ -1,146 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define INT2 int2
#define INT4 int4
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void slice_NHWC4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
INT4 begin, INT2 sharedNoUpdiv) {
int X = get_global_id(1); // H
int Y = get_global_id(2); // W
if (X >= out_shape.y || Y >= out_shape.z) {
return;
}
FLT4 result;
if (sharedNoUpdiv.x % 4 == 0) {
for (int i = 0; i < out_shape.w; i++) {
result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (i + begin.w), (X + begin.y)));
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i, (X)), result);
}
} else {
int begin_position = sharedNoUpdiv.x % 4;
FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + begin.w, (X + begin.y)));
if (begin_position == 1) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
result.x = first.y;
result.y = first.z;
result.z = first.w;
result.w = second.x;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
first.y = second.y;
first.z = second.z;
first.w = second.w;
}
} else if (begin_position == 2) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
result.x = first.z;
result.y = first.w;
result.z = second.x;
result.w = second.y;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
first.z = second.z;
first.w = second.w;
}
} else {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
result.x = first.w;
result.y = second.x;
result.z = second.y;
result.w = second.z;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
first.w = second.w;
}
}
}
// write the last output vector, zero-filling the channel remainder
int size = sharedNoUpdiv.y % 4;
FLT4 result_fill0;
if (size == 1) {
result_fill0.x = result.x;
result_fill0.y = 0;
result_fill0.z = 0;
result_fill0.w = 0;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
} else if (size == 2) {
result_fill0.x = result.x;
result_fill0.y = result.y;
result_fill0.z = 0;
result_fill0.w = 0;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
} else if (size == 3) {
result_fill0.x = result.x;
result_fill0.y = result.y;
result_fill0.z = result.z;
result_fill0.w = 0;
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
}
}
__kernel void slice_NC4HW4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
INT4 begin, INT2 sharedNoUpdiv) {
int X = get_global_id(1); // H
int Y = get_global_id(2); // W
if (X >= out_shape.y || Y >= out_shape.z) {
return;
}
FLT4 result;
if (sharedNoUpdiv.x % 4 == 0) {
for (int i = 0; i < out_shape.w; i++) {
result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
WRITE_IMAGE(output, (INT2)((Y), (i * out_shape.y + X)), result);
}
} else {
int begin_position = sharedNoUpdiv.x % 4;
FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (begin.w) * input_shape.y + (X + begin.y)));
if (begin_position == 1) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
result.x = first.y;
result.y = first.z;
result.z = first.w;
result.w = second.x;
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
first.y = second.y;
first.z = second.z;
first.w = second.w;
}
} else if (begin_position == 2) {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
result.x = first.z;
result.y = first.w;
result.z = second.x;
result.w = second.y;
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
first.z = second.z;
first.w = second.w;
}
} else {
for (int i = 1; i <= out_shape.w; i++) {
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
result.x = first.w;
result.y = second.x;
result.z = second.y;
result.w = second.z;
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
first.w = second.w;
}
}
}
// write the last output vector, zero-filling the channel remainder
int size = sharedNoUpdiv.y % 4;
FLT4 result_fill0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
if (size == 1) {
result_fill0.x = result.x;
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
} else if (size == 2) {
result_fill0.x = result.x;
result_fill0.y = result.y;
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
} else if (size == 3) {
result_fill0.x = result.x;
result_fill0.y = result.y;
result_fill0.z = result.z;
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
}
}
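A note on the deleted kernels above: when the slice's channel begin is not a multiple of 4, each output FLT4 must be stitched from two adjacent packed input vectors, shifted by begin % 4 lanes; that is what the first/second shuffles implement. A minimal standalone host-side model of the realignment (C++, values illustrative):

#include <array>
#include <cstdio>

// Host-side model of the deleted kernels' channel realignment: with
// begin_position = begin.c % 4, output lane k comes from packed input
// lane begin_position + k, which may span two adjacent FLT4 vectors.
int main() {
  std::array<float, 8> in = {0, 1, 2, 3, 4, 5, 6, 7};  // "first" and "second" FLT4
  int begin_position = 1;                              // begin.c % 4
  float out[4];
  for (int lane = 0; lane < 4; ++lane) {
    out[lane] = in[begin_position + lane];  // {first.y, first.z, first.w, second.x}
  }
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 1 2 3 4
  return 0;
}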

@@ -0,0 +1,59 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__kernel void strided_slice(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape,
int4 output_shape, int2 io_slices, int4 begin, int4 stride, int4 size) {
int IN = input_shape.x, IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
int ON = output_shape.x, OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
int CI_SLICES = io_slices.x, CO_SLICES = io_slices.y;
int on_oh = get_global_id(0);
int ow = get_global_id(1);
int co_slice = get_global_id(2);
int on = on_oh / OH;
int oh = on_oh % OH;
if (on >= ON || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
return;
}
FLT tmp[4];
for (int i = 0; i < 4; ++i) {
// map the output_shape index to a size index; valid because squeeze(output_shape) == squeeze(size)
// (a host-side sketch of this decomposition follows the kernel)
// for example:
// python code: B = A[1, 1:16, 2:16, 3:16]
// input_shape = [16, 16, 16, 16]
// begin = [ 1, 1, 2, 3]
// end = [ 2, 16, 16, 16]
// stride = [ 1, 1, 1, 1]
// size = [ 1, 15, 14, 13] = ceil((end - begin) / stride)
// output_shape = [ 15, 14, 13]
int idx = ((on * OH + oh) * OW + ow) * CO + co_slice * 4 + i;
int co_ = idx % size.w;
idx /= size.w;
int ow_ = idx % size.z;
idx /= size.z;
int oh_ = idx % size.y;
idx /= size.y;
int on_ = idx;
int in = begin.x + stride.x * on_;
int ih = begin.y + stride.y * oh_;
int iw = begin.z + stride.z * ow_;
int ci = begin.w + stride.w * co_;
FLT4 src = READ_IMAGE(input, smp_none, (int2)(iw * CI_SLICES + ci / 4, in * IH + ih));
int offset = ci % 4;
if (offset == 0) {
tmp[i] = src.x;
} else if (offset == 1) {
tmp[i] = src.y;
} else if (offset == 2) {
tmp[i] = src.z;
} else {
tmp[i] = src.w;
}
}
FLT4 out = (FLT4)(tmp[0], tmp[1], tmp[2], tmp[3]);
WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), out);
}
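The decomposition in the loop above can be mirrored on the host for verification. A standalone C++ sketch using the example from the kernel comment (the chosen output coordinate is illustrative):

#include <cstdio>

// Mirrors the kernel's unflattening of a flat output index against `size`,
// then maps it back to input coordinates via begin/stride.
int main() {
  const int begin[4] = {1, 1, 2, 3};
  const int stride[4] = {1, 1, 1, 1};
  const int size[4] = {1, 15, 14, 13};  // ceil((end - begin) / stride)

  int idx = (0 * 15 + 5) * 14 * 13 + 7 * 13 + 9;  // output coordinate (0, 5, 7, 9)
  int co = idx % size[3]; idx /= size[3];
  int ow = idx % size[2]; idx /= size[2];
  int oh = idx % size[1]; idx /= size[1];
  int on = idx;

  // The kernel reads the input at these coordinates before repacking.
  printf("in=%d ih=%d iw=%d ci=%d\n",
         begin[0] + stride[0] * on, begin[1] + stride[1] * oh,
         begin[2] + stride[2] * ow, begin[3] + stride[3] * co);
  // prints: in=1 ih=6 iw=9 ci=12
  return 0;
}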

@@ -1,106 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Slice;
namespace mindspore::kernel {
int SliceOpenCLKernel::Init() {
std::set<std::string> build_options;
std::string source = slice_source;
std::string program_name = "slice";
std::string kernel_name = "slice_NHWC4";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
void SliceGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
local->push_back(x);
local->push_back(y);
local->push_back(z);
}
int SliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
auto input_shape = in_tensors_[0]->shape();
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
cl_int4 size_ = {param->size_[0], param->size_[1], param->size_[2], UP_DIV(param->size_[3], C4NUM)};
cl_int4 begin_ = {param->begin_[0], param->begin_[1], param->begin_[2], param->begin_[3] / 4};
cl_int2 sharedNoUpdiv = {param->begin_[3], param->size_[3]};
uint32_t OH = param->size_[1];
uint32_t OW = param->size_[2];
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {1, OH, OW};
SliceGetWorkGroup(global, &local, max_global[0]);
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, size_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) SliceOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << " new SliceOpenCLKernel failed ";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << " Init kernel failed, name: Slice ";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
} // namespace mindspore::kernel

@@ -0,0 +1,192 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
#include <cmath>     // std::ceil
#include <deque>
#include <iterator>  // std::back_inserter
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/kernel/strided_slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/strided_slice.cl.inc"
#include "nnacl/strided_slice.h"
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Slice;
using mindspore::schema::PrimitiveType_StridedSlice;
namespace mindspore::kernel {
int SliceOpenCLKernel::CheckSpecs() {
const std::string kernel_name = op_parameter_->type_ == PrimitiveType_Slice ? "Slice" : "StridedSlice";
if (in_tensors_.size() != 1) {
MS_LOG(ERROR) << kernel_name + " only supports 1 input Tensor.";
return RET_ERROR;
}
if (out_tensors_.size() != 1) {
MS_LOG(ERROR) << kernel_name + " only supports 1 output Tensor.";
return RET_ERROR;
}
auto in_ndim = in_tensors_.front()->shape().size();
if (in_ndim == 0 || in_ndim > 4) {
MS_LOG(ERROR) << kernel_name + " only supports 1D-4D input tensor";
return RET_ERROR;
}
auto out_ndim = out_tensors_.front()->shape().size();
if (out_ndim > 4) {
MS_LOG(ERROR) << kernel_name + " only supports 0D-4D output tensor";
return RET_ERROR;
}
if (InitConstArgs() != RET_OK) {
MS_LOG(ERROR) << "call SliceOpenCLKernel::InitConstArgs() failed";
return RET_ERROR;
}
return RET_OK;
}
int SliceOpenCLKernel::Prepare() {
std::set<std::string> build_options;
std::string program_name = "strided_slice";
ocl_runtime_->LoadSource(program_name, strided_slice_source);
ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options);
SetConstArgs();
SetGlobalLocal();
return RET_OK;
}
int SliceOpenCLKernel::InitConstArgs() {
auto input_info = Image2DInfo(in_tensors_.front());
auto output_info = Image2DInfo(out_tensors_.front());
input_shape_ = {static_cast<cl_int>(input_info.N), static_cast<cl_int>(input_info.H),
static_cast<cl_int>(input_info.W), static_cast<cl_int>(input_info.C)};
output_shape_ = {static_cast<cl_int>(output_info.N), static_cast<cl_int>(output_info.H),
static_cast<cl_int>(output_info.W), static_cast<cl_int>(output_info.C)};
io_slices_ = {static_cast<cl_int>(input_info.Slice), static_cast<cl_int>(output_info.Slice)};
if (op_parameter_->type_ == PrimitiveType_Slice) {
auto param = reinterpret_cast<SliceParameter *>(op_parameter_);
Broadcast2GpuShape(param->begin_, begin_.s, param->param_length_, 0);
Broadcast2GpuShape(param->size_, size_.s, param->param_length_, -1);
for (int i = 0; i < 4; ++i) {
if (begin_.s[i] < 0) {
begin_.s[i] += input_shape_.s[i];
}
if (begin_.s[i] < 0 || begin_.s[i] >= input_shape_.s[i]) {
MS_LOG(ERROR) << "Slice kernel only supports 0<=begin<input_shape but begin[i]=" << begin_.s[i]
<< " input_shape[i]=" << input_shape_.s[i];
return RET_ERROR;
}
if (size_.s[i] < -1 || size_.s[i] == 0) {
MS_LOG(ERROR) << "Slice kernel only supports size=-1 or size>0 but size[i]=" << size_.s[i];
return RET_ERROR;
}
if (size_.s[i] == -1 || begin_.s[i] + size_.s[i] > input_shape_.s[i]) {
size_.s[i] = input_shape_.s[i] - begin_.s[i];
}
}
} else {
auto param = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
cl_int4 end = input_shape_;
Broadcast2GpuShape(param->begins_, begin_.s, param->num_axes_, 0);
Broadcast2GpuShape(param->strides_, stride_.s, param->num_axes_, 1);
Broadcast2GpuShape(param->ends_, end.s, param->num_axes_);
for (int i = 0; i < 4; ++i) {
// wrap a negative begin
if (begin_.s[i] < 0) {
begin_.s[i] += input_shape_.s[i];
}
// clamp begin into the valid index range
begin_.s[i] = std::clamp(begin_.s[i], 0, input_shape_.s[i] - 1);
// wrap a negative end
if (end.s[i] < 0) {
end.s[i] += input_shape_.s[i];
}
// clamp end into range; -1 allows a negative stride to walk past index 0
end.s[i] = std::clamp(end.s[i], -1, input_shape_.s[i]);
// validate stride against begin/end
if (stride_.s[i] > 0) {
if (begin_.s[i] >= end.s[i]) {
MS_LOG(ERROR) << "StridedSlice kernel only supports begin_<end when stride>0";
return RET_ERROR;
}
} else if (stride_.s[i] < 0) {
if (begin_.s[i] <= end.s[i]) {
MS_LOG(ERROR) << "StridedSlice kernel only supports begin_>end when stride<0";
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "StridedSlice kernel only supports stride!=0";
return RET_ERROR;
}
size_.s[i] = std::ceil(static_cast<float>(end.s[i] - begin_.s[i]) / static_cast<float>(stride_.s[i]));
}
}
// check that the inferred size matches the output shape (ignoring 1-sized dims)
std::vector<int> shape_not_1;
std::vector<int> size_not_1;
auto output_shape = out_tensors_.front()->shape();
std::copy_if(output_shape.begin(), output_shape.end(), std::back_inserter(shape_not_1),
[](int x) { return x > 1; });
std::copy_if(size_.s, size_.s + 4, std::back_inserter(size_not_1), [](int x) { return x > 1; });
if (shape_not_1 != size_not_1) {
MS_LOG(ERROR) << "Slice/StridedSlice kernel output shape infer error";
return RET_ERROR;
}
return RET_OK;
}
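A quick standalone check of the size formula used above, size = ceil((end - begin) / stride), after the clamping rules (values illustrative):

#include <cassert>
#include <cmath>

int main() {
  // Positive stride: begin=3, end=16, stride=2 selects 3,5,7,9,11,13,15.
  int size = static_cast<int>(std::ceil((16 - 3) / 2.0f));
  assert(size == 7);

  // Negative stride: end may be clamped to -1 so the walk can pass index 0;
  // begin=15, end=-1, stride=-2 selects 15,13,11,9,7,5,3,1.
  size = static_cast<int>(std::ceil((-1 - 15) / -2.0f));
  assert(size == 8);
  return 0;
}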
void SliceOpenCLKernel::SetConstArgs() {
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
}
void SliceOpenCLKernel::SetGlobalLocal() {
auto output_info = Image2DInfo(out_tensors_.front());
std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice};
const int max_divider = 8;
auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
size_t local_hw = max_work_group_size / local_c;
size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
size_t local_w = std::min(local_hw / local_h, global[1]);
std::vector<size_t> local = {local_h, local_w, local_c};
AlignGlobalLocal(global, local);
}
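To make the work-group split above concrete, a standalone re-derivation, assuming max_work_group_size = 256 and with max_divisor_leq() standing in for GetMaxDivisorStrategy0 (the helper name and the global sizes are illustrative):

#include <algorithm>
#include <cstdio>

// Largest divisor of n that is <= cap; stands in for GetMaxDivisorStrategy0.
static size_t max_divisor_leq(size_t n, size_t cap) {
  for (size_t d = cap; d > 1; --d) {
    if (n % d == 0) return d;
  }
  return 1;
}

#define UP_DIV(x, y) (((x) + (y)-1) / (y))

int main() {
  size_t global[3] = {28, 28, 16};  // {N * H, W, CO_SLICES}
  size_t max_work_group_size = 256;
  size_t local_c = max_divisor_leq(global[2], 8);             // 8
  size_t local_hw = max_work_group_size / local_c;            // 32
  size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);  // 14
  size_t local_w = std::min(local_hw / local_h, global[1]);   // 2
  printf("local = {%zu, %zu, %zu}\n", local_h, local_w, local_c);  // 14 * 2 * 8 = 224 <= 256
  return 0;
}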
int SliceOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
} // namespace mindspore::kernel

@@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SLICE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SLICE_H_
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRIDED_SLICE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRIDED_SLICE_H_
#include <vector>
#include "src/runtime/kernel/opencl/opencl_kernel.h"
@@ -31,12 +31,23 @@ class SliceOpenCLKernel : public OpenCLKernel {
~SliceOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int Run() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
int InitConstArgs();
cl::Kernel kernel_;
cl_int4 input_shape_{};
cl_int4 output_shape_{};
cl_int2 io_slices_{};
cl_int4 begin_{};
cl_int4 stride_{{1, 1, 1, 1}};
cl_int4 size_{};
};
} // namespace mindspore::kernel

@@ -34,33 +34,67 @@ struct OpenCLToFormatParameter {
lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
};
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num) {
auto *N = dst;
auto *H = dst + 1;
auto *W = dst + 2;
auto *C = dst + 3;
if (src_num == 1) {
*N = src[0];
} else if (src_num == 2) {
*N = src[0];
*C = src[1];
} else if (src_num == 3) {
*N = src[0];
*W = src[1];
*C = src[2];
} else if (src_num == 4) {
*N = src[0];
*H = src[1];
*W = src[2];
*C = src[3];
} else if (src_num >= 5) {
MS_LOG(ERROR) << "GPU doesn't support ndim>=" << src_num;
}
}
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(const SrcT *src, DstT *dst, int src_num, DstT default_value) {
for (int i = 0; i < 4; ++i) {
dst[i] = default_value;
}
Broadcast2GpuShape(src, dst, src_num);
}
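How the broadcast lands in NHWC slots, as a short sketch against the functions above (values illustrative):

// src = {7}          -> dst = {7, d, d, d}   (N)
// src = {7, 3}       -> dst = {7, d, d, 3}   (N, C)
// src = {7, 5, 3}    -> dst = {7, d, 5, 3}   (N, W, C)
// src = {7, 9, 5, 3} -> dst = {7, 9, 5, 3}   (N, H, W, C)
// where d is the prefilled default_value (the strided-slice kernel above
// uses 0 for begin, 1 for stride, and -1 for size).
int begin_src[3] = {1, 2, 3};
int begin_dst[4];
Broadcast2GpuShape(begin_src, begin_dst, 3, 0);  // begin_dst == {1, 0, 2, 3}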
struct Image2DInfo {
explicit Image2DInfo(const lite::Tensor *tensor) {
if (tensor == nullptr) {
return;
}
auto shape = tensor->shape();
if (shape.size() == 1) {
auto ndim = shape.size();
if (ndim == 1) {
N = shape[0];
} else if (shape.size() == 2) {
} else if (ndim == 2) {
N = shape[0];
C = shape[1];
} else if (shape.size() == 3) {
} else if (ndim == 3) {
N = shape[0];
W = shape[1];
C = shape[2];
} else if (shape.size() == 4) {
} else if (ndim == 4) {
N = shape[0];
H = shape[1];
W = shape[2];
C = shape[3];
} else if (shape.size() >= 5) {
MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
} else if (ndim >= 5) {
MS_LOG(ERROR) << "GPU doesn't support Tensor with ndim>=" << ndim;
}
Slice = UP_DIV(C, C4NUM);
FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
FLT4_size = FLT_size * 4;
Slice = UP_DIV(C, C4NUM);
if (W * Slice <= MAX_IMAGE2D_SIZE) {
height = N * H;
width = W * Slice;

@@ -15,11 +15,17 @@
*/
#include <string>
#include "src/common/log_adapter.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "common/common_test.h"
#include "src/kernel_registry.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
using mindspore::kernel::LiteKernel;
using mindspore::kernel::SubGraphOpenCLKernel;
using mindspore::lite::KernelRegistry;
using mindspore::lite::Tensor;
using mindspore::schema::Format::Format_NHWC;
namespace mindspore {
void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) {
@@ -35,4 +41,80 @@ void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) {
}
}
void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos,
std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable,
float atol, bool print_output) {
MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator";
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
auto ocl_runtime = runtime_wrapper.GetInstance();
EXPECT_TRUE(ocl_runtime->Init() == RET_OK);
ocl_runtime->SetFp16Enable(fp16_enable);
auto allocator = ocl_runtime->GetAllocator();
MS_LOG(DEBUG) << "create Tensors & init weight data";
std::vector<Tensor> tensors;
std::vector<Tensor *> kernel_inputs;
std::vector<Tensor *> subgraph_inputs;
std::map<Tensor *, float *> subgraph_inputs_data;
for (auto input_info : input_infos) {
const std::vector<int> &shape = std::get<0>(input_info);
auto *input_data = std::get<1>(input_info);
const Tensor::Category category = std::get<2>(input_info);
tensors.emplace_back(kNumberTypeFloat32, shape, Format_NHWC, category);
auto *new_tensor = &tensors.back();
kernel_inputs.push_back(new_tensor);
if (category != Tensor::Category::VAR) {
memcpy(new_tensor->MutableData(), input_data, new_tensor->Size());
} else {
subgraph_inputs.push_back(new_tensor);
subgraph_inputs_data[new_tensor] = input_data;
}
}
const std::vector<int> &output_shape = std::get<0>(output_info);
float *expect_data = std::get<1>(output_info);
auto output = Tensor(kNumberTypeFloat32, output_shape, Format_NHWC, Tensor::Category::VAR);
MS_LOG(DEBUG) << "create OpenCL Kernel";
auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_);
kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type};
auto creator = KernelRegistry::GetInstance()->GetCreator(key);
if (creator == nullptr) {
std::cerr << "get kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
free(op_parameter);
FAIL();
}
auto *kernel = creator(kernel_inputs, {&output}, op_parameter, nullptr, key, nullptr);
if (kernel == nullptr) {
std::cerr << "call kernel registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl;
free(op_parameter);
FAIL();
}
MS_LOG(DEBUG) << "create SubGraph & init input data";
std::vector<LiteKernel *> kernels{kernel};
auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel(subgraph_inputs, {&output}, kernels, kernels, kernels);
if (sub_graph == nullptr) {
return;
}
for (auto input : subgraph_inputs) {
EXPECT_TRUE(input->MallocData(allocator) == RET_OK);
}
EXPECT_TRUE(sub_graph->Init() == RET_OK);
for (auto input : subgraph_inputs) {
memcpy(input->data_c(), subgraph_inputs_data[input], input->Size());
}
MS_LOG(DEBUG) << "run SubGraph & compare result";
EXPECT_TRUE(sub_graph->Run() == RET_OK);
if (print_output) {
for (int i = 0; i < output.ElementsNum(); ++i) {
printf("%d: expect=%.3f output=%.3f\n", i, expect_data[i], reinterpret_cast<float *>(output.data_c())[i]);
}
}
CommonTest::CompareOutputData(reinterpret_cast<float *>(output.data_c()), expect_data, output.ElementsNum(), atol);
MS_LOG(DEBUG) << "release resources";
delete sub_graph;
}
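A hypothetical call site for TestMain, sketched under the assumption that SliceParameter embeds OpParameter as its first member (which the reinterpret_casts above imply); the helper name, shapes, and data are illustrative, and slicing an all-zero tensor keeps the expected output trivially all zeros:

// Assumes the test file's includes plus <cstdlib>; all names/values illustrative.
void RunSliceSmokeTest() {
  std::vector<int> in_shape = {1, 2, 2, 8};
  std::vector<int> out_shape = {1, 2, 2, 4};
  static float input[32] = {};   // all zeros
  static float expect[16] = {};  // slicing zeros yields zeros
  auto *param = static_cast<SliceParameter *>(malloc(sizeof(SliceParameter)));
  memset(param, 0, sizeof(SliceParameter));
  param->op_parameter_.type_ = schema::PrimitiveType_Slice;
  param->param_length_ = 4;
  int begin[4] = {0, 0, 0, 4};
  int size[4] = {1, 2, 2, 4};
  std::copy(begin, begin + 4, param->begin_);
  std::copy(size, size + 4, param->size_);
  TestMain({{in_shape, input, Tensor::Category::VAR}}, {out_shape, expect},
           reinterpret_cast<OpParameter *>(param), /*fp16_enable=*/false, /*atol=*/1e-3f);
}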
} // namespace mindspore

@@ -14,16 +14,18 @@
* limitations under the License.
*/
#ifndef TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
#define TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
#include <string>
#include <iostream>
#include "tests/ut/cpp/common/common_test.h"
#include "src/common/log_adapter.h"
#include <vector>
#include <tuple>
#include <map>
#include "mindspore/lite/src/tensor.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#ifndef TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
#define TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
using mindspore::lite::Tensor;
namespace mindspore {
@@ -63,6 +65,10 @@ void CompareOutput(lite::Tensor *output_tensor, const std::string &file_path, T
CompareOutput(output_tensor->data_c(), expect_data, output_tensor->ElementsNum(), atol, rtol);
}
void TestMain(const std::vector<std::tuple<std::vector<int>, float *, Tensor::Category>> &input_infos,
std::tuple<std::vector<int>, float *> output_info, OpParameter *op_parameter, bool fp16_enable = false,
float atol = 10e-9, bool print_output = false);
} // namespace mindspore
#endif // TESTS_UT_OPENCL_KERNEL_TESTS_UTILS_H_
