parent
c324fec6d4
commit
c94563c06b
@ -1,146 +0,0 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define INT2 int2
|
||||
#define INT4 int4
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
__kernel void slice_NHWC4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
|
||||
INT4 begin, INT2 sharedNoUpdiv) {
|
||||
int X = get_global_id(1); // H
|
||||
int Y = get_global_id(2); // W
|
||||
if (X >= out_shape.y || Y >= out_shape.z) {
|
||||
return;
|
||||
}
|
||||
FLT4 result;
|
||||
if (sharedNoUpdiv.x % 4 == 0) {
|
||||
for (int i = 0; i < out_shape.w; i++) {
|
||||
result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (i + begin.w), (X + begin.y)));
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i, (X)), result);
|
||||
}
|
||||
} else {
|
||||
int begin_postion = sharedNoUpdiv.x % 4;
|
||||
FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + begin.w, (X + begin.y)));
|
||||
if (begin_postion == 1) {
|
||||
for (int i = 1; i <= out_shape.w; i++) {
|
||||
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
|
||||
result.x = first.y;
|
||||
result.y = first.z;
|
||||
result.z = first.w;
|
||||
result.w = second.x;
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
|
||||
first.y = second.y;
|
||||
first.z = second.z;
|
||||
first.w = second.w;
|
||||
}
|
||||
} else if (begin_postion == 2) {
|
||||
for (int i = 1; i <= out_shape.w; i++) {
|
||||
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
|
||||
result.x = first.z;
|
||||
result.y = first.w;
|
||||
result.z = second.x;
|
||||
result.w = second.y;
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
|
||||
first.z = second.z;
|
||||
first.w = second.w;
|
||||
}
|
||||
} else {
|
||||
for (int i = 1; i <= out_shape.w; i++) {
|
||||
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z) * input_shape.w + (begin.w + i), (X + begin.y)));
|
||||
result.x = first.w;
|
||||
result.y = second.x;
|
||||
result.z = second.y;
|
||||
result.w = second.z;
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + i - 1, (X)), result);
|
||||
first.w = second.w;
|
||||
}
|
||||
}
|
||||
}
|
||||
// judge the line of size
|
||||
int size = sharedNoUpdiv.y % 4;
|
||||
FLT4 result_fill0;
|
||||
if (size == 1) {
|
||||
result_fill0.x = result.x;
|
||||
result_fill0.y = 0;
|
||||
result_fill0.z = 0;
|
||||
result_fill0.w = 0;
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
|
||||
} else if (size == 2) {
|
||||
result_fill0.x = result.x;
|
||||
result_fill0.y = result.y;
|
||||
result_fill0.z = 0;
|
||||
result_fill0.w = 0;
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
|
||||
} else if (size == 3) {
|
||||
result_fill0.x = result.x;
|
||||
result_fill0.y = result.y;
|
||||
result_fill0.z = result.z;
|
||||
result_fill0.w = 0;
|
||||
WRITE_IMAGE(output, (INT2)((Y)*out_shape.w + out_shape.w - 1, (X)), result_fill0);
|
||||
}
|
||||
}
|
||||
__kernel void slice_NC4HW4(__read_only image2d_t input, __write_only image2d_t output, INT4 input_shape, INT4 out_shape,
|
||||
INT4 begin, INT2 sharedNoUpdiv) {
|
||||
int X = get_global_id(1); // H
|
||||
int Y = get_global_id(2); // W
|
||||
if (X >= out_shape.y || Y >= out_shape.z) {
|
||||
return;
|
||||
}
|
||||
FLT4 result;
|
||||
if (sharedNoUpdiv.x % 4 == 0) {
|
||||
for (int i = 0; i < out_shape.w; i++) {
|
||||
result = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
|
||||
WRITE_IMAGE(output, (INT2)((Y), (i * out_shape.y + X)), result);
|
||||
}
|
||||
} else {
|
||||
int begin_postion = sharedNoUpdiv.x % 4;
|
||||
FLT4 first = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (begin.w) * input_shape.y + (X + begin.y)));
|
||||
if (begin_postion == 1) {
|
||||
for (int i = 1; i <= out_shape.w; i++) {
|
||||
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
|
||||
result.x = first.y;
|
||||
result.y = first.z;
|
||||
result.z = first.w;
|
||||
result.w = second.x;
|
||||
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
|
||||
first.y = second.y;
|
||||
first.z = second.z;
|
||||
first.w = second.w;
|
||||
}
|
||||
} else if (begin_postion == 2) {
|
||||
for (int i = 1; i <= out_shape.w; i++) {
|
||||
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
|
||||
result.x = first.z;
|
||||
result.y = first.w;
|
||||
result.z = second.x;
|
||||
result.w = second.y;
|
||||
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
|
||||
first.z = second.z;
|
||||
first.w = second.w;
|
||||
}
|
||||
} else {
|
||||
for (int i = 1; i <= out_shape.w; i++) {
|
||||
FLT4 second = READ_IMAGE(input, smp_none, (INT2)((Y + begin.z), (i + begin.w) * input_shape.y + (X + begin.y)));
|
||||
result.x = first.w;
|
||||
result.y = second.x;
|
||||
result.z = second.y;
|
||||
result.w = second.z;
|
||||
WRITE_IMAGE(output, (INT2)((Y), ((i - 1) * out_shape.y + X)), result);
|
||||
first.w = second.w;
|
||||
}
|
||||
}
|
||||
}
|
||||
// judge the line of size
|
||||
int size = sharedNoUpdiv.y % 4;
|
||||
FLT4 result_fill0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
if (size == 1) {
|
||||
result_fill0.x = result.x;
|
||||
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
|
||||
} else if (size == 2) {
|
||||
result_fill0.x = result.x;
|
||||
result_fill0.y = result.y;
|
||||
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
|
||||
} else if (size == 3) {
|
||||
result_fill0.x = result.x;
|
||||
result_fill0.y = result.y;
|
||||
result_fill0.z = result.z;
|
||||
WRITE_IMAGE(output, (INT2)((Y), ((out_shape.w - 1) * out_shape.y + X)), result_fill0);
|
||||
}
|
||||
}
|
@ -0,0 +1,59 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
|
||||
|
||||
__kernel void strided_slice(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape,
|
||||
int4 output_shape, int2 io_slices, int4 begin, int4 stride, int4 size) {
|
||||
int IN = input_shape.x, IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
|
||||
int ON = output_shape.x, OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
|
||||
int CI_SLICES = io_slices.x, CO_SLICES = io_slices.y;
|
||||
int on_oh = get_global_id(0);
|
||||
int ow = get_global_id(1);
|
||||
int co_slice = get_global_id(2);
|
||||
int on = on_oh / OH;
|
||||
int oh = on_oh % OH;
|
||||
if (on >= ON || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLT tmp[4];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
// output_shape idx -> size idx. because squeeze(output_shape)=squeeze(size)
|
||||
// for example:
|
||||
// python code: B = A[1, 1:16, 2:16, 3:16]
|
||||
// input_shape = [16, 16, 16, 16]
|
||||
// begin = [ 1, 1, 2, 3]
|
||||
// end = [ 2, 16, 16, 16]
|
||||
// stride = [ 1, 1, 1, 1]
|
||||
// size = [ 1, 15, 14, 13] = ceil((end - begin) / stride)
|
||||
// output_shape = [ 15, 14, 13]
|
||||
int idx = ((on * OH + oh) * OW + ow) * CO + co_slice * 4 + i;
|
||||
int co_ = idx % size.w;
|
||||
idx /= size.w;
|
||||
int ow_ = idx % size.z;
|
||||
idx /= size.z;
|
||||
int oh_ = idx % size.y;
|
||||
idx /= size.y;
|
||||
int on_ = idx;
|
||||
|
||||
int in = begin.x + stride.x * on_;
|
||||
int ih = begin.y + stride.y * oh_;
|
||||
int iw = begin.z + stride.z * ow_;
|
||||
int ci = begin.w + stride.w * co_;
|
||||
|
||||
FLT4 src = READ_IMAGE(input, smp_none, (int2)(iw * CI_SLICES + ci / 4, in * IH + ih));
|
||||
int offset = ci % 4;
|
||||
if (offset == 0) {
|
||||
tmp[i] = src.x;
|
||||
} else if (offset == 1) {
|
||||
tmp[i] = src.y;
|
||||
} else if (offset == 2) {
|
||||
tmp[i] = src.z;
|
||||
} else {
|
||||
tmp[i] = src.w;
|
||||
}
|
||||
}
|
||||
|
||||
FLT4 out = (FLT4)(tmp[0], tmp[1], tmp[2], tmp[3]);
|
||||
WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, on_oh), out);
|
||||
}
|
@ -1,106 +0,0 @@
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/slice.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_Slice;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int SliceOpenCLKernel::Init() {
|
||||
std::set<std::string> build_options;
|
||||
std::string source = slice_source;
|
||||
std::string program_name = "slice";
|
||||
std::string kernel_name = "slice_NHWC4";
|
||||
ocl_runtime_->LoadSource(program_name, source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
|
||||
MS_LOG(DEBUG) << kernel_name << " Init Done!";
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
|
||||
const int max_divider = 8;
|
||||
const int max_x = 4, max_y = 8;
|
||||
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
|
||||
int yz = max_size / x;
|
||||
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
|
||||
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
|
||||
|
||||
local->clear();
|
||||
local->push_back(x);
|
||||
local->push_back(y);
|
||||
local->push_back(z);
|
||||
}
|
||||
|
||||
int SliceOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
|
||||
auto input_shape = in_tensors_[0]->shape();
|
||||
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
|
||||
cl_int4 size_ = {param->size_[0], param->size_[1], param->size_[2], UP_DIV(param->size_[3], C4NUM)};
|
||||
cl_int4 begin_ = {param->begin_[0], param->begin_[1], param->begin_[2], param->begin_[3] / 4};
|
||||
cl_int2 sharedNoUpdiv = {param->begin_[3], param->size_[3]};
|
||||
uint32_t OH = param->size_[1];
|
||||
uint32_t OW = param->size_[2];
|
||||
|
||||
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
|
||||
std::vector<size_t> local = {1, 1, 1}; // init local
|
||||
std::vector<size_t> global = {1, OH, OW};
|
||||
SlcieGetWorkGroup(global, &local, max_global[0]);
|
||||
int arg_cn = 0;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, size_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, sharedNoUpdiv);
|
||||
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
|
||||
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
|
||||
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
|
||||
const mindspore::lite::PrimitiveC *primitive) {
|
||||
auto *kernel = new (std::nothrow) SliceOpenCLKernel(opParameter, inputs, outputs);
|
||||
if (kernel == nullptr) {
|
||||
MS_LOG(ERROR) << " new SliceOpenCLKernel failed ";
|
||||
free(opParameter);
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << " Init kernel failed, name: Slice ";
|
||||
delete kernel;
|
||||
return nullptr;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
|
||||
} // namespace mindspore::kernel
|
@ -0,0 +1,192 @@
|
||||
/**
|
||||
* Copyright 2019 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <cstring>
|
||||
#include <deque>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/strided_slice.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/cl/strided_slice.cl.inc"
|
||||
#include "nnacl/strided_slice.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kGPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_Slice;
|
||||
using mindspore::schema::PrimitiveType_StridedSlice;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
int SliceOpenCLKernel::CheckSpecs() {
|
||||
const std::string kernel_name = op_parameter_->type_ == PrimitiveType_Slice ? "Slice" : "StridedSlice";
|
||||
if (in_tensors_.size() != 1) {
|
||||
MS_LOG(ERROR) << kernel_name + " only supports 1 input Tensor.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (out_tensors_.size() != 1) {
|
||||
MS_LOG(ERROR) << kernel_name + " only supports 1 output Tensor.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto in_ndim = in_tensors_.front()->shape().size();
|
||||
if (in_ndim == 0 || in_ndim > 4) {
|
||||
MS_LOG(ERROR) << kernel_name + " only supports 1D-4D input tensor";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto out_ndim = out_tensors_.front()->shape().size();
|
||||
if (out_ndim > 4) {
|
||||
MS_LOG(ERROR) << kernel_name + " only supports 0D-4D output tensor";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (InitConstArgs() != RET_OK) {
|
||||
MS_LOG(ERROR) << "call SliceOpenCLKernel::InitConstArgs() failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int SliceOpenCLKernel::Prepare() {
|
||||
std::set<std::string> build_options;
|
||||
std::string program_name = "strided_slice";
|
||||
ocl_runtime_->LoadSource(program_name, strided_slice_source);
|
||||
ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options);
|
||||
SetConstArgs();
|
||||
SetGlobalLocal();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int SliceOpenCLKernel::InitConstArgs() {
|
||||
auto input_info = Image2DInfo(in_tensors_.front());
|
||||
auto output_info = Image2DInfo(out_tensors_.front());
|
||||
input_shape_ = {static_cast<cl_int>(input_info.N), static_cast<cl_int>(input_info.H),
|
||||
static_cast<cl_int>(input_info.W), static_cast<cl_int>(input_info.C)};
|
||||
output_shape_ = {static_cast<cl_int>(output_info.N), static_cast<cl_int>(output_info.H),
|
||||
static_cast<cl_int>(output_info.W), static_cast<cl_int>(output_info.C)};
|
||||
io_slices_ = {static_cast<cl_int>(input_info.Slice), static_cast<cl_int>(output_info.Slice)};
|
||||
|
||||
if (op_parameter_->type_ == PrimitiveType_Slice) {
|
||||
auto param = reinterpret_cast<SliceParameter *>(op_parameter_);
|
||||
Broadcast2GpuShape(param->begin_, begin_.s, param->param_length_, 0);
|
||||
Broadcast2GpuShape(param->size_, size_.s, param->param_length_, -1);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
if (begin_.s[i] < 0) {
|
||||
begin_.s[i] += input_shape_.s[i];
|
||||
}
|
||||
if (begin_.s[i] < 0 || begin_.s[i] >= input_shape_.s[i]) {
|
||||
MS_LOG(ERROR) << "Slice kernel only supports 0<=begin<input_shape but begin[i]=" << begin_.s[i]
|
||||
<< " input_shape[i]=" << input_shape_.s[i];
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (size_.s[i] < -1 || size_.s[i] == 0) {
|
||||
MS_LOG(ERROR) << "Slice kernel only supports size=-1 or size>0 but size[i]=" << size_.s[i];
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (size_.s[i] == -1 || begin_.s[i] + size_.s[i] > input_shape_.s[i]) {
|
||||
size_.s[i] = input_shape_.s[i] - begin_.s[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto param = reinterpret_cast<StridedSliceParameter *>(op_parameter_);
|
||||
cl_int4 end = input_shape_;
|
||||
Broadcast2GpuShape(param->begins_, begin_.s, param->num_axes_, 0);
|
||||
Broadcast2GpuShape(param->strides_, stride_.s, param->num_axes_, 1);
|
||||
Broadcast2GpuShape(param->ends_, end.s, param->num_axes_);
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
// begin is negative
|
||||
if (begin_.s[i] < 0) {
|
||||
begin_.s[i] += input_shape_.s[i];
|
||||
}
|
||||
// avoid begin is out of range
|
||||
begin_.s[i] = std::clamp(begin_.s[i], 0, input_shape_.s[i] - 1);
|
||||
// end is negative
|
||||
if (end.s[i] < 0) {
|
||||
end.s[i] += input_shape_.s[i];
|
||||
}
|
||||
// avoid end is out of range
|
||||
end.s[i] = std::clamp(end.s[i], -1, input_shape_.s[i]);
|
||||
|
||||
// check stride begin end
|
||||
if (stride_.s[i] > 0) {
|
||||
if (begin_.s[i] >= end.s[i]) {
|
||||
MS_LOG(ERROR) << "StridedSlice kernel only supports begin_<end when stride>0";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else if (stride_.s[i] < 0) {
|
||||
if (begin_.s[i] <= end.s[i]) {
|
||||
MS_LOG(ERROR) << "StridedSlice kernel only supports begin_>end when stride<0";
|
||||
return RET_ERROR;
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "StridedSlice kernel only supports stride!=0";
|
||||
return RET_ERROR;
|
||||
}
|
||||
size_.s[i] = std::ceil(static_cast<float>(end.s[i] - begin_.s[i]) / static_cast<float>(stride_.s[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// check size
|
||||
std::vector<int> shape_not_1;
|
||||
std::vector<int> size_not_1;
|
||||
std::copy_if(out_tensors_.front()->shape().begin(), out_tensors_.front()->shape().end(), shape_not_1.begin(),
|
||||
[](int x) { return x > 1; });
|
||||
std::copy_if(size_.s, size_.s + 4, size_not_1.begin(), [](int x) { return x > 1; });
|
||||
if (shape_not_1 != size_not_1) {
|
||||
MS_LOG(ERROR) << "Slice/StridedSlice kernel output shape infer error";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void SliceOpenCLKernel::SetConstArgs() {
|
||||
int arg_cn = 2;
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, begin_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_);
|
||||
ocl_runtime_->SetKernelArg(kernel_, arg_cn, size_);
|
||||
}
|
||||
|
||||
void SliceOpenCLKernel::SetGlobalLocal() {
|
||||
auto output_info = Image2DInfo(out_tensors_.front());
|
||||
std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice};
|
||||
|
||||
const int max_divider = 8;
|
||||
auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
|
||||
size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
|
||||
size_t local_hw = max_work_group_size / local_c;
|
||||
size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
|
||||
size_t local_w = std::min(local_hw / local_h, global[1]);
|
||||
std::vector<size_t> local = {local_h, local_w, local_c};
|
||||
AlignGlobalLocal(global, local);
|
||||
}
|
||||
|
||||
int SliceOpenCLKernel::Run() {
|
||||
MS_LOG(DEBUG) << this->name() << " Running! ";
|
||||
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
|
||||
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLKernelCreator<SliceOpenCLKernel>);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_StridedSlice, OpenCLKernelCreator<SliceOpenCLKernel>);
|
||||
} // namespace mindspore::kernel
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue